) { char *yo = "Zoltan_PHG_rdivide"; int i, j, mid, ierr=ZOLTAN_OK, leftend, rightstart; int *pins[2] = {NULL,NULL}, *lpins[2] = {NULL,NULL}; Partition part=NULL; HGraph *left=NULL, *right=NULL; int *proclist=NULL, *sendbuf=NULL, *recvbuf=NULL, nsend, msg_tag=7777; PHGComm *hgc = hg->comm; int nVtx = hg->nVtx, gnVtx = hg->dist_x[hgc->nProc_x]; double leftw=0.0, rightw=0.0; float bal_tol = hgp->bal_tol; float *bisec_part_sizes=NULL; /* Target partition sizes; dimension is 2*hg->VtxWeightDim because we are doing bisection */ int part_dim = hg->VtxWeightDim ? hg->VtxWeightDim : 1; struct phg_timer_indices *timer = Zoltan_PHG_LB_Data_timers(zz); int do_timing = (hgp->use_timers > 1); int detail_timing = (hgp->use_timers > 3); Zoltan_PHG_Tree_Set(zz, father, lo, hi); if (!gnVtx) { /* UVC: no vertex; no need for recursion!? */ if (level>0) Zoltan_HG_HGraph_Free(hg); return ierr; } if (do_timing) { if (timer->rdrdivide < 0) timer->rdrdivide = Zoltan_Timer_Init(zz->ZTime, 1, "Rdivide");
/* Main partitioning function for hypergraph partitioning.
 *
 * Runs one multilevel V-cycle on hg:
 *   1. Coarsening: repeatedly compute a matching and build a coarser
 *      hypergraph while the loop condition below holds; when
 *      hgp->ProRedL > 0 and the global pin count has dropped enough, the
 *      coarse hypergraph is redistributed to a subset of the processors.
 *   2. Coarse partitioning of the coarsest hypergraph.
 *   3. Uncoarsening: refine at each level and project the partition
 *      assignment up to the next finer level.
 *
 * Returns ZOLTAN_MEMERR on allocation failures; otherwise the last value
 * stored in err by the calls below (ZOLTAN_OK on the fully successful path).
 *
 * NOTE(review): the early "return ZOLTAN_MEMERR" paths bypass the End:
 * cleanup (including the geometric-matching reset).  They are left as-is
 * because End: dereferences hgc and stops the vcycle timer, neither of
 * which is safe at every one of those points.
 * NOTE(review): End: frees the V-cycle chain starting at del, which is only
 * set at the Refine: label; a "goto End" taken during coarsening therefore
 * does not free the levels built so far.
 */
int Zoltan_PHG_Partition (
  ZZ *zz,               /* Zoltan data structure */
  HGraph *hg,           /* Input hypergraph to be partitioned */
  int p,                /* Input:  number partitions to be generated */
  float *part_sizes,    /* Input:  array of length p containing percentages
                           of work to be assigned to each partition */
  Partition parts,      /* Input:  initial partition #s; aligned with vtx
                           arrays.
                           Output:  computed partition #s */
  PHGPartParams *hgp)   /* Input:  parameters for hgraph partitioning. */
{
  PHGComm *hgc = hg->comm;
  VCycle  *vcycle=NULL, *del=NULL;
  int  i, err = ZOLTAN_OK, middle;
  ZOLTAN_GNO_TYPE origVpincnt; /* for processor reduction test */
  ZOLTAN_GNO_TYPE prevVcnt     = 2*hg->dist_x[hgc->nProc_x]; /* initialized so that the */
  ZOLTAN_GNO_TYPE prevVedgecnt = 2*hg->dist_y[hgc->nProc_y]; /* while loop will be entered
                                                                before any coarsening */
  ZOLTAN_GNO_TYPE tot_nPins, local_nPins;
  MPI_Datatype zoltan_gno_mpi_type;
  char *yo = "Zoltan_PHG_Partition";
  int do_timing = (hgp->use_timers > 1);
  int fine_timing = (hgp->use_timers > 2);
  int vcycle_timing = (hgp->use_timers > 4 && hgp->ProRedL == 0);
  short refine = 0;
  struct phg_timer_indices *timer = Zoltan_PHG_LB_Data_timers(zz);
  int reset_geometric_matching = 0;
  char reset_geometric_string[4]; /* holds "rcb" or "rib" plus the NUL */

  ZOLTAN_TRACE_ENTER(zz, yo);

  zoltan_gno_mpi_type = Zoltan_mpi_gno_type();

  if (do_timing) {
    /* Timers are created lazily; a negative index means "not yet created". */
    if (timer->vcycle < 0)
      timer->vcycle = Zoltan_Timer_Init(zz->ZTime, 0, "Vcycle");
    if (timer->procred < 0)
      timer->procred = Zoltan_Timer_Init(zz->ZTime, 0, "Processor Reduction");
    if (timer->match < 0)
      timer->match = Zoltan_Timer_Init(zz->ZTime, 1, "Matching");
    if (timer->coarse < 0)
      timer->coarse = Zoltan_Timer_Init(zz->ZTime, 1, "Coarsening");
    if (timer->coarsepart < 0)
      timer->coarsepart = Zoltan_Timer_Init(zz->ZTime, 1,
                                            "Coarse_Partition");
    if (timer->refine < 0)
      timer->refine = Zoltan_Timer_Init(zz->ZTime, 1, "Refinement");
    if (timer->project < 0)
      timer->project = Zoltan_Timer_Init(zz->ZTime, 1, "Project_Up");

    ZOLTAN_TIMER_START(zz->ZTime, timer->vcycle, hgc->Communicator);
  }

  /* Global pin count of the input; baseline for the processor reduction test. */
  local_nPins = (ZOLTAN_GNO_TYPE)hg->nPins;
  MPI_Allreduce(&local_nPins,&tot_nPins,1,zoltan_gno_mpi_type,MPI_SUM,hgc->Communicator);

  origVpincnt = tot_nPins;

  if (!(vcycle = newVCycle(zz, hg, parts, NULL, vcycle_timing))) {
    ZOLTAN_PRINT_ERROR (zz->Proc, yo, "VCycle is NULL.");
    ZOLTAN_TRACE_EXIT(zz, yo);
    return ZOLTAN_MEMERR;
  }

  /* For geometric coarsening, hgp->matching pointer and string are reset
   * after geometric_levels of coarsening.  Will need to reset them after
   * this vcycle is completed.  Capture that fact now!
   */
  if (!strcasecmp(hgp->redm_str, "rcb") || !strcasecmp(hgp->redm_str, "rib")) {
    reset_geometric_matching = 1;
    /* The comparison above guarantees a 3-character string, so it fits. */
    strcpy(reset_geometric_string, hgp->redm_str);
  }

  /****** Coarsening ******/
#define COARSEN_FRACTION_LIMIT 0.9  /* Stop if we don't make much progress */
  while ((hg->redl>0) && (hg->dist_x[hgc->nProc_x] > (ZOLTAN_GNO_TYPE)hg->redl)
         && ((hg->dist_x[hgc->nProc_x] < (ZOLTAN_GNO_TYPE) (COARSEN_FRACTION_LIMIT * prevVcnt + 0.5)) /* prevVcnt initialized to 2*hg->dist_x[hgc->nProc_x] */
             || (hg->dist_y[hgc->nProc_y] < (ZOLTAN_GNO_TYPE) (COARSEN_FRACTION_LIMIT * prevVedgecnt + 0.5))) /* prevVedgecnt initialized to 2*hg->dist_y[hgc->nProc_y] */
         && hg->dist_y[hgc->nProc_y] && hgp->matching) {
      ZOLTAN_GNO_TYPE *match = NULL;
      VCycle *coarser=NULL, *redistributed=NULL;

      prevVcnt     = hg->dist_x[hgc->nProc_x];
      prevVedgecnt = hg->dist_y[hgc->nProc_y];

#ifdef _DEBUG
      /* UVC: load balance stats */
      Zoltan_PHG_LoadBalStat(zz, hg);
#endif

      if (hgp->output_level >= PHG_DEBUG_LIST) {
          uprintf(hgc,
                  "START %3d |V|=%6d |E|=%6d #pins=%6d %d/%s/%s/%s p=%d...\n",
                  hg->info, hg->nVtx, hg->nEdge, hg->nPins, hg->redl,
                  hgp->redm_str,
                  hgp->coarsepartition_str, hgp->refinement_str, p);
          if (hgp->output_level > PHG_DEBUG_LIST) {
              err = Zoltan_HG_Info(zz, hg);
              if (err != ZOLTAN_OK && err != ZOLTAN_WARN)
                  goto End;
          }
      }
      if (hgp->output_level >= PHG_DEBUG_PLOT)
        Zoltan_PHG_Plot(zz->Proc, hg->nVtx, p, hg->vindex, hg->vedge, NULL,
         "coarsening plot");

      if (do_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->vcycle, hgc->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->match, hgc->Communicator);
      }
      if (vcycle_timing) {
        if (vcycle->timer_match < 0) {
          char str[80];
          sprintf(str, "VC Matching %d", hg->info);
          vcycle->timer_match = Zoltan_Timer_Init(vcycle->timer, 0, str);
        }
        ZOLTAN_TIMER_START(vcycle->timer, vcycle->timer_match,
                           hgc->Communicator);
      }

      /* Allocate and initialize Matching Array */
      if (hg->nVtx && !(match = (ZOLTAN_GNO_TYPE *) ZOLTAN_MALLOC (hg->nVtx*sizeof(ZOLTAN_GNO_TYPE)))) {
        ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Insufficient memory: Matching array");
        ZOLTAN_TRACE_EXIT(zz, yo);
        return ZOLTAN_MEMERR;
      }
      /* Start with every vertex matched to itself. */
      for (i = 0; i < hg->nVtx; i++)
        match[i] = i;

      /* Calculate matching (packing or grouping) */
      err = Zoltan_PHG_Matching (zz, hg, match, hgp);
      if (err != ZOLTAN_OK && err != ZOLTAN_WARN) {
        ZOLTAN_FREE (&match);
        goto End;
      }
      if (vcycle_timing)
        ZOLTAN_TIMER_STOP(vcycle->timer, vcycle->timer_match,
                          hgc->Communicator);

      if (do_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->match, hgc->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->coarse, hgc->Communicator);
      }

      if (vcycle_timing) {
        if (vcycle->timer_coarse < 0) {
          char str[80];
          sprintf(str, "VC Coarsening %d", hg->info);
          vcycle->timer_coarse = Zoltan_Timer_Init(vcycle->timer, 0, str);
        }
        ZOLTAN_TIMER_START(vcycle->timer, vcycle->timer_coarse,
                           hgc->Communicator);
      }

      if (!(coarser = newVCycle(zz, NULL, NULL, vcycle, vcycle_timing))) {
        ZOLTAN_FREE (&match);
        ZOLTAN_PRINT_ERROR (zz->Proc, yo, "coarser is NULL.");
        /* Report the failure; err would otherwise still hold the
           OK/WARN result of the matching call above. */
        err = ZOLTAN_MEMERR;
        goto End;
      }

      /* Construct coarse hypergraph and LevelMap */
      err = Zoltan_PHG_Coarsening (zz, hg, match, coarser->hg, vcycle->LevelMap,
       &vcycle->LevelCnt, &vcycle->LevelSndCnt, &vcycle->LevelData,
       &vcycle->comm_plan, hgp);

      if (err != ZOLTAN_OK && err != ZOLTAN_WARN) {
        /* match is owned by this loop iteration; release it on the
           error path too. */
        ZOLTAN_FREE (&match);
        goto End;
      }

      if (vcycle_timing)
        ZOLTAN_TIMER_STOP(vcycle->timer, vcycle->timer_coarse,
                          hgc->Communicator);

      if (do_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->coarse, hgc->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->vcycle, hgc->Communicator);
      }

      ZOLTAN_FREE (&match);

      if ((err=allocVCycle(coarser))!= ZOLTAN_OK)
        goto End;
      vcycle = coarser;
      hg = vcycle->hg;

      if (hgc->nProc > 1 && hgp->ProRedL > 0) {
        local_nPins = (ZOLTAN_GNO_TYPE)hg->nPins;
        MPI_Allreduce(&local_nPins, &tot_nPins, 1, zoltan_gno_mpi_type, MPI_SUM,
                      hgc->Communicator);

        if (tot_nPins < (ZOLTAN_GNO_TYPE)(hgp->ProRedL * origVpincnt + 0.5)) {
          if (do_timing) {
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->vcycle, hgc->Communicator);
            ZOLTAN_TIMER_START(zz->ZTime, timer->procred, hgc->Communicator);
          }
          /* redistribute to half the processors */
          origVpincnt = tot_nPins; /* update for processor reduction test */

          if(hg->nVtx&&!(hg->vmap=(int*)ZOLTAN_MALLOC(hg->nVtx*sizeof(int)))) {
            ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Insufficient memory: hg->vmap");
            ZOLTAN_TRACE_EXIT(zz, yo);
            return ZOLTAN_MEMERR;
          }
          for (i = 0; i < hg->nVtx; i++)
            hg->vmap[i] = i;

          middle = (int)((float) (hgc->nProc-1) * hgp->ProRedL);

          if (hgp->nProc_x_req!=1&&hgp->nProc_y_req!=1) { /* Want 2D decomp */
            if ((middle+1) > SMALL_PRIME && Zoltan_PHG_isPrime(middle+1))
              --middle; /* if it was prime just use one less #procs (since
                           it should be bigger than SMALL_PRIME it is safe to
                           decrement) */
          }

          /* From here on hgc refers to a newly allocated communicator
             description; it is freed during uncoarsening. */
          if (!(hgc = (PHGComm*) ZOLTAN_MALLOC (sizeof(PHGComm)))) {
            ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Insufficient memory: PHGComm");
            ZOLTAN_TRACE_EXIT(zz, yo);
            return ZOLTAN_MEMERR;
          }

          if (!(redistributed=newVCycle(zz,NULL,NULL,vcycle,vcycle_timing))) {
            ZOLTAN_FREE (&hgc);
            ZOLTAN_PRINT_ERROR (zz->Proc, yo, "redistributed is NULL.");
            /* Report the failure instead of returning the previous
               (successful) status. */
            err = ZOLTAN_MEMERR;
            goto End;
          }

          Zoltan_PHG_Redistribute(zz,hgp,hg,0,middle,hgc, redistributed->hg,
                                  &vcycle->vlno,&vcycle->vdest);
          if (hgp->UseFixedVtx || hgp->UsePrefPart)
            redistributed->hg->bisec_split = hg->bisec_split;

          if ((err=allocVCycle(redistributed))!= ZOLTAN_OK)
            goto End;
          vcycle = redistributed;

          if (hgc->myProc < 0) {
            /* I'm not in the redistributed part so I should go to
               uncoarsening refinement and wait */
            if (fine_timing) {
              if (timer->cpgather < 0)
                timer->cpgather = Zoltan_Timer_Init(zz->ZTime, 1, "CP Gather");
              if (timer->cprefine < 0)
                timer->cprefine =Zoltan_Timer_Init(zz->ZTime, 0, "CP Refine");
              if (timer->cpart < 0)
                timer->cpart = Zoltan_Timer_Init(zz->ZTime, 0, "CP Part");
            }
            if (do_timing) {
              ZOLTAN_TIMER_STOP(zz->ZTime, timer->procred, hgc->Communicator);
              ZOLTAN_TIMER_START(zz->ZTime, timer->vcycle, hgc->Communicator);
            }
            goto Refine;
          }

          hg = vcycle->hg;
          hg->redl = hgp->redl; /* not set with hg creation */
          if (do_timing) {
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->procred, hgc->Communicator);
            ZOLTAN_TIMER_START(zz->ZTime, timer->vcycle, hgc->Communicator);
          }
        }
      }
  }

  if (hgp->output_level >= PHG_DEBUG_LIST) {
    uprintf(hgc, "START %3d |V|=%6d |E|=%6d #pins=%6d %d/%s/%s/%s p=%d...\n",
     hg->info, hg->nVtx, hg->nEdge, hg->nPins, hg->redl,
     hgp->redm_str, hgp->coarsepartition_str, hgp->refinement_str, p);
    if (hgp->output_level > PHG_DEBUG_LIST) {
      err = Zoltan_HG_Info(zz, hg);
      if (err != ZOLTAN_OK && err != ZOLTAN_WARN)
        goto End;
    }
  }
  if (hgp->output_level >= PHG_DEBUG_PLOT)
    Zoltan_PHG_Plot(zz->Proc, hg->nVtx, p, hg->vindex, hg->vedge, NULL,
     "coarsening plot");

  /* free array that may have been allocated in matching */
  if (hgp->vtx_scal) {
    hgp->vtx_scal_size = 0;
    ZOLTAN_FREE(&(hgp->vtx_scal));
  }

  if (do_timing) {
    ZOLTAN_TIMER_STOP(zz->ZTime, timer->vcycle, hgc->Communicator);
    ZOLTAN_TIMER_START(zz->ZTime, timer->coarsepart, hgc->Communicator);
  }

  /****** Coarse Partitioning ******/
  err = Zoltan_PHG_CoarsePartition (zz, hg, p, part_sizes, vcycle->Part, hgp);
  if (err != ZOLTAN_OK && err != ZOLTAN_WARN)
    goto End;

  if (do_timing) {
    ZOLTAN_TIMER_STOP(zz->ZTime, timer->coarsepart, hgc->Communicator);
    ZOLTAN_TIMER_START(zz->ZTime, timer->vcycle, hgc->Communicator);
  }

Refine:
  del = vcycle;   /* coarsest level: starting point for the End: cleanup */
  refine = 1;

  /****** Uncoarsening/Refinement ******/
  while (vcycle) {
    VCycle *finer = vcycle->finer;
    hg = vcycle->hg;

    if (refine && hgc->myProc >= 0) {
      if (do_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->vcycle, hgc->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->refine, hgc->Communicator);
      }
      if (vcycle_timing) {
        if (vcycle->timer_refine < 0) {
          char str[80];
          sprintf(str, "VC Refinement %d", hg->info);
          vcycle->timer_refine = Zoltan_Timer_Init(vcycle->timer, 0, str);
        }
        ZOLTAN_TIMER_START(vcycle->timer, vcycle->timer_refine,
                           hgc->Communicator);
      }

      err = Zoltan_PHG_Refinement (zz, hg, p, part_sizes, vcycle->Part, hgp);

      if (do_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->refine, hgc->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->vcycle, hgc->Communicator);
      }
      if (vcycle_timing)
        ZOLTAN_TIMER_STOP(vcycle->timer, vcycle->timer_refine,
                          hgc->Communicator);

      if (hgp->output_level >= PHG_DEBUG_LIST)
        uprintf(hgc,
                "FINAL %3d |V|=%6d |E|=%6d #pins=%6d %d/%s/%s/%s p=%d bal=%.2f cutl=%.2f\n",
                hg->info, hg->nVtx, hg->nEdge, hg->nPins, hg->redl,
                hgp->redm_str,
                hgp->coarsepartition_str, hgp->refinement_str, p,
                Zoltan_PHG_Compute_Balance(zz, hg, part_sizes, 0, p,
                                           vcycle->Part),
                Zoltan_PHG_Compute_ConCut(hgc, hg, vcycle->Part, p, &err));

      if (hgp->output_level >= PHG_DEBUG_PLOT)
        Zoltan_PHG_Plot(zz->Proc, hg->nVtx, p, hg->vindex, hg->vedge,
                        vcycle->Part, "partitioned plot");
    }

    if (finer)  {
      int *rbuffer;

      /* Project coarse partition to fine partition */
      if (finer->comm_plan) {
        /* This level was produced by coarsening. */
        refine = 1;
        if (do_timing) {
          ZOLTAN_TIMER_STOP(zz->ZTime, timer->vcycle, hgc->Communicator);
          ZOLTAN_TIMER_START(zz->ZTime, timer->project, hgc->Communicator);
        }
        if (vcycle_timing) {
          if (vcycle->timer_project < 0) {
            char str[80];
            sprintf(str, "VC Project Up %d", hg->info);
            vcycle->timer_project = Zoltan_Timer_Init(vcycle->timer, 0, str);
          }
          ZOLTAN_TIMER_START(vcycle->timer, vcycle->timer_project,
                             hgc->Communicator);
        }

        /* easy to assign partitions to internal matches */
        for (i = 0; i < finer->hg->nVtx; i++)
          if (finer->LevelMap[i] >= 0)   /* if considers only the local vertices */
            finer->Part[i] = vcycle->Part[finer->LevelMap[i]];

        /* now that the course partition assignments have been propagated */
        /* upward to the finer level for the local vertices, we need to  */
        /* fill the LevelData (matched pairs of a local vertex with a    */
        /* off processor vertex) with the partition assignment of the    */
        /* local vertex - can be done totally in the finer level!        */
        for (i = 0; i < finer->LevelCnt; i++)  {
          ++i;          /* skip over off processor lno */
          finer->LevelData[i] = finer->Part[finer->LevelData[i]];
        }

        /* allocate rec buffer to exchange LevelData information */
        rbuffer = NULL;
        if (finer->LevelSndCnt > 0)  {
          rbuffer = (int*) ZOLTAN_MALLOC (2 * finer->LevelSndCnt * sizeof(int));
          if (!rbuffer)    {
            ZOLTAN_PRINT_ERROR (zz->Proc, yo, "Insufficient memory.");
            ZOLTAN_TRACE_EXIT(zz, yo);
            return ZOLTAN_MEMERR;
          }
        }

        /* get partition assignments from owners of externally matched vtxs */
        Zoltan_Comm_Resize (finer->comm_plan, NULL, COMM_TAG, &i);
        Zoltan_Comm_Do_Reverse (finer->comm_plan, COMM_TAG+1,
         (char*) finer->LevelData, 2 * sizeof(int), NULL, (char*) rbuffer);

        /* process data to assign partitions to expernal matches */
        for (i = 0; i < 2 * finer->LevelSndCnt;)  {
          int lno, partition;
          lno       = rbuffer[i++];
          partition = rbuffer[i++];
          finer->Part[lno] = partition;
        }

        ZOLTAN_FREE (&rbuffer);
        Zoltan_Comm_Destroy (&finer->comm_plan);

        if (do_timing) {
          ZOLTAN_TIMER_STOP(zz->ZTime, timer->project, hgc->Communicator);
          ZOLTAN_TIMER_START(zz->ZTime, timer->vcycle, hgc->Communicator);
        }
        if (vcycle_timing)
          ZOLTAN_TIMER_STOP(vcycle->timer, vcycle->timer_project,
                            hgc->Communicator);
      } else {
        /* No comm plan on the finer level: this level came from the
           processor-reduction redistribution; send results back. */
        int *sendbuf = NULL, size;
        refine = 0;
        /* ints local and partition numbers */
        if (finer->vlno) {
          sendbuf = (int*) ZOLTAN_MALLOC (2 * hg->nVtx * sizeof(int));
          if (!sendbuf) {
            ZOLTAN_PRINT_ERROR (zz->Proc, yo, "Insufficient memory.");
            ZOLTAN_TRACE_EXIT(zz, yo);
            return ZOLTAN_MEMERR;
          }

          for (i = 0; i < hg->nVtx; ++i) {
            sendbuf[2 * i] = finer->vlno[i];     /* assign local numbers */
            sendbuf[2 * i + 1] = vcycle->Part[i];/* assign partition numbers */
          }
        }

        ZOLTAN_FREE (&hgc);
        hgc = finer->hg->comm; /* updating hgc is required when the processors
                                  change */
        /* Create comm plan to unredistributed processors */
        err = Zoltan_Comm_Create(&finer->comm_plan, finer->vlno ? hg->nVtx : 0,
                                 finer->vdest, hgc->Communicator, COMM_TAG+2,
                                 &size);
        if (err != ZOLTAN_OK && err != ZOLTAN_WARN) {
          ZOLTAN_PRINT_ERROR(hgc->myProc, yo, "Zoltan_Comm_Create failed.");
          goto End;
        }

        /* allocate rec buffer to exchange sendbuf information */
        rbuffer = NULL;
        if (finer->hg->nVtx) {
          rbuffer = (int*) ZOLTAN_MALLOC (2 * finer->hg->nVtx * sizeof(int));

          if (!rbuffer) {
            ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Insufficient memory.");
            ZOLTAN_TRACE_EXIT(zz, yo);
            return ZOLTAN_MEMERR;
          }
        }

        /* Use plan to send partitions to the unredistributed processors */
        Zoltan_Comm_Do(finer->comm_plan, COMM_TAG+3, (char *) sendbuf,
                       2*sizeof(int), (char *) rbuffer);

        MPI_Bcast(rbuffer, 2*finer->hg->nVtx, MPI_INT, 0, hgc->col_comm);

        /* process data to assign partitions to unredistributed processors */
        for (i = 0; i < 2 * finer->hg->nVtx;) {
          int lno, partition;
          lno       = rbuffer[i++];
          partition = rbuffer[i++];
          finer->Part[lno] = partition;
        }

        if (finer->vlno)
          ZOLTAN_FREE (&sendbuf);

        ZOLTAN_FREE (&rbuffer);
        Zoltan_Comm_Destroy (&finer->comm_plan);
      }
    }

    vcycle = finer;
  }  /* while (vcycle) */

End:
  /* Walk from the coarsest level (del) towards the finest, freeing the
     per-level data and the VCycle nodes themselves. */
  vcycle = del;
  while (vcycle) {
    if (vcycle_timing) {
      Zoltan_Timer_PrintAll(vcycle->timer, 0, hgc->Communicator, stdout);
      Zoltan_Timer_Destroy(&vcycle->timer);
    }
    if (vcycle->finer) {   /* cleanup by level */
      Zoltan_HG_HGraph_Free (vcycle->hg);

      if (vcycle->LevelData)
        Zoltan_Multifree (__FILE__, __LINE__, 4, &vcycle->Part,
                          &vcycle->LevelMap, &vcycle->LevelData, &vcycle->hg);
      else if (vcycle->vlno)
        Zoltan_Multifree (__FILE__, __LINE__, 5, &vcycle->Part, &vcycle->vdest,
                          &vcycle->vlno, &vcycle->LevelMap, &vcycle->hg);
      else
        Zoltan_Multifree (__FILE__, __LINE__, 3, &vcycle->Part,
                          &vcycle->LevelMap, &vcycle->hg);
    }
    else                   /* cleanup top level */
      Zoltan_Multifree (__FILE__, __LINE__, 2, &vcycle->LevelMap,
                        &vcycle->LevelData);
    del = vcycle;
    vcycle = vcycle->finer;
    ZOLTAN_FREE(&del);
  }

  /* Restore the geometric matching method captured before coarsening. */
  if (reset_geometric_matching) {
    strcpy(hgp->redm_str, reset_geometric_string);
    Zoltan_PHG_Set_Matching_Fn(hgp);
  }

  if (do_timing)
    ZOLTAN_TIMER_STOP(zz->ZTime, timer->vcycle, hgc->Communicator);
  ZOLTAN_TRACE_EXIT(zz, yo) ;
  return err;
}
/* Refinement of a two-part partition of a distributed hypergraph.
 *
 * Each pass moves vertices in a single direction (from part "from" to part
 * "to").  In every processor column one processor (rootRank, chosen by
 * Zoltan_PHG_Find_Root) computes the move sequence from the summed gains,
 * rolls back moves past the best prefix, and broadcasts the kept moves to
 * the rest of its column, which replay them locally.  Passes repeat until
 * two successive passes keep no move on any processor of the row, or
 * hgp->fm_loop_limit passes have been run.
 *
 * zz          Zoltan data structure.
 * hg          hypergraph being refined.
 * p           number of parts; must be 2.
 * part_sizes  target part sizes; entries 0 and part_dim are used here.
 * part        in/out: part number (0 or 1) of each local vertex.
 * hgp         partitioning parameters.
 * bal_tol     balance tolerance multiplier applied to the target weights.
 *
 * Returns ZOLTAN_FATAL if p != 2, ZOLTAN_OK immediately if the hypergraph
 * globally has no edge or no vertex, otherwise ierr.
 * NOTE(review): MEMORY_ERROR is a macro defined outside this block;
 * presumably it records an error in ierr and jumps to End -- confirm.
 */
static int refine_fm2 (ZZ *zz,
                       HGraph *hg,
                       int p,
                       float *part_sizes,
                       Partition part,
                       PHGPartParams *hgp,
                       float bal_tol
    )
{
    int    i, j, ierr=ZOLTAN_OK, *pins[2]={NULL,NULL}, *lpins[2]={NULL,NULL};
    int    *moves=NULL, *mark=NULL, *adj=NULL, passcnt=0;
    float  *gain=NULL, *lgain=NULL;
    int    best_cutsizeat, cont, successivefails=0;
    double total_weight, weights[2], total_lweight, lweights[2], lwadjust[2],
        max_weight[2], lmax_weight[2], avail[2], gavail[2];
    int availcnt[2], gavailcnt[2];
    double targetw0, ltargetw0, minvw=DBL_MAX;
    double cutsize, best_cutsize,
        best_limbal, imbal, limbal;
    HEAP   heap[2];
    char   *yo="refine_fm2";
    int    part_dim = (hg->VtxWeightDim ? hg->VtxWeightDim : 1);
#ifdef HANDLE_ISOLATED_VERTICES
    int    isocnt=hg->nVtx; /* only root uses isocnt, isolated vertices
                               are kept at the end of moves array */
    int    *deg=NULL, *ldeg=NULL;
#if 0
    double best_imbal;
#endif
#endif
    PHGComm *hgc=hg->comm;
    int rootRank;

    struct phg_timer_indices *timer = Zoltan_PHG_LB_Data_timers(zz);
    int do_timing = (hgp->use_timers > 2);
    int detail_timing = (hgp->use_timers > 3);

    ZOLTAN_TRACE_ENTER(zz, yo);

    if (p != 2) {
        ZOLTAN_PRINT_ERROR(zz->Proc, yo, "p!=2 not allowed for refine_fm2.");
        ZOLTAN_TRACE_EXIT(zz, yo);
        return ZOLTAN_FATAL;
    }

    /* return only if globally there is no edge or vertex */
    if (!hg->dist_y[hgc->nProc_y] || hg->dist_x[hgc->nProc_x] == 0) {
        ZOLTAN_TRACE_EXIT(zz, yo);
        return ZOLTAN_OK;
    }

#ifdef USE_SERIAL_REFINEMENT_ON_ONE_PROC
    if (hgc->nProc==1){ /* only one proc? use serial code */
        ZOLTAN_TRACE_EXIT(zz, yo);
        return serial_fm2 (zz, hg, p, part_sizes, part, hgp, bal_tol);
    }
#endif

    if (do_timing) {
        if (timer->rfrefine < 0)
            timer->rfrefine = Zoltan_Timer_Init(zz->ZTime, 1, "Ref_P_Total");
        ZOLTAN_TIMER_START(zz->ZTime, timer->rfrefine, hgc->Communicator);
    }
    if (detail_timing) {
        if (timer->rfpins < 0)
            timer->rfpins = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Pins");
        if (timer->rfiso < 0)
            timer->rfiso = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_IsolatedVert");
        if (timer->rfgain < 0)
            timer->rfgain = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Gain");
        if (timer->rfheap < 0)
            timer->rfheap = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Heap");
        if (timer->rfpass < 0)
            timer->rfpass = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Pass");
        if (timer->rfroll < 0)
            timer->rfroll = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_Roll");
        if (timer->rfnonroot < 0)
            timer->rfnonroot = Zoltan_Timer_Init(zz->ZTime, 0, "Ref_P_NonRoot");
    }

    /* find the index of the proc in column group with
       the most #nonzeros; it will be our root
       proc for computing moves since it has better
       knowedge about global hypergraph.
       We ignore returned #pins (i) in root */
    Zoltan_PHG_Find_Root(hg->nPins, hgc->myProc_y, hgc->col_comm,
                         &i, &rootRank);

    /* Calculate the weights in each partition and total, then maxima */
    weights[0] = weights[1] = 0.0;
    lweights[0] = lweights[1] = 0.0;
    if (hg->vwgt)
        for (i = 0; i < hg->nVtx; i++) {
            lweights[part[i]] += hg->vwgt[i*hg->VtxWeightDim];
            minvw = (minvw > hg->vwgt[i*hg->VtxWeightDim])
                  ? hg->vwgt[i*hg->VtxWeightDim]
                  : minvw;
        }
    else {
        minvw = 1.0;
        for (i = 0; i < hg->nVtx; i++)
            lweights[part[i]] += 1.0;
    }

    MPI_Allreduce(lweights, weights, 2, MPI_DOUBLE, MPI_SUM, hgc->row_comm);
    total_weight = weights[0] + weights[1];
    targetw0 = total_weight * part_sizes[0]; /* global target weight for part 0 */

    max_weight[0] = total_weight * bal_tol * part_sizes[0];
    max_weight[1] = total_weight * bal_tol * part_sizes[part_dim]; /* should be (1 - part_sizes[0]) */

    /* Split the global limits into per-processor limits, proportionally to
       the weight each processor currently holds in the part (or evenly
       over the row when the part is globally empty). */
    if (weights[0]==0.0) {
        ltargetw0 = targetw0 / hgc->nProc_x;
        lmax_weight[0] = max_weight[0] / hgc->nProc_x;
    } else {
        lmax_weight[0] = (weights[0]==0.0) ? 0.0 : lweights[0] +
            (max_weight[0] - weights[0]) * ( lweights[0] / weights[0] );
        ltargetw0 = targetw0 * ( lweights[0] / weights[0] ); /* local target weight */
    }
    if (weights[1]==0.0)
        lmax_weight[1] = max_weight[1] / hgc->nProc_x;
    else
        lmax_weight[1] = (weights[1]==0.0) ? 0.0 : lweights[1] +
            (max_weight[1] - weights[1]) * ( lweights[1] / weights[1] );

    total_lweight = lweights[0]+lweights[1];

    avail[0] = MAX(0.0, lmax_weight[0]-total_lweight);
    avail[1] = MAX(0.0, lmax_weight[1]-total_lweight);
    availcnt[0] = (avail[0] == 0) ? 1 : 0;
    availcnt[1] = (avail[1] == 0) ? 1 : 0;
    MPI_Allreduce(avail, gavail, 2, MPI_DOUBLE, MPI_SUM, hgc->row_comm);
    MPI_Allreduce(availcnt, gavailcnt, 2, MPI_INT, MPI_SUM, hgc->row_comm);

#ifdef _DEBUG
    if (gavailcnt[0] || gavailcnt[1])
        uprintf(hgc, "before adjustment, LMW[%.1lf, %.1lf]\n", lmax_weight[0], lmax_weight[1]);
#endif

    if (gavailcnt[0])
        lmax_weight[0] += gavail[0] / (double) gavailcnt[0];

    if (gavailcnt[1])
        lmax_weight[1] += gavail[1] / (double) gavailcnt[1];

    /* Our strategy is to stay close to the current local weight balance.
       We do not need the same local balance on each proc, as long as
       we achieve approximate global balance.                            */

#ifdef _DEBUG
    imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
    limbal = (ltargetw0==0.0) ? 0.0 : fabs(lweights[0]-ltargetw0)/ltargetw0;
    uprintf(hgc, "H(%d, %d, %d), FM2: W[%.1lf, %.1lf] MW:[%.1lf, %.1lf] I=%.3lf  LW[%.1lf, %.1lf] LMW[%.1lf, %.1lf] LI=%.3lf\n", hg->nVtx, hg->nEdge, hg->nPins, weights[0], weights[1], max_weight[0], max_weight[1], imbal, lweights[0], lweights[1], lmax_weight[0], lmax_weight[1], limbal);
#endif

    /* pins/lpins hold two arrays of nEdge counters each (part 0, part 1)
       in a single allocation; see the pins[1]/lpins[1] setup below. */
    if ((hg->nEdge && (!(pins[0]    = (int*) ZOLTAN_MALLOC(2 * hg->nEdge * sizeof(int)))
                      || !(lpins[0] = (int*) ZOLTAN_CALLOC(2 * hg->nEdge, sizeof(int))))) ||
        (hg->nVtx && (!(moves   = (int*)   ZOLTAN_MALLOC(hg->nVtx * sizeof(int)))
                     || !(lgain = (float*) ZOLTAN_MALLOC(hg->nVtx * sizeof(float))))))
        MEMORY_ERROR;

    if (hg->nEdge) {
        pins[1] = &(pins[0][hg->nEdge]);
        lpins[1] = &(lpins[0][hg->nEdge]);
    }

    if (hgc->myProc_y==rootRank) { /* only root needs mark, adj, gain and heaps*/
        Zoltan_Heap_Init(zz, &heap[0], hg->nVtx);
        Zoltan_Heap_Init(zz, &heap[1], hg->nVtx);
        if (hg->nVtx &&
            (!(mark     = (int*)   ZOLTAN_CALLOC(hg->nVtx, sizeof(int)))
             || !(adj   = (int*)   ZOLTAN_MALLOC(hg->nVtx * sizeof(int)))
             || !(gain  = (float*) ZOLTAN_MALLOC(hg->nVtx * sizeof(float)))))
            MEMORY_ERROR;
    }

    /* Initial calculation of the local pin distribution (sigma in UVC's papers)  */
    if (detail_timing)
        ZOLTAN_TIMER_START(zz->ZTime, timer->rfpins, hgc->Communicator);
    for (i = 0; i < hg->nEdge; ++i)
        for (j = hg->hindex[i]; j < hg->hindex[i+1]; ++j){
            ++(lpins[part[hg->hvertex[j]]][i]);
        }
    if (detail_timing)
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfpins, hgc->Communicator);

#ifdef HANDLE_ISOLATED_VERTICES
    /* first compute vertex degree to find any isolated vertices
       we use lgain and gain, as ldeg, deg.*/
    if (hg->nVtx) {
        if (detail_timing)
            ZOLTAN_TIMER_START(zz->ZTime, timer->rfiso, hgc->Communicator);
        ldeg = (int *) lgain;
        deg = (int *) gain; /* null for non-root but that is fine */
        for (i = 0; i < hg->nVtx; ++i)
            ldeg[i] = hg->vindex[i+1] - hg->vindex[i];
        MPI_Reduce(ldeg, deg, hg->nVtx, MPI_INT, MPI_SUM, rootRank,
                   hg->comm->col_comm);

        if (hgc->myProc_y==rootRank) { /* root marks isolated vertices */
            for (i=0; i<hg->nVtx; ++i)
                if (!hgp->UseFixedVtx || hg->fixed_part[i]<0) {
                    if (!deg[i]) {
                        moves[--isocnt] = i;
                        part[i] = -(part[i]+1); /* remove those vertices from that part*/
                    }
                }
        }
        if (detail_timing)
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfiso, hgc->Communicator);
    }
#endif

    do {
        int v=1, movecnt=0, neggaincnt=0, from, to;
        int maxneggain = (hgp->fm_max_neg_move < 0) ? hg->nVtx : hgp->fm_max_neg_move;
        int notfeasible=(weights[0]>max_weight[0]) || (weights[1]>max_weight[1]);

        /* now compute global pin distribution */
        if (hg->nEdge) {
            if (detail_timing)
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfpins, hgc->Communicator);
            MPI_Allreduce(lpins[0], pins[0], 2*hg->nEdge, MPI_INT, MPI_SUM,
                          hgc->row_comm);
            if (detail_timing)
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfpins, hgc->Communicator);
        }

        /* now we can compute actual cut */
        best_cutsizeat=0;
        cutsize = 0.0;
        for (i=0; i < hg->nEdge; ++i) {
            if (pins[0][i] && pins[1][i])
                cutsize += (hg->ewgt ? hg->ewgt[i] : 1.0);
        }
        MPI_Allreduce(&cutsize, &best_cutsize, 1, MPI_DOUBLE, MPI_SUM, hgc->col_comm);
        cutsize = best_cutsize;

        imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
        best_limbal = limbal = (ltargetw0==0.0) ? 0.0
            : fabs(lweights[0]-ltargetw0)/ltargetw0;

        /* UVCUVC: it looks like instead of moving always from overloaded
           part, alternating the 'from' part gives better results.
           Hence if the imbal is not really bad (2x worse) we use that approach  */
        if (imbal > BADBALANCE*(bal_tol-1.0) ) /* decide which way the moves will be in this pass */
            from = (weights[0] < targetw0) ? 1 : 0;
        else
            from = passcnt % 2;
        /* we want to be sure that everybody!!! picks the same source */
        MPI_Bcast(&from, 1, MPI_INT, 0, hgc->Communicator);

        to = 1-from;

#ifdef _DEBUG
        /* Just for debugging */
        best_cutsize = Zoltan_PHG_Compute_NetCut(hgc, hg, part);
        if (best_cutsize!=cutsize) {
            errexit("%s: Initial cutsize=%.2lf Verify: total=%.2lf\n", uMe(hgc), cutsize,
                    best_cutsize);
        }
        if (hgc->myProc_y==rootRank)
            for (i = 0; i< hg->nVtx; ++i)
                if (mark[i])
                    errexit("mark[%d]=%d", i, mark[i]);
        /* debuggging code ends here */
#endif

        /* compute only the gains of the vertices from 'from' part */
        if (detail_timing)
            ZOLTAN_TIMER_START(zz->ZTime, timer->rfgain, hgc->Communicator);

        for (i = 0; i < hg->nVtx; ++i) {
            lgain[i] = 0.0;
            if ((part[i]==from) && (!hgp->UseFixedVtx || hg->fixed_part[i]<0))
                for (j = hg->vindex[i]; j < hg->vindex[i+1]; j++) {
                    int edge = hg->vedge[j];
                    if ((pins[0][edge]+pins[1][edge])>1) { /* if they have at least 2 pins :) */
                        if (pins[part[i]][edge] == 1)
                            lgain[i] += (hg->ewgt ? hg->ewgt[edge] : 1.0);
                        else if (pins[1-part[i]][edge] == 0)
                            lgain[i] -= (hg->ewgt ? hg->ewgt[edge] : 1.0);
                    }
                }
        }
        /* now sum up all gains on only root proc */
        if (hg->nVtx)
            MPI_Reduce(lgain, gain, hg->nVtx, MPI_FLOAT, MPI_SUM, rootRank,
                       hgc->col_comm);
        if (detail_timing)
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfgain, hgc->Communicator);

        if (hgp->output_level >= PHG_DEBUG_ALL) {
            imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
            printf("%s FM Pass %d (%d->%d) Cut=%.2f W[%5.0f, %5.0f] I= %.2f LW[%5.0f, %5.0f] LI= %.2f\n", uMe(hgc), passcnt, from, to, cutsize, weights[0], weights[1], imbal, lweights[0], lweights[1], limbal);
        }

        if (hgc->myProc_y==rootRank) {
            /* those are the lucky ones; each proc in column-group
               could have compute the same moves concurrently; but for this
               version we'll do it in the root procs and broadcast */

#ifdef HANDLE_ISOLATED_VERTICES
            if (detail_timing)
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfiso, hgc->Communicator);
            lwadjust[0] = lwadjust[1] = 0.0;
            for (i=isocnt; i < hg->nVtx; ++i) { /* go over isolated vertices */
                int   u=moves[i], pno=-part[u]-1;
                float w=(hg->vwgt ? hg->vwgt[u*hg->VtxWeightDim] : 1.0);

                if (pno<0 || pno>1)
                    errexit("heeeey pno=%d", pno);
                /* let's remove it from its part */
                lwadjust[pno] -= w;
            }
            lweights[0] += lwadjust[0];
            lweights[1] += lwadjust[1];
            if (detail_timing)
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfiso, hgc->Communicator);
#endif

            if (detail_timing)
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfheap, hgc->Communicator);

            /* Initialize the heaps and fill them with the gain values */
            Zoltan_Heap_Clear(&heap[from]);
            for (i = 0; i < hg->nVtx; ++i)
                if ((part[i]==from) && (!hgp->UseFixedVtx || hg->fixed_part[i]<0))
                    Zoltan_Heap_Input(&heap[from], i, gain[i]);
            Zoltan_Heap_Make(&heap[from]);
            if (detail_timing) {
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfheap, hgc->Communicator);
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfpass, hgc->Communicator);
            }

            while ((neggaincnt < maxneggain) && ((lweights[to]+minvw) <= lmax_weight[to]) ) {
                if (Zoltan_Heap_Empty(&heap[from])) { /* too bad it is empty */
                    v = -1;
                    break;
                }

                v = Zoltan_Heap_Extract_Max(&heap[from]);

#ifdef _DEBUG
                if (from != part[v])
                    errexit("hooop from=%d part[%d]=%d", from, v, part[v]);
#endif

                /* Mark vertex we picked from the heap so it is "locked".
                   For the current strategy, moving only one direction
                   at a time, the mark information is not critical.
                   Note that the mark array is also used in the move/update
                   routine so don't remove it! */
                ++mark[v];
                if (lweights[to]+((hg->vwgt)?hg->vwgt[v*hg->VtxWeightDim]:1.0) > lmax_weight[to]) {
#ifdef _DEBUG2
                    printf("%s %4d: %6d (g: %5.1lf), p:%2d [%4.0lf, %4.0lf] NF\n", uMe(hgc), movecnt, v, gain[v], from, weights[0], weights[1]);
#endif
                    /* Negative value in moves array means we have examined
                       the vertex but couldn't move it. Note offset by one,
                       otherwise zero would be ambiguous. */
                    moves[movecnt++] = -(v+1);
                    continue;
                }

                moves[movecnt] = v;
                ++neggaincnt;
                cutsize -= gain[v];

                fm2_move_vertex_oneway(v, hg, part, gain, heap, pins, lpins, weights, lweights, mark, adj);
                imbal = (targetw0==0.0) ? 0.0
                    : fabs(weights[0]-targetw0)/targetw0;
                limbal = (ltargetw0==0.0) ? 0.0
                    : fabs(lweights[0]-ltargetw0)/ltargetw0;

                if (notfeasible || (cutsize<best_cutsize) ||
                    (cutsize==best_cutsize && limbal < best_limbal)) {
#ifdef _DEBUG2
                    printf("%s %4d: %6d (g: %5.1lf), p:%2d W[%4.0lf, %4.0lf] I:%.2lf LW[%4.0lf, %4.0lf] LI:%.2lf C:%.1lf<-- Best\n", uMe(hgc), movecnt, v, gain[v], from, weights[0], weights[1], imbal, lweights[0], lweights[1], limbal, cutsize); /* after move gain is -oldgain */
#endif
                    notfeasible = weights[from]>max_weight[from];
                    best_cutsize = cutsize;
                    best_cutsizeat = movecnt+1;
                    best_limbal = limbal;
                    neggaincnt = 0;
                }
#ifdef _DEBUG2
                else
                    printf("%s %4d: %6d (g: %5.1lf), p:%2d [%4.0lf, %4.0lf] %.1lf\n", uMe(hgc), movecnt, v, gain[v], from, weights[0], weights[1], cutsize);
#endif
                ++movecnt;
            }

            if (detail_timing) {
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfpass, hgc->Communicator);
                ZOLTAN_TIMER_START(zz->ZTime, timer->rfroll, hgc->Communicator);
            }

#ifdef _DEBUG
            if (v<0)
                uprintf(hgc, "EOLB @ %d there was no vertex to select: v=%d\n", movecnt, v);
            else if (neggaincnt >= maxneggain)
                /* format now has one conversion per argument */
                uprintf(hgc, "EOLB @ %d max neg move reached neggaincnt(%d) >= maxneggain(%d)\n", movecnt, neggaincnt, maxneggain);
            else
                uprintf(hgc, "EOLB @ %d balance constraint LW[%.1lf, %.1lf] and MAXW[%.1lf, %.1lf]\n", movecnt, lweights[0], lweights[1], lmax_weight[0], lmax_weight[1]);
#endif

            /* roll back the moves without any improvement */
            for (i=movecnt-1; i>=best_cutsizeat; --i) {
                int vv = moves[i];
                if (vv<0)
                    vv = -vv-1;
                else /* we don't need to roll pins, or weights etc; rolling local ones suffices */
                    fm2_move_vertex_oneway_nonroot(vv, hg, part, lpins, lweights);
                mark[vv] = 0;
            }
            /* unlock the vertices of the kept prefix as well */
            for (i=0; i<best_cutsizeat; ++i){
                int vv = (moves[i] < 0 ) ? -moves[i] - 1 : moves[i];
                mark[vv] = 0;
            }
            if (detail_timing)
                ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfroll, hgc->Communicator);
        }

        if (detail_timing)
            ZOLTAN_TIMER_START(zz->ZTime, timer->rfnonroot, hgc->Communicator);

        /* now root bcast moves to column procs */
        MPI_Bcast(&best_cutsizeat, 1, MPI_INT, rootRank, hgc->col_comm);
        MPI_Bcast(moves, best_cutsizeat, MPI_INT, rootRank, hgc->col_comm);
        if (hgc->myProc_y!=rootRank) { /* now non-root does move simulation */
            for (i=0; i<best_cutsizeat; ++i) {
                int vv = moves[i];
                if (vv>=0)
                    fm2_move_vertex_oneway_nonroot(vv, hg, part, lpins, lweights);
            }
        }
        if (detail_timing)
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfnonroot, hgc->Communicator);

#ifdef _DEBUG
        for (i = 0; i < hg->nEdge; ++i) {
            int lp[2];

            lp[0] = lp[1] = 0;
            for (j = hg->hindex[i]; j < hg->hindex[i+1]; ++j)
                ++(lp[part[hg->hvertex[j]]]);
            if ((lp[0] != lpins[0][i]) || (lp[1] != lpins[1][i]))
                errexit("for net %d -- lp=[%d, %d] lpins[%d, %d]", i, lp[0], lp[1], lpins[0][i], lpins[1][i]);
        }
#endif

#ifdef HANDLE_ISOLATED_VERTICES
        if (detail_timing)
            ZOLTAN_TIMER_START(zz->ZTime, timer->rfiso, hgc->Communicator);

#if 0
        MPI_Allreduce(lweights, weights, 2, MPI_DOUBLE, MPI_SUM, hgc->row_comm);
        best_imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
        if (hgc->myProc_y==rootRank)
            uprintf(hgc, "BEFORE ISOLATED VERTEX HANDLING WE *THINK* GLOBAL IMBALANCE is %.3lf\n", best_imbal);
#endif

        if (hgc->myProc_y==rootRank) {
            best_limbal = (ltargetw0==0.0) ? 0.0
                : fabs(lweights[0]-ltargetw0)/ltargetw0;

            for (i=isocnt; i < hg->nVtx; ++i) { /* go over isolated vertices */
                int u = moves[i], npno;
                float w=(hg->vwgt ? hg->vwgt[u*hg->VtxWeightDim] : 1.0);

                npno = (lweights[0] < ltargetw0) ? 0 : 1;
                lweights[npno] += w;
                lwadjust[npno] += w;
                part[u] = -(npno+1); /* move to npno (might be same as pno;
                                        so it may not be a real move */
            }
            limbal = (ltargetw0==0.0) ? 0.0
                : fabs(lweights[0]-ltargetw0)/ltargetw0;
#if 0
            uprintf(hgc, "before binpacking of %d isolated vertices balance was: %.3lf now: %.3lf\n", hg->nVtx-isocnt, best_limbal, limbal);
#endif
        }

        MPI_Bcast(lwadjust, 2, MPI_DOUBLE, rootRank, hgc->col_comm);
        if (hgc->myProc_y!=rootRank) {
            lweights[0] += lwadjust[0];
            lweights[1] += lwadjust[1];
        }
        if (detail_timing)
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfiso, hgc->Communicator);
#endif

        MPI_Allreduce(lweights, weights, 2, MPI_DOUBLE, MPI_SUM, hgc->row_comm);
#if 0
        best_imbal = (targetw0==0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
        if (hgc->myProc_y==rootRank)
            uprintf(hgc, "NEW GLOBAL IMBALANCE is %.3lf\n", best_imbal);
#endif

        /* Recompute the local limits from the updated weights. */
        if (weights[0]==0.0)
            ltargetw0 = lmax_weight[0] = 0.0;
        else {
            lmax_weight[0] = lweights[0] +
                (max_weight[0] - weights[0]) * ( lweights[0] / weights[0] );
            ltargetw0 = targetw0 * ( lweights[0] / weights[0] ); /* local target weight */
        }
        lmax_weight[1] = (weights[1]==0.0) ? 0.0 : lweights[1] +
            (max_weight[1] - weights[1]) * ( lweights[1] / weights[1] );

        cont = 0;
        MPI_Allreduce(&best_cutsizeat, &cont, 1, MPI_INT, MPI_LOR, hgc->row_comm);

        /* since we're only moving in one direction; make sure two successive
           pass didn't produce any improvement before terminating */
        if (!cont)
            ++successivefails;
        else
            successivefails = 0;
#ifdef _DEBUG
        /* Just for debugging */
        best_cutsize = Zoltan_PHG_Compute_NetCut(hgc, hg, part);
        imbal = (targetw0 == 0.0) ? 0.0 : fabs(weights[0]-targetw0)/targetw0;
        printf("%s End of Pass %d Comp.Cut=%.2lf RealCut=%.2lf W[%5.0lf, %5.0lf] Imbal=%.2lf\n", uMe(hgc), passcnt, cutsize, best_cutsize, weights[0], weights[1], imbal);
        /* debuggging code ends here */
#endif
    } while (successivefails<2 &&  (++passcnt < hgp->fm_loop_limit));

#ifdef HANDLE_ISOLATED_VERTICES
    if (detail_timing)
        ZOLTAN_TIMER_START(zz->ZTime, timer->rfiso, hgc->Communicator);
    /* now root sneds the final part no's of isolated vertices; if any */
    MPI_Bcast(&isocnt, 1, MPI_INT, rootRank, hgc->col_comm);
    if (isocnt<hg->nVtx) {
        deg = (int *) lgain; /* we'll use for part no's of isolated vertices */
        if (hgc->myProc_y==rootRank)
            for (i=isocnt; i < hg->nVtx; ++i) { /* go over isolated vertices */
                int u = moves[i];
                deg[i] = part[u] = -part[u]-1;
            }

        MPI_Bcast(&moves[isocnt], hg->nVtx-isocnt, MPI_INT, rootRank, hgc->col_comm);
        /* address of deg[isocnt]; written as pointer arithmetic because the
           address-of form of this expression had been corrupted */
        MPI_Bcast(deg + isocnt, hg->nVtx-isocnt, MPI_INT, rootRank, hgc->col_comm);
        if (hgc->myProc_y!=rootRank)
            for (i=isocnt; i < hg->nVtx; ++i)  /* go over isolated vertices */
                part[moves[i]] = deg[i];
    }
    if (detail_timing)
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfiso, hgc->Communicator);
#endif

 End:

    if (hgc->myProc_y==rootRank) { /* only root needs mark, adj, gain and heaps*/
        Zoltan_Multifree(__FILE__,__LINE__, 3, &mark, &adj, &gain);
        Zoltan_Heap_Free(&heap[0]);
        Zoltan_Heap_Free(&heap[1]);
    }

    Zoltan_Multifree(__FILE__, __LINE__, 4, &pins[0], &lpins[0], &moves, &lgain);

    if (do_timing)
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->rfrefine, hgc->Communicator);

    ZOLTAN_TRACE_EXIT(zz, yo);
    return ierr;
}
/*
 * Zoltan_PHG_CoarsePartition computes a partitioning of a hypergraph.
 * Typically, this routine is called at the bottom level in a
 * multilevel scheme (V-cycle).
 *
 * Special cases handled up front:
 *   - coarse partitioning disabled ("no"/"none"): only sanitize/map the
 *     incoming part numbers into [0, numPart-1];
 *   - numPart == 1: everything goes to part 0;
 *   - at least as many parts as global vertices (and no preferred parts):
 *     each vertex gets its own part;
 *   - local coarse partitioning requested, or forced because the global
 *     graph is large: delegate to local_coarse_partitioner().
 *
 * Normal case: gather the distributed hypergraph to each processor,
 * compute num_coarse_iter candidate partitions of the serial hypergraph
 * on each processor using different random numbers (and possibly also
 * different algorithms), refine each with one FM pass, and select the best.
 *
 * Returns ZOLTAN_OK on success, ZOLTAN_WARN if out-of-range initial part
 * numbers had to be remapped, ZOLTAN_MEMERR on allocation failure, or the
 * (negative) error code reported by a callee.
 *
 * All temporary storage owned by this routine (spart, bestvals, and the
 * gathered serial hypergraph) is released at the single exit point, so
 * error paths do not leak.
 */
int Zoltan_PHG_CoarsePartition(
  ZZ *zz,
  HGraph *phg,         /* Input:  coarse hypergraph -- distributed! */
  int numPart,         /* Input:  number of partitions to generate. */
  float *part_sizes,   /* Input:  array of size numPart listing target sizes
                                  (% of work) for the partitions */
  Partition part,      /* Input:  array of initial partition assignments.
                          Output: array of computed partition assignments. */
  PHGPartParams *hgp   /* Input:  parameters to use. */
)
{
  char *yo = "Zoltan_PHG_CoarsePartition";
  int ierr = ZOLTAN_OK;
  int i, si, j;
  static PHGComm scomm;          /* Serial communicator info */
  static int first_time = 1;
  HGraph *shg = NULL;            /* Serial hypergraph gathered from phg */
  int *spart = NULL;             /* Partition vectors for shg. */
  int *new_part = NULL;          /* Ptr to new partition vector. */
  float *bestvals = NULL;        /* Best cut values found so far */
  int worst, new_cand;
  float bal, cut, worst_cut;
  int fine_timing = (hgp->use_timers > 2);
  struct phg_timer_indices *timer = Zoltan_PHG_LB_Data_timers(zz);
  int local_coarse_part = hgp->LocalCoarsePartition;

  /* Number of iterations to try coarse partitioning on each proc. */
  /* 10 when p=1, and 1 when p is large. */
  const int num_coarse_iter = 1 + 9/zz->Num_Proc;

  ZOLTAN_TRACE_ENTER(zz, yo);

  if (fine_timing) {
    if (timer->cpgather < 0)
      timer->cpgather = Zoltan_Timer_Init(zz->ZTime, 1, "CP Gather");
    if (timer->cprefine < 0)
      timer->cprefine = Zoltan_Timer_Init(zz->ZTime, 0, "CP Refine");
    if (timer->cpart < 0)
      timer->cpart = Zoltan_Timer_Init(zz->ZTime, 0, "CP Part");

    ZOLTAN_TIMER_START(zz->ZTime, timer->cpart, phg->comm->Communicator);
  }

  /* Force LocalCoarsePartition if large global graph */
#define LARGE_GRAPH_VTX   64000
#define LARGE_GRAPH_PINS 256000
  if (phg->dist_x[phg->comm->nProc_x] > LARGE_GRAPH_VTX){
    /* TODO: || (global_nPins > LARGE_GRAPH_PINS) */
    local_coarse_part = 1;
  }

  /* take care of all special cases first */

  if (!strcasecmp(hgp->coarsepartition_str, "no")
      || !strcasecmp(hgp->coarsepartition_str, "none")) {
    /* Do no coarse partitioning. */
    /* Do a sanity test and mapping to parts [0,...,numPart-1] */
    int first = 1;
    PHGComm *hgc = phg->comm;

    Zoltan_Srand_Sync(Zoltan_Rand(NULL), &(hgc->RNGState_col), hgc->col_comm);

    if (hgp->UsePrefPart) {
      for (i = 0; i < phg->nVtx; i++) {
        /* Impose fixed vertex/preferred part constraints. */
        if (phg->pref_part[i] < 0) {
          /* Free vertex in fixedvertex partitioning or repart */
          /* randomly assigned to a part */
          part[i] = Zoltan_Rand_InRange(&(hgc->RNGState_col), numPart);
        } else {
          if (phg->bisec_split < 0)
            /* direct k-way, use part numbers directly */
            part[i] = phg->pref_part[i];
          else
            /* recursive bisection, map to 0-1 part numbers */
            part[i] = (phg->pref_part[i] < phg->bisec_split ? 0 : 1);
        }
      }
    } else {
      for (i = 0; i < phg->nVtx; i++) {
        if (part[i] >= numPart || part[i] < 0) {
          /* Warn only once per call, then fold the value into range. */
          if (first) {
            ZOLTAN_PRINT_WARN(zz->Proc, yo, "Initial part number > numParts.");
            first = 0;
            ierr = ZOLTAN_WARN;
          }
          part[i] = ((part[i] < 0) ? -part[i] : part[i]) % numPart;
        }
      }
    }
  }
  else if (numPart == 1) {
    /* everything goes in the one partition */
    for (i = 0; i < phg->nVtx; i++)
      part[i] = 0;
  }
  else if (!hgp->UsePrefPart && numPart >= phg->dist_x[phg->comm->nProc_x]) {
    /* more partitions than vertices, trivial answer */
    for (i = 0; i < phg->nVtx; i++)
      part[i] = phg->dist_x[phg->comm->myProc_x] + i;
  }
  else if (local_coarse_part) {
    /* Apply local partitioner to each column */
    ierr = local_coarse_partitioner(zz, phg, numPart, part_sizes, part, hgp,
                                    hgp->CoarsePartition);
  }
  else {
    /* Normal case:
     * Gather distributed HG to each processor;
     * compute different partitioning on each processor;
     * select the "best" result.
     */
    ZOLTAN_PHG_COARSEPARTITION_FN *CoarsePartition;

    /* Select different coarse partitioners for processors here. */
    CoarsePartition = hgp->CoarsePartition;

    if (CoarsePartition == NULL) { /* auto */
      /* Select a coarse partitioner from the array of coarse partitioners */
      CoarsePartition = CoarsePartitionFns[phg->comm->myProc %
                                           NUM_COARSEPARTITION_FNS];
    }

    if (phg->comm->nProc == 1) {
      /* Serial and parallel hgraph are the same. */
      /* shg aliases phg here, so it must NOT be freed at End. */
      shg = phg;
    }
    else {
      /* Set up a serial communication struct for gathered HG */
      if (first_time) {
        scomm.nProc_x = scomm.nProc_y = 1;
        scomm.myProc_x = scomm.myProc_y = 0;
        scomm.Communicator = MPI_COMM_SELF;
        scomm.row_comm = MPI_COMM_SELF;
        scomm.col_comm = MPI_COMM_SELF;
        scomm.myProc = 0;
        scomm.nProc = 1;
        first_time = 0;
      }
      scomm.RNGState = Zoltan_Rand(NULL);
      scomm.RNGState_row = Zoltan_Rand(NULL);
      scomm.RNGState_col = Zoltan_Rand(NULL);
      scomm.zz = zz;

      /*
       * Gather parallel hypergraph phg to each processor, creating
       * serial hypergraph shg.
       */
      if (fine_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->cpart, phg->comm->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->cpgather, phg->comm->Communicator);
      }

      ierr = Zoltan_PHG_Gather_To_All_Procs(zz, phg, hgp, &scomm, &shg);
      if (ierr < 0) {
        ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Error returned from gather.");
        /* NOTE(review): the state of shg after a failed gather is not
         * visible from here; do not take ownership of it so that the
         * cleanup at End cannot free something the callee already
         * released.  Confirm against Zoltan_PHG_Gather_To_All_Procs. */
        shg = NULL;
        goto End;
      }

      if (fine_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->cpgather, phg->comm->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->cpart, phg->comm->Communicator);
      }
    }

    /*
     * Allocate partition array spart for the serial hypergraph shg
     * and partition shg.
     * spart holds NUM_PART_KEEP+1 partition vectors of length shg->nVtx;
     * bestvals holds the objective value of each of those vectors.
     */
    spart = (int *) ZOLTAN_CALLOC(shg->nVtx * (NUM_PART_KEEP+1), sizeof(int));
    bestvals = (float *) ZOLTAN_MALLOC((NUM_PART_KEEP+1) * sizeof(*bestvals));
    if ((!spart) || (!bestvals)) {
      ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Out of memory.");
      ierr = ZOLTAN_MEMERR;
      goto End;
    }

    /* Compute several coarse partitionings. */
    /* Keep the NUM_PART_KEEP best ones around. */
    /* Currently, only the best one is used. */

    /* Set RNG so different procs compute different parts. */
    Zoltan_Srand(Zoltan_Rand(NULL) + zz->Proc, NULL);

    new_cand = 0;
    new_part = spart;

    for (i = 0; i < num_coarse_iter; i++) {
      int savefmlooplimit = hgp->fm_loop_limit;

      /* Overwrite worst partition with new candidate. */
      ierr = CoarsePartition(zz, shg, numPart, part_sizes, new_part, hgp);
      if (ierr < 0) {
        ZOLTAN_PRINT_ERROR(zz->Proc, yo,
                           "Error returned from CoarsePartition.");
        goto End;
      }

      /* time refinement step in coarse partitioner */
      if (fine_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->cpart, phg->comm->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->cprefine, phg->comm->Communicator);
      }

      /* UVCUVC: Refine new candidate: only one pass is enough. */
      /* fm_loop_limit is temporarily overridden and restored right after. */
      hgp->fm_loop_limit = 1;
      Zoltan_PHG_Refinement(zz, shg, numPart, part_sizes, new_part, hgp);
      hgp->fm_loop_limit = savefmlooplimit;

      /* stop refinement timer */
      if (fine_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->cprefine, phg->comm->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->cpart, phg->comm->Communicator);
      }

      /* Decide if candidate is in the top tier or not. */
      /* Our objective is a combination of cuts and balance */

      bal = Zoltan_PHG_Compute_Balance(zz, shg, part_sizes, 0,
                                       numPart, new_part);
      cut = Zoltan_PHG_Compute_ConCut(shg->comm, shg, new_part, numPart, &ierr);
      /* Check the error before the cut value is used. */
      if (ierr < 0) {
        ZOLTAN_PRINT_ERROR(zz->Proc, yo,
                           "Error returned from Zoltan_PHG_Compute_ConCut.");
        goto End;
      }

      /* Use ratio-cut as our objective. There are many other options! */
      bestvals[new_cand] = cut/(MAX(2.-bal, 0.0001)); /* avoid divide-by-0 */

      if (i < NUM_PART_KEEP) {
        /* Still filling the first slots: next candidate goes to slot i+1. */
        new_cand = i+1;
      }
      else {
        /* find worst partition vector, to overwrite it */
        /* future optimization: keep bestvals sorted */
        worst = 0;
        worst_cut = bestvals[0];
        for (j = 1; j < NUM_PART_KEEP+1; j++) {
          if (worst_cut < bestvals[j]) {
            worst_cut = bestvals[j];
            worst = j;
          }
        }
        new_cand = worst;
      }
      new_part = spart + new_cand*(shg->nVtx);
    }

    /* Copy last partition vector such that all the best ones
       are contiguous starting at spart.
       When the candidate slot already is the last slot there is nothing
       to move; skipping avoids a self-copy and a read of a bestvals
       entry that may never have been written. */
    if (new_cand != NUM_PART_KEEP) {
      for (i = 0; i < shg->nVtx; i++) {
        new_part[i] = spart[NUM_PART_KEEP*(shg->nVtx)+i];
      }
      /* Also update bestvals */
      bestvals[new_cand] = bestvals[NUM_PART_KEEP];
    }

    /* Evaluate and select the best. */
    /* For now, only pick the best one, in the future we pick the k best. */

    ierr = pick_best(zz, hgp, phg->comm, shg, numPart,
                     MIN(NUM_PART_KEEP, num_coarse_iter), spart, bestvals);
    if (ierr < 0) {
      ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Error returned from pick_best.");
      goto End;
    }

    if (phg->comm->nProc > 1) {
      /* Map gathered partition back to 2D distribution */
      for (i = 0; i < phg->nVtx; i++) {
        /* KDDKDD  Assume vertices in serial HG are ordered by GNO of phg */
        si = VTX_LNO_TO_GNO(phg, i);
        part[i] = spart[si];
      }
    }
    else { /* single processor */
      for (i = 0; i < phg->nVtx; i++)
        part[i] = spart[i];
    }
  }

End:
  /* Single cleanup point, reached on success and on every error path. */
  if (shg != NULL && shg != phg) {
    /* Only the gathered copy is owned here; never free the caller's phg. */
    Zoltan_HG_HGraph_Free(shg);
    ZOLTAN_FREE(&shg);
  }
  if (spart != NULL)
    ZOLTAN_FREE(&spart);
  if (bestvals != NULL)
    ZOLTAN_FREE(&bestvals);

  if (fine_timing) {
    ZOLTAN_TIMER_STOP(zz->ZTime, timer->cpart, phg->comm->Communicator);
  }

  ZOLTAN_TRACE_EXIT(zz, yo);
  return ierr;
}