/*
 * Release the storage held by an input-hypergraph (ZHG) structure: the
 * embedded HGraph, every per-object / per-edge / per-pin array listed
 * below, and the vertex communication plan.
 *
 *   zhg  Structure whose members are released.  The structure itself is
 *        not freed here.  A NULL pointer is accepted and is a no-op,
 *        matching the behavior of Zoltan_PHG_Free_Hypergraph_Data.
 *
 * Returns ZOLTAN_OK in every case.
 */
int Zoltan_Input_HG_Free(ZHG *zhg)
{
  if (zhg == NULL)
    return ZOLTAN_OK;

  /* Hypergraph built from the input. */
  Zoltan_HG_HGraph_Free(&zhg->HG);

  /* Per-object (vertex) arrays. */
  ZOLTAN_FREE(&(zhg->objWeight));
  ZOLTAN_FREE(&(zhg->objGNO));
  ZOLTAN_FREE(&(zhg->objGID));
  ZOLTAN_FREE(&(zhg->objLID));
  ZOLTAN_FREE(&(zhg->numHEdges));
  ZOLTAN_FREE(&(zhg->coor));
  ZOLTAN_FREE(&(zhg->fixed));
  ZOLTAN_FREE(&(zhg->Input_Parts));
  ZOLTAN_FREE(&(zhg->Output_Parts));
  ZOLTAN_FREE(&(zhg->AppObjSizes));

  /* Per-edge and per-pin arrays. */
  ZOLTAN_FREE(&(zhg->edgeGNO));
  ZOLTAN_FREE(&(zhg->Esize));
  ZOLTAN_FREE(&(zhg->Ewgt));
  ZOLTAN_FREE(&(zhg->pinGNO));
  ZOLTAN_FREE(&(zhg->Pin_Procs));

  /* Communication-related storage. */
  ZOLTAN_FREE(&(zhg->Recv_GNOs));
  Zoltan_Comm_Destroy(&(zhg->VtxPlan));

  return ZOLTAN_OK;
}
/*
 * Free the arrays attached to a ZHG structure and the HGraph embedded in
 * it.  The ZHG structure itself is left allocated.
 *
 *   zoltan_hg  Structure to clean up; NULL is tolerated (nothing happens).
 */
void Zoltan_PHG_Free_Hypergraph_Data(ZHG *zoltan_hg)
{
  if (zoltan_hg == NULL)
    return;

  /* The count (12) must match the number of pointer addresses passed. */
  Zoltan_Multifree(__FILE__, __LINE__, 12,
                   &zoltan_hg->GIDs,
                   &zoltan_hg->LIDs,
                   &zoltan_hg->Input_Parts,
                   &zoltan_hg->Output_Parts,
                   &zoltan_hg->AppObjSizes,
                   &zoltan_hg->Remove_EGIDs,
                   &zoltan_hg->Remove_ELIDs,
                   &zoltan_hg->Remove_Esize,
                   &zoltan_hg->Remove_GEsize,
                   &zoltan_hg->Remove_Ewgt,
                   &zoltan_hg->Remove_Pin_GIDs,
                   &zoltan_hg->Remove_Pin_Procs);

  Zoltan_HG_HGraph_Free(&zoltan_hg->HG);
}
/* Main partitioning function for hypergraph partitioning.
 *
 * Performs one multilevel V-cycle:
 *   1. Coarsening: repeatedly match vertices and build a coarser
 *      hypergraph, one VCycle record per level, until the loop condition
 *      below stops making progress.
 *   2. Coarse partitioning of the coarsest hypergraph.
 *   3. Uncoarsening: refine each level, then project its partition onto the
 *      next finer level (local matches via LevelMap, off-processor matches
 *      via the level's communication plan).
 *
 * Returns the last error code produced (ZOLTAN_OK / ZOLTAN_WARN on
 * success) or ZOLTAN_MEMERR when an allocation fails.
 *
 * NOTE(review): `del` is only assigned after coarse partitioning, so a
 * `goto End` taken during coarsening finds nothing to release in the
 * cleanup loop; VCycle levels built so far appear to be leaked on those
 * paths -- confirm against newVCycle/allocVCycle before changing.
 */
int Zoltan_PHG_Partition (
  ZZ *zz,               /* Zoltan data structure */
  HGraph *hg,           /* Input hypergraph to be partitioned */
  int p,                /* Input: number partitions to be generated */
  float *part_sizes,    /* Input: array of length p containing percentages
                           of work to be assigned to each partition */
  Partition parts,      /* Input: initial partition #s; aligned with vtx
                           arrays. Output: computed partition #s */
  PHGPartParams *hgp,   /* Input: parameters for hgraph partitioning. */
  int level)            /* Not referenced in this routine. */
{
  PHGComm *hgc = hg->comm;
  VCycle  *vcycle=NULL, *del=NULL;
  int  i, err = ZOLTAN_OK;
  /* Doubled so that the "made enough progress" test passes on first entry. */
  int  prevVcnt     = 2*hg->dist_x[hgc->nProc_x];
  int  prevVedgecnt = 2*hg->dist_y[hgc->nProc_y];
  char *yo = "Zoltan_PHG_Partition";
  static int timer_match = -1,    /* Timers for various stages */
             timer_coarse = -1,   /* Declared static so we can accumulate */
             timer_refine = -1,   /* times over calls to Zoltan_PHG_Partition */
             timer_coarsepart = -1,
             timer_project = -1,
             timer_vcycle = -1;   /* times everything in Vcycle not included
                                     in above timers */
  int do_timing = (hgp->use_timers > 1);
  int vcycle_timing = (hgp->use_timers > 4);

  ZOLTAN_TRACE_ENTER(zz, yo);

  if (do_timing) {
    if (timer_vcycle < 0)
      timer_vcycle = Zoltan_Timer_Init(zz->ZTime, 0, "Vcycle");
    if (timer_match < 0)
      timer_match = Zoltan_Timer_Init(zz->ZTime, 1, "Matching");
    if (timer_coarse < 0)
      timer_coarse = Zoltan_Timer_Init(zz->ZTime, 1, "Coarsening");
    if (timer_coarsepart < 0)
      timer_coarsepart = Zoltan_Timer_Init(zz->ZTime, 1, "Coarse_Partition");
    if (timer_refine < 0)
      timer_refine = Zoltan_Timer_Init(zz->ZTime, 1, "Refinement");
    if (timer_project < 0)
      timer_project = Zoltan_Timer_Init(zz->ZTime, 1, "Project_Up");

    ZOLTAN_TIMER_START(zz->ZTime, timer_vcycle, hgc->Communicator);
  }

  if (!(vcycle = newVCycle(zz, hg, parts, NULL, vcycle_timing))) {
    ZOLTAN_PRINT_ERROR (zz->Proc, yo, "VCycle is NULL.");
    ZOLTAN_TRACE_EXIT(zz, yo);
    return ZOLTAN_MEMERR;
  }

  /****** Coarsening ******/
#define COARSEN_FRACTION_LIMIT 0.9  /* Stop if we don't make much progress */
  while ((hg->redl>0) && (hg->dist_x[hgc->nProc_x] > hg->redl)
    && ((hg->dist_x[hgc->nProc_x] < (int) (COARSEN_FRACTION_LIMIT * prevVcnt + 0.5))
     || (hg->dist_y[hgc->nProc_y] < (int) (COARSEN_FRACTION_LIMIT * prevVedgecnt + 0.5)))
    && hg->dist_y[hgc->nProc_y] && hgp->matching) {
    int *match = NULL;
    VCycle *coarser=NULL;

    prevVcnt     = hg->dist_x[hgc->nProc_x];
    prevVedgecnt = hg->dist_y[hgc->nProc_y];

#ifdef _DEBUG
    /* UVC: load balance stats */
    Zoltan_PHG_LoadBalStat(zz, hg);
#endif

    if (hgp->output_level >= PHG_DEBUG_LIST) {
      uprintf(hgc, "START %3d |V|=%6d |E|=%6d #pins=%6d %d/%s/%s/%s p=%d...\n",
              hg->info, hg->nVtx, hg->nEdge, hg->nPins, hg->redl,
              hgp->redm_str, hgp->coarsepartition_str, hgp->refinement_str, p);
      if (hgp->output_level > PHG_DEBUG_LIST) {
        err = Zoltan_HG_Info(zz, hg);
        if (err != ZOLTAN_OK && err != ZOLTAN_WARN)
          goto End;
      }
    }
    if (hgp->output_level >= PHG_DEBUG_PLOT)
      Zoltan_PHG_Plot(zz->Proc, hg->nVtx, p, hg->vindex, hg->vedge, NULL,
                      "coarsening plot");

    if (do_timing) {
      ZOLTAN_TIMER_STOP(zz->ZTime, timer_vcycle, hgc->Communicator);
      ZOLTAN_TIMER_START(zz->ZTime, timer_match, hgc->Communicator);
    }
    if (vcycle_timing) {
      if (vcycle->timer_match < 0) {
        char str[80];
        sprintf(str, "VC Matching %d", hg->info);
        vcycle->timer_match = Zoltan_Timer_Init(vcycle->timer, 0, str);
      }
      ZOLTAN_TIMER_START(vcycle->timer, vcycle->timer_match,
                         hgc->Communicator);
    }

    /* Allocate and initialize Matching Array */
    if (hg->nVtx && !(match = (int*) ZOLTAN_MALLOC (hg->nVtx*sizeof(int)))) {
      ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Insufficient memory: Matching array");
      ZOLTAN_TRACE_EXIT(zz, yo);
      return ZOLTAN_MEMERR;
    }
    for (i = 0; i < hg->nVtx; i++)
      match[i] = i;            /* every vertex starts matched to itself */

    /* Calculate matching (packing or grouping) */
    err = Zoltan_PHG_Matching (zz, hg, match, hgp);
    if (err != ZOLTAN_OK && err != ZOLTAN_WARN) {
      ZOLTAN_FREE ((void**) &match);
      goto End;
    }
    if (vcycle_timing) {
      ZOLTAN_TIMER_STOP(vcycle->timer, vcycle->timer_match,
                        hgc->Communicator);
    }

    if (do_timing) {
      ZOLTAN_TIMER_STOP(zz->ZTime, timer_match, hgc->Communicator);
      ZOLTAN_TIMER_START(zz->ZTime, timer_coarse, hgc->Communicator);
    }
    if (vcycle_timing) {
      if (vcycle->timer_coarse < 0) {
        char str[80];
        sprintf(str, "VC Coarsening %d", hg->info);
        vcycle->timer_coarse = Zoltan_Timer_Init(vcycle->timer, 0, str);
      }
      ZOLTAN_TIMER_START(vcycle->timer, vcycle->timer_coarse,
                         hgc->Communicator);
    }

    if (!(coarser = newVCycle(zz, NULL, NULL, vcycle, vcycle_timing))) {
      ZOLTAN_FREE ((void**) &match);
      ZOLTAN_PRINT_ERROR (zz->Proc, yo, "coarser is NULL.");
      /* err still holds the matching status (OK/WARN); report the failure. */
      err = ZOLTAN_MEMERR;
      goto End;
    }

    /* Construct coarse hypergraph and LevelMap */
    err = Zoltan_PHG_Coarsening (zz, hg, match, coarser->hg, vcycle->LevelMap,
                                 &vcycle->LevelCnt, &vcycle->LevelSndCnt,
                                 &vcycle->LevelData, &vcycle->comm_plan, hgp);
    if (err != ZOLTAN_OK && err != ZOLTAN_WARN) {
      ZOLTAN_FREE ((void**) &match);   /* do not leak the matching array */
      goto End;
    }

    if (vcycle_timing) {
      ZOLTAN_TIMER_STOP(vcycle->timer, vcycle->timer_coarse,
                        hgc->Communicator);
    }
    if (do_timing) {
      ZOLTAN_TIMER_STOP(zz->ZTime, timer_coarse, hgc->Communicator);
      ZOLTAN_TIMER_START(zz->ZTime, timer_vcycle, hgc->Communicator);
    }

    ZOLTAN_FREE ((void**) &match);

    if ((err=allocVCycle(coarser))!= ZOLTAN_OK)
      goto End;

    /* Descend: the coarser level becomes the current one. */
    vcycle = coarser;
    hg = vcycle->hg;
  }

  if (hgp->output_level >= PHG_DEBUG_LIST) {
    uprintf(hgc, "START %3d |V|=%6d |E|=%6d #pins=%6d %d/%s/%s/%s p=%d...\n",
            hg->info, hg->nVtx, hg->nEdge, hg->nPins, hg->redl,
            hgp->redm_str, hgp->coarsepartition_str, hgp->refinement_str, p);
    if (hgp->output_level > PHG_DEBUG_LIST) {
      err = Zoltan_HG_Info(zz, hg);
      if (err != ZOLTAN_OK && err != ZOLTAN_WARN)
        goto End;
    }
  }
  if (hgp->output_level >= PHG_DEBUG_PLOT)
    Zoltan_PHG_Plot(zz->Proc, hg->nVtx, p, hg->vindex, hg->vedge, NULL,
                    "coarsening plot");

  /* free array that may have been allocated in matching */
  if (hgp->vtx_scal)
    ZOLTAN_FREE(&(hgp->vtx_scal));

  if (do_timing) {
    ZOLTAN_TIMER_STOP(zz->ZTime, timer_vcycle, hgc->Communicator);
    ZOLTAN_TIMER_START(zz->ZTime, timer_coarsepart, hgc->Communicator);
  }

  /****** Coarse Partitioning ******/
  err = Zoltan_PHG_CoarsePartition (zz, hg, p, part_sizes, vcycle->Part, hgp);
  if (err != ZOLTAN_OK && err != ZOLTAN_WARN)
    goto End;

  if (do_timing) {
    ZOLTAN_TIMER_STOP(zz->ZTime, timer_coarsepart, hgc->Communicator);
    ZOLTAN_TIMER_START(zz->ZTime, timer_vcycle, hgc->Communicator);
  }

  /* Remember the coarsest level; cleanup at End walks up from here. */
  del = vcycle;

  /****** Uncoarsening/Refinement ******/
  while (vcycle) {
    VCycle *finer = vcycle->finer;
    hg = vcycle->hg;

    if (do_timing) {
      ZOLTAN_TIMER_STOP(zz->ZTime, timer_vcycle, hgc->Communicator);
      ZOLTAN_TIMER_START(zz->ZTime, timer_refine, hgc->Communicator);
    }
    if (vcycle_timing) {
      if (vcycle->timer_refine < 0) {
        char str[80];
        sprintf(str, "VC Refinement %d", hg->info);
        vcycle->timer_refine = Zoltan_Timer_Init(vcycle->timer, 0, str);
      }
      ZOLTAN_TIMER_START(vcycle->timer, vcycle->timer_refine,
                         hgc->Communicator);
    }

    /* NOTE(review): the refinement status is stored in err but not tested
       here; it is only seen by the caller if nothing overwrites it. */
    err = Zoltan_PHG_Refinement (zz, hg, p, part_sizes, vcycle->Part, hgp);

    if (do_timing) {
      ZOLTAN_TIMER_STOP(zz->ZTime, timer_refine, hgc->Communicator);
      ZOLTAN_TIMER_START(zz->ZTime, timer_vcycle, hgc->Communicator);
    }
    if (vcycle_timing) {
      ZOLTAN_TIMER_STOP(vcycle->timer, vcycle->timer_refine,
                        hgc->Communicator);
    }

    if (hgp->output_level >= PHG_DEBUG_LIST)
      uprintf(hgc, "FINAL %3d |V|=%6d |E|=%6d #pins=%6d %d/%s/%s/%s p=%d bal=%.2f cutl=%.2f\n",
              hg->info, hg->nVtx, hg->nEdge, hg->nPins, hg->redl,
              hgp->redm_str, hgp->coarsepartition_str, hgp->refinement_str, p,
              Zoltan_PHG_Compute_Balance(zz, hg, part_sizes, p, vcycle->Part),
              Zoltan_PHG_Compute_ConCut(hgc, hg, vcycle->Part, p, &err));

    if (hgp->output_level >= PHG_DEBUG_PLOT)
      Zoltan_PHG_Plot(zz->Proc, hg->nVtx, p, hg->vindex, hg->vedge,
                      vcycle->Part, "partitioned plot");

    if (do_timing) {
      ZOLTAN_TIMER_STOP(zz->ZTime, timer_vcycle, hgc->Communicator);
      ZOLTAN_TIMER_START(zz->ZTime, timer_project, hgc->Communicator);
    }
    if (vcycle_timing) {
      if (vcycle->timer_project < 0) {
        char str[80];
        sprintf(str, "VC Project Up %d", hg->info);
        vcycle->timer_project = Zoltan_Timer_Init(vcycle->timer, 0, str);
      }
      ZOLTAN_TIMER_START(vcycle->timer, vcycle->timer_project,
                         hgc->Communicator);
    }

    /* Project coarse partition to fine partition */
    if (finer) {
      int *rbuffer;

      /* easy to undo internal matches */
      for (i = 0; i < finer->hg->nVtx; i++)
        if (finer->LevelMap[i] >= 0)
          finer->Part[i] = vcycle->Part[finer->LevelMap[i]];

      /* fill sendbuffer with part data for external matches I owned.
         LevelData holds pairs; only the second entry of each pair is
         rewritten, hence the extra increment. */
      for (i = 0; i < finer->LevelCnt; i++) {
        ++i;          /* skip return lno */
        finer->LevelData[i] = finer->Part[finer->LevelData[i]];
      }

      /* allocate rec buffer */
      rbuffer = NULL;
      if (finer->LevelSndCnt > 0) {
        rbuffer = (int*) ZOLTAN_MALLOC (2 * finer->LevelSndCnt * sizeof(int));
        if (!rbuffer) {
          ZOLTAN_PRINT_ERROR (zz->Proc, yo, "Insufficient memory.");
          ZOLTAN_TRACE_EXIT(zz, yo);
          return ZOLTAN_MEMERR;
        }
      }

      /* get partition assignments from owners of externally matched vtxs */
      Zoltan_Comm_Resize (finer->comm_plan, NULL, COMM_TAG, &i);
      Zoltan_Comm_Do_Reverse (finer->comm_plan, COMM_TAG+1,
                              (char*) finer->LevelData, 2 * sizeof(int), NULL,
                              (char*) rbuffer);

      /* process data to undo external matches */
      for (i = 0; i < 2 * finer->LevelSndCnt;) {
        int lno, partition;
        lno       = rbuffer[i++];
        partition = rbuffer[i++];
        finer->Part[lno] = partition;
      }

      ZOLTAN_FREE (&rbuffer);
      Zoltan_Comm_Destroy (&finer->comm_plan);
    }

    if (do_timing) {
      ZOLTAN_TIMER_STOP(zz->ZTime, timer_project, hgc->Communicator);
      ZOLTAN_TIMER_START(zz->ZTime, timer_vcycle, hgc->Communicator);
    }
    if (vcycle_timing) {
      ZOLTAN_TIMER_STOP(vcycle->timer, vcycle->timer_project,
                        hgc->Communicator);
    }

    vcycle = finer;
  }       /* while (vcycle) */

End:
  /* Walk from the coarsest recorded level up to the top, freeing each. */
  vcycle = del;
  while (vcycle) {
    if (vcycle_timing) {
      Zoltan_Timer_PrintAll(vcycle->timer, 0, hgc->Communicator, stdout);
      Zoltan_Timer_Destroy(&vcycle->timer);
    }
    if (vcycle->finer) {   /* cleanup by level */
      Zoltan_HG_HGraph_Free (vcycle->hg);
      Zoltan_Multifree (__FILE__, __LINE__, 4, &vcycle->Part,
                        &vcycle->LevelMap, &vcycle->LevelData, &vcycle->hg);
    }
    else                   /* cleanup top level */
      Zoltan_Multifree (__FILE__, __LINE__, 2, &vcycle->LevelMap,
                        &vcycle->LevelData);
    del = vcycle;
    vcycle = vcycle->finer;
    ZOLTAN_FREE(&del);
  }

  if (do_timing)
    ZOLTAN_TIMER_STOP(zz->ZTime, timer_vcycle, hgc->Communicator);
  ZOLTAN_TRACE_EXIT(zz, yo) ;
  return err;
}
/*
 * Gather the per-processor hyperedge (HE) triples onto every processor,
 * build a global two-pin hypergraph from them, run a matching on it, and
 * turn the matching into a part-remapping vector (zz->LB.Remap).
 *
 * Each HE contributes HEINFO_ENTRIES ints to HEinfo; the code reads entry 0
 * and entry 1 as the two pins and entry 2 as the edge weight.  Pin-1 values
 * are offset by max0 so that both pin ranges share one vertex numbering.
 *
 * After the matching, three candidates are compared with measure_stays():
 * no remap, the previous remap (zz->LB.OldRemap) and the new remap.  The
 * best one is kept in zz->LB.Remap (or no remap at all); OldRemap is always
 * consumed (freed or moved into Remap).
 *
 * Returns ZOLTAN_OK, ZOLTAN_MEMERR on allocation failure, or the negative
 * status returned by Zoltan_HG_Create_Mirror.  On an early error return
 * *new_map may not have been written.
 *
 * NOTE(review): the error-message prefix `yo` says "gather_and_remap",
 * which differs from the function name; left unchanged because it is
 * runtime output.
 */
static int gather_and_build_remap(
  ZZ *zz,
  int *new_map,               /* Upon return, flag indicating whether parts
                                 assignments were changed due to remap. */
  int HEcnt,                  /* # of HEs allocated. */
  int *HEinfo                 /* Array of HE info; for each HE, two pins and
                                 one edge weight. Stored as a single vector
                                 to minimize communication calls.  */
)
{
  char *yo = "gather_and_remap";
  int ierr = ZOLTAN_OK;
  int i, uidx, tmp;
  int *each_size = NULL;      /* sizes (# HEs * HEINFO_ENTRIES) for each proc */
  int *recvbuf = NULL;        /* Receive buffer for gatherv */
  int *displs = NULL;         /* Displacement buffer for gatherv */
  int send_size;              /* Local # HEs * HEINFO_ENTRIES */
  int total_size;             /* Total # ints in gatherv */
  int total_HEcnt;            /* Total (across all procs) number of HEs. */
  int max0, max1;             /* Max values of pin 0 and pin 1 for each HE. */
  int *match = NULL;          /* Vector describing the matching.
                                 match[i] = j ==> match[j] = i ==>
                                 vertices i and j are matched. */
  int *used = NULL;           /* Vector indicating which partitions are used
                                 in the matching. */
  int limit;                  /* Maximum number of matches that are allowed */
  HGraph hg;                  /* Hypergraph for matching */
  float before_remap = 0,     /* Amount of data that overlaps between old and */
        after_remap = 0;      /* new decomposition before and after remapping,
                                 respectively. */
  float with_oldremap = 0;    /* Amount of data that overlaps between old and
                                 new decomposition using the OldRemap vector
                                 (remapping from the previous decomposition). */

  /* Initialize hg before anything can jump to End: the cleanup code there
     calls Zoltan_HG_HGraph_Free(&hg) unconditionally. */
  Zoltan_HG_HGraph_Init(&hg);

  /* Gather HEs from each processor into a local complete HG. */

  each_size = (int *) ZOLTAN_MALLOC(zz->Num_Proc * sizeof(int));
  if (!each_size) {
    ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error.");
    ierr = ZOLTAN_MEMERR;
    goto End;
  }
  send_size = HEcnt * HEINFO_ENTRIES;
  MPI_Allgather(&send_size, 1, MPI_INT, each_size, 1, MPI_INT,
                zz->Communicator);

  for (total_size = 0, i = 0; i < zz->Num_Proc; i++) {
    total_size += each_size[i];
  }

  /* One allocation holds the gathered data followed by the displacement
     array (Num_Proc ints). */
  recvbuf = (int *) ZOLTAN_MALLOC((zz->Num_Proc + total_size) * sizeof(int));
  if (!recvbuf) {
    ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error.");
    ierr = ZOLTAN_MEMERR;
    goto End;
  }
  displs = recvbuf + total_size;

  displs[0] = 0;
  for (i = 1; i < zz->Num_Proc; i++)
    displs[i] = displs[i-1] + each_size[i-1];

  MPI_Allgatherv(HEinfo, send_size, MPI_INT,
                 recvbuf, each_size, displs, MPI_INT, zz->Communicator);

  total_HEcnt = total_size / HEINFO_ENTRIES;
  for (max0 = -1, max1 = -1, i = 0; i < total_HEcnt; i++) {
    tmp = i * HEINFO_ENTRIES;
    if (recvbuf[tmp] > max0) max0 = recvbuf[tmp];
    if (recvbuf[tmp+1] > max1) max1 = recvbuf[tmp+1];
  }
  /* Increment max0 and max1 so that they are the maximum number of unique
     pin values for pin0 and pin1 respectively; i.e., allow pin value == 0. */
  max0++;
  max1++;

  /* Sanity check */
  /* Ideally, max1 should equal LB.Num_Global_Parts, but ParMETIS3 sometimes
   * does not return the correct number of non-empty partitions, allowing
   * max1 to be less than LB.Num_Global_Parts.
   * (e.g., ewgt.adaptive-partlocal1-v3.4.?).
   */
  /* NOTE(review): this only reports the condition; execution continues. */
  if (max1 > zz->LB.Num_Global_Parts)
    ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Unexpected value for max1.");

  /* Set up global HG (hg was initialized at the top of the function) */
  if (total_HEcnt) {
    hg.nVtx = max0 + zz->LB.Num_Global_Parts;
    hg.nEdge = total_HEcnt;
    hg.nPins = total_HEcnt * 2;   /* two pins per HE */
    hg.EdgeWeightDim = 1;
    hg.ewgt = (float *) ZOLTAN_MALLOC(total_HEcnt * sizeof(float));
    hg.hindex = (int *) ZOLTAN_MALLOC((total_HEcnt + 1) * sizeof(int));
    hg.hvertex = (int *) ZOLTAN_MALLOC((hg.nPins) * sizeof(int));
    if (!hg.ewgt || !hg.hindex || !hg.hvertex) {
      ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error.");
      ierr = ZOLTAN_MEMERR;
      goto End;
    }

    for (i = 0; i < total_HEcnt; i++) {
      tmp = i * HEINFO_ENTRIES;
      hg.hindex[i] = i+i;
      hg.hvertex[i+i] = recvbuf[tmp];
      hg.hvertex[i+i+1] = recvbuf[tmp+1]+max0;   /* shift pin 1 past pin-0 range */
      hg.ewgt[i] = recvbuf[tmp+2];
    }
    hg.hindex[total_HEcnt] = total_HEcnt + total_HEcnt;

    ierr = Zoltan_HG_Create_Mirror(zz, &hg);
    if (ierr < 0) goto End;
  }

  before_remap = measure_stays(zz, &hg, max0, NULL, "BEFORE");

  /* Compute the amount of overlap when using the old remap vector. */

  with_oldremap = measure_stays(zz, &hg, max0, zz->LB.OldRemap, "WITHOLD");

  /* Do matching */

  /* match and used share one zero-filled allocation: match occupies the
     first hg.nVtx ints, used the following Num_Global_Parts ints. */
  match = (int *) ZOLTAN_CALLOC(hg.nVtx + zz->LB.Num_Global_Parts, sizeof(int));
  if (!match && (hg.nVtx + zz->LB.Num_Global_Parts) > 0) {
    /* match is indexed below even when hg.nVtx == 0, so any failed
       non-empty allocation must be reported. */
    ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error.");
    ierr = ZOLTAN_MEMERR;
    goto End;
  }
  used = match + hg.nVtx;

  /* Max # matches allowed */
  limit = (max0 < zz->LB.Num_Global_Parts ? max0 : zz->LB.Num_Global_Parts);
  do_match(zz, &hg, match, limit);

  /* Build remapping vector, if non-trivial matching was returned. */

  *new_map = 0;
  for (i = 0; i < zz->LB.Num_Global_Parts; i++)
    if (match[i+max0] != i+max0) {
      *new_map = 1;
      break;
    }

  if (*new_map) {

    zz->LB.Remap = (int *) ZOLTAN_MALLOC(zz->LB.Num_Global_Parts * sizeof(int));
    if (!(zz->LB.Remap)) {
      ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error.");
      ierr = ZOLTAN_MEMERR;
      goto End;
    }

    /* First, process all parts that were matched.  Mark matched parts as
       used. */

    for (i = 0; i < zz->LB.Num_Global_Parts; i++) {
      zz->LB.Remap[i] = -1;
      tmp = match[i+max0];
      if (tmp != i+max0) {
        zz->LB.Remap[i] = tmp;
        used[tmp] = 1;
      }
    }

    /* Second, process unmatched parts; if possible, keep same part number. */

    for (i = 0; i < zz->LB.Num_Global_Parts; i++) {
      if (zz->LB.Remap[i] > -1)
        continue;  /* Already processed part i */

      /* match[i+max0] == i+max0 */
      if (!used[i]) {  /* Keep the same part number if it is not used */
        zz->LB.Remap[i] = i;
        used[i] = 1;
      }
    }

    /* Third, process remaining unmatched parts; assign them to
       unused partitions.*/

    for (uidx = 0, i = 0; i < zz->LB.Num_Global_Parts; i++) {
      if (zz->LB.Remap[i] > -1)
        continue;  /* Already processed part i */

      /* match[i+max0] == i+max0 */
      while (used[uidx]) uidx++;   /* Find next unused partition */
      zz->LB.Remap[i] = uidx;
      used[uidx] = 1;
    }
  }

  if (*new_map)
    after_remap = measure_stays(zz, &hg, max0, zz->LB.Remap, "AFTER ");

  if ((before_remap >= after_remap) && (before_remap >= with_oldremap)) {
    /* No benefit from remapping; don't keep it! */
    ZOLTAN_FREE(&zz->LB.Remap);
    ZOLTAN_FREE(&zz->LB.OldRemap);
    *new_map = 0;
  }
  else if (with_oldremap >= after_remap) {
    /* The old remap vector is better than the new one; keep the old one. */
    ZOLTAN_FREE(&zz->LB.Remap);
    zz->LB.Remap = zz->LB.OldRemap;
    zz->LB.OldRemap = NULL;
    *new_map = 1;
  }
  else {
    /* Going to use the new remap vector; free the old one. */
    ZOLTAN_FREE(&zz->LB.OldRemap);
  }

  if (zz->Debug_Level >= ZOLTAN_DEBUG_ALL && zz->Proc == zz->Debug_Proc &&
      zz->LB.Remap)
    for (i = 0; i < zz->LB.Num_Global_Parts; i++)
      printf("%d REMAP Part %d to Part %d\n", zz->Proc, i, zz->LB.Remap[i]);

End:
  /* used and displs point into match and recvbuf; they need no free. */
  ZOLTAN_FREE(&match);
  ZOLTAN_FREE(&each_size);
  ZOLTAN_FREE(&recvbuf);
  Zoltan_HG_HGraph_Free(&hg);
  return ierr;
}
/* Main partitioning function for hypergraph partitioning.
 *
 * Performs one multilevel V-cycle:
 *   1. Coarsening: repeatedly match vertices and build a coarser
 *      hypergraph (one VCycle record per level).  When processor reduction
 *      is enabled (hgp->ProRedL > 0) and the global pin count has dropped
 *      far enough, the current level is redistributed onto processors
 *      0..middle through a freshly allocated PHGComm; processors left out
 *      (hgc->myProc < 0) skip straight to the Refine label.
 *   2. Coarse partitioning of the coarsest hypergraph.
 *   3. Uncoarsening: refine each level, then either project the partition
 *      to the finer level through that level's matching data (when
 *      finer->comm_plan exists) or ship it back to the processors that held
 *      the graph before a redistribution (otherwise).
 *
 * Returns the last error code produced (ZOLTAN_OK / ZOLTAN_WARN on
 * success) or ZOLTAN_MEMERR when an allocation fails.
 *
 * NOTE(review): `del` is only assigned at the Refine label, so a
 * `goto End` taken during coarsening finds nothing to release in the
 * cleanup loop; levels (and a reduced PHGComm) built so far appear to be
 * leaked on those paths -- confirm against newVCycle/allocVCycle.
 */
int Zoltan_PHG_Partition (
  ZZ *zz,               /* Zoltan data structure */
  HGraph *hg,           /* Input hypergraph to be partitioned */
  int p,                /* Input: number partitions to be generated */
  float *part_sizes,    /* Input: array of length p containing percentages
                           of work to be assigned to each partition */
  Partition parts,      /* Input: initial partition #s; aligned with vtx
                           arrays. Output: computed partition #s */
  PHGPartParams *hgp)   /* Input: parameters for hgraph partitioning. */
{
  PHGComm *hgc = hg->comm;
  VCycle  *vcycle=NULL, *del=NULL;
  int  i, err = ZOLTAN_OK, middle;
  ZOLTAN_GNO_TYPE origVpincnt; /* for processor reduction test */
  ZOLTAN_GNO_TYPE prevVcnt     = 2*hg->dist_x[hgc->nProc_x]; /* initialized so that the */
  ZOLTAN_GNO_TYPE prevVedgecnt = 2*hg->dist_y[hgc->nProc_y]; /* while loop will be entered
                                                               before any coarsening */
  ZOLTAN_GNO_TYPE tot_nPins, local_nPins;
  MPI_Datatype zoltan_gno_mpi_type;
  char *yo = "Zoltan_PHG_Partition";
  int do_timing = (hgp->use_timers > 1);
  int fine_timing = (hgp->use_timers > 2);
  /* Per-level timers are disabled whenever processor reduction is on. */
  int vcycle_timing = (hgp->use_timers > 4 && hgp->ProRedL == 0);
  short refine = 0;
  struct phg_timer_indices *timer = Zoltan_PHG_LB_Data_timers(zz);
  int reset_geometric_matching = 0;
  char reset_geometric_string[4];   /* holds "rcb" or "rib" plus the NUL */

  ZOLTAN_TRACE_ENTER(zz, yo);

  zoltan_gno_mpi_type = Zoltan_mpi_gno_type();

  if (do_timing) {
    if (timer->vcycle < 0)
      timer->vcycle = Zoltan_Timer_Init(zz->ZTime, 0, "Vcycle");
    if (timer->procred < 0)
      timer->procred = Zoltan_Timer_Init(zz->ZTime, 0, "Processor Reduction");
    if (timer->match < 0)
      timer->match = Zoltan_Timer_Init(zz->ZTime, 1, "Matching");
    if (timer->coarse < 0)
      timer->coarse = Zoltan_Timer_Init(zz->ZTime, 1, "Coarsening");
    if (timer->coarsepart < 0)
      timer->coarsepart = Zoltan_Timer_Init(zz->ZTime, 1, "Coarse_Partition");
    if (timer->refine < 0)
      timer->refine = Zoltan_Timer_Init(zz->ZTime, 1, "Refinement");
    if (timer->project < 0)
      timer->project = Zoltan_Timer_Init(zz->ZTime, 1, "Project_Up");

    ZOLTAN_TIMER_START(zz->ZTime, timer->vcycle, hgc->Communicator);
  }

  /* Global pin count of the input graph: baseline for processor reduction. */
  local_nPins = (ZOLTAN_GNO_TYPE)hg->nPins;
  MPI_Allreduce(&local_nPins,&tot_nPins,1,zoltan_gno_mpi_type,MPI_SUM,hgc->Communicator);
  origVpincnt = tot_nPins;

  if (!(vcycle = newVCycle(zz, hg, parts, NULL, vcycle_timing))) {
    ZOLTAN_PRINT_ERROR (zz->Proc, yo, "VCycle is NULL.");
    ZOLTAN_TRACE_EXIT(zz, yo);
    return ZOLTAN_MEMERR;
  }

  /* For geometric coarsening, hgp->matching pointer and string are reset
   * after geometric_levels of coarsening.  Will need to reset them after
   * this vcycle is completed.  Capture that fact now!
   */
  if (!strcasecmp(hgp->redm_str, "rcb") || !strcasecmp(hgp->redm_str, "rib")) {
    reset_geometric_matching = 1;
    strcpy(reset_geometric_string, hgp->redm_str);
  }

  /****** Coarsening ******/
#define COARSEN_FRACTION_LIMIT 0.9  /* Stop if we don't make much progress */
  while ((hg->redl>0) && (hg->dist_x[hgc->nProc_x] > (ZOLTAN_GNO_TYPE)hg->redl)
    && ((hg->dist_x[hgc->nProc_x] < (ZOLTAN_GNO_TYPE) (COARSEN_FRACTION_LIMIT * prevVcnt + 0.5))
        /* prevVcnt initialized to 2*hg->dist_x[hgc->nProc_x] */
     || (hg->dist_y[hgc->nProc_y] < (ZOLTAN_GNO_TYPE) (COARSEN_FRACTION_LIMIT * prevVedgecnt + 0.5)))
        /* prevVedgecnt initialized to 2*hg->dist_y[hgc->nProc_y] */
    && hg->dist_y[hgc->nProc_y] && hgp->matching) {
    ZOLTAN_GNO_TYPE *match = NULL;
    VCycle *coarser=NULL, *redistributed=NULL;
    PHGComm *reduced_comm = NULL;  /* communicator for a processor reduction */

    prevVcnt     = hg->dist_x[hgc->nProc_x];
    prevVedgecnt = hg->dist_y[hgc->nProc_y];

#ifdef _DEBUG
    /* UVC: load balance stats */
    Zoltan_PHG_LoadBalStat(zz, hg);
#endif

    if (hgp->output_level >= PHG_DEBUG_LIST) {
      uprintf(hgc, "START %3d |V|=%6d |E|=%6d #pins=%6d %d/%s/%s/%s p=%d...\n",
              hg->info, hg->nVtx, hg->nEdge, hg->nPins, hg->redl,
              hgp->redm_str, hgp->coarsepartition_str, hgp->refinement_str, p);
      if (hgp->output_level > PHG_DEBUG_LIST) {
        err = Zoltan_HG_Info(zz, hg);
        if (err != ZOLTAN_OK && err != ZOLTAN_WARN)
          goto End;
      }
    }
    if (hgp->output_level >= PHG_DEBUG_PLOT)
      Zoltan_PHG_Plot(zz->Proc, hg->nVtx, p, hg->vindex, hg->vedge, NULL,
                      "coarsening plot");

    if (do_timing) {
      ZOLTAN_TIMER_STOP(zz->ZTime, timer->vcycle, hgc->Communicator);
      ZOLTAN_TIMER_START(zz->ZTime, timer->match, hgc->Communicator);
    }
    if (vcycle_timing) {
      if (vcycle->timer_match < 0) {
        char str[80];
        sprintf(str, "VC Matching %d", hg->info);
        vcycle->timer_match = Zoltan_Timer_Init(vcycle->timer, 0, str);
      }
      ZOLTAN_TIMER_START(vcycle->timer, vcycle->timer_match,
                         hgc->Communicator);
    }

    /* Allocate and initialize Matching Array */
    if (hg->nVtx && !(match = (ZOLTAN_GNO_TYPE *) ZOLTAN_MALLOC (hg->nVtx*sizeof(ZOLTAN_GNO_TYPE)))) {
      ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Insufficient memory: Matching array");
      ZOLTAN_TRACE_EXIT(zz, yo);
      return ZOLTAN_MEMERR;
    }
    for (i = 0; i < hg->nVtx; i++)
      match[i] = i;            /* every vertex starts matched to itself */

    /* Calculate matching (packing or grouping) */
    err = Zoltan_PHG_Matching (zz, hg, match, hgp);
    if (err != ZOLTAN_OK && err != ZOLTAN_WARN) {
      ZOLTAN_FREE (&match);
      goto End;
    }
    if (vcycle_timing) {
      ZOLTAN_TIMER_STOP(vcycle->timer, vcycle->timer_match,
                        hgc->Communicator);
    }

    if (do_timing) {
      ZOLTAN_TIMER_STOP(zz->ZTime, timer->match, hgc->Communicator);
      ZOLTAN_TIMER_START(zz->ZTime, timer->coarse, hgc->Communicator);
    }
    if (vcycle_timing) {
      if (vcycle->timer_coarse < 0) {
        char str[80];
        sprintf(str, "VC Coarsening %d", hg->info);
        vcycle->timer_coarse = Zoltan_Timer_Init(vcycle->timer, 0, str);
      }
      ZOLTAN_TIMER_START(vcycle->timer, vcycle->timer_coarse,
                         hgc->Communicator);
    }

    if (!(coarser = newVCycle(zz, NULL, NULL, vcycle, vcycle_timing))) {
      ZOLTAN_FREE (&match);
      ZOLTAN_PRINT_ERROR (zz->Proc, yo, "coarser is NULL.");
      /* err still holds the matching status (OK/WARN); report the failure. */
      err = ZOLTAN_MEMERR;
      goto End;
    }

    /* Construct coarse hypergraph and LevelMap */
    err = Zoltan_PHG_Coarsening (zz, hg, match, coarser->hg, vcycle->LevelMap,
                                 &vcycle->LevelCnt, &vcycle->LevelSndCnt,
                                 &vcycle->LevelData, &vcycle->comm_plan, hgp);
    if (err != ZOLTAN_OK && err != ZOLTAN_WARN) {
      ZOLTAN_FREE (&match);            /* do not leak the matching array */
      goto End;
    }

    if (vcycle_timing) {
      ZOLTAN_TIMER_STOP(vcycle->timer, vcycle->timer_coarse,
                        hgc->Communicator);
    }
    if (do_timing) {
      ZOLTAN_TIMER_STOP(zz->ZTime, timer->coarse, hgc->Communicator);
      ZOLTAN_TIMER_START(zz->ZTime, timer->vcycle, hgc->Communicator);
    }

    ZOLTAN_FREE (&match);

    if ((err=allocVCycle(coarser))!= ZOLTAN_OK)
      goto End;

    /* Descend: the coarser level becomes the current one. */
    vcycle = coarser;
    hg = vcycle->hg;

    if (hgc->nProc > 1 && hgp->ProRedL > 0) {
      local_nPins = (ZOLTAN_GNO_TYPE)hg->nPins;
      MPI_Allreduce(&local_nPins, &tot_nPins, 1, zoltan_gno_mpi_type, MPI_SUM,
                    hgc->Communicator);

      if (tot_nPins < (ZOLTAN_GNO_TYPE)(hgp->ProRedL * origVpincnt + 0.5)) {
        if (do_timing) {
          ZOLTAN_TIMER_STOP(zz->ZTime, timer->vcycle, hgc->Communicator);
          ZOLTAN_TIMER_START(zz->ZTime, timer->procred, hgc->Communicator);
        }
        /* redistribute to half the processors */
        /* NOTE(review): the target range is actually 0..middle, with
           middle derived from ProRedL below. */
        origVpincnt = tot_nPins; /* update for processor reduction test */

        if(hg->nVtx&&!(hg->vmap=(int*)ZOLTAN_MALLOC(hg->nVtx*sizeof(int)))) {
          ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Insufficient memory: hg->vmap");
          ZOLTAN_TRACE_EXIT(zz, yo);
          return ZOLTAN_MEMERR;
        }
        for (i = 0; i < hg->nVtx; i++)
          hg->vmap[i] = i;

        middle = (int)((float) (hgc->nProc-1) * hgp->ProRedL);

        if (hgp->nProc_x_req!=1&&hgp->nProc_y_req!=1) { /* Want 2D decomp */
          if ((middle+1) > SMALL_PRIME && Zoltan_PHG_isPrime(middle+1))
            --middle; /* if it was prime just use one less #procs (since
                         it should be bigger than SMALL_PRIME it is safe to
                         decrement) */
        }

        /* Allocate the reduced communicator into a temporary so that hgc
           stays valid if the following allocation fails: the End label
           dereferences hgc->Communicator. */
        reduced_comm = (PHGComm*) ZOLTAN_MALLOC (sizeof(PHGComm));
        if (!reduced_comm) {
          ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Insufficient memory: PHGComm");
          ZOLTAN_TRACE_EXIT(zz, yo);
          return ZOLTAN_MEMERR;
        }

        if (!(redistributed=newVCycle(zz,NULL,NULL,vcycle,vcycle_timing))) {
          ZOLTAN_FREE (&reduced_comm);
          ZOLTAN_PRINT_ERROR (zz->Proc, yo, "redistributed is NULL.");
          /* err holds ZOLTAN_OK from allocVCycle; report the failure. */
          err = ZOLTAN_MEMERR;
          goto End;
        }
        hgc = reduced_comm;

        Zoltan_PHG_Redistribute(zz,hgp,hg,0,middle,hgc, redistributed->hg,
                                &vcycle->vlno,&vcycle->vdest);
        if (hgp->UseFixedVtx || hgp->UsePrefPart)
          redistributed->hg->bisec_split = hg->bisec_split;

        if ((err=allocVCycle(redistributed))!= ZOLTAN_OK)
          goto End;
        vcycle = redistributed;

        if (hgc->myProc < 0) {
          /* I'm not in the redistributed part so I should go to
             uncoarsening refinement and wait */
          if (fine_timing) {
            if (timer->cpgather < 0)
              timer->cpgather = Zoltan_Timer_Init(zz->ZTime, 1, "CP Gather");
            if (timer->cprefine < 0)
              timer->cprefine =Zoltan_Timer_Init(zz->ZTime, 0, "CP Refine");
            if (timer->cpart < 0)
              timer->cpart = Zoltan_Timer_Init(zz->ZTime, 0, "CP Part");
          }
          if (do_timing) {
            ZOLTAN_TIMER_STOP(zz->ZTime, timer->procred, hgc->Communicator);
            ZOLTAN_TIMER_START(zz->ZTime, timer->vcycle, hgc->Communicator);
          }
          goto Refine;
        }

        hg = vcycle->hg;
        hg->redl = hgp->redl; /* not set with hg creation */
        if (do_timing) {
          ZOLTAN_TIMER_STOP(zz->ZTime, timer->procred, hgc->Communicator);
          ZOLTAN_TIMER_START(zz->ZTime, timer->vcycle, hgc->Communicator);
        }
      }
    }
  }

  if (hgp->output_level >= PHG_DEBUG_LIST) {
    uprintf(hgc, "START %3d |V|=%6d |E|=%6d #pins=%6d %d/%s/%s/%s p=%d...\n",
            hg->info, hg->nVtx, hg->nEdge, hg->nPins, hg->redl,
            hgp->redm_str, hgp->coarsepartition_str, hgp->refinement_str, p);
    if (hgp->output_level > PHG_DEBUG_LIST) {
      err = Zoltan_HG_Info(zz, hg);
      if (err != ZOLTAN_OK && err != ZOLTAN_WARN)
        goto End;
    }
  }
  if (hgp->output_level >= PHG_DEBUG_PLOT)
    Zoltan_PHG_Plot(zz->Proc, hg->nVtx, p, hg->vindex, hg->vedge, NULL,
                    "coarsening plot");

  /* free array that may have been allocated in matching */
  if (hgp->vtx_scal) {
    hgp->vtx_scal_size = 0;
    ZOLTAN_FREE(&(hgp->vtx_scal));
  }

  if (do_timing) {
    ZOLTAN_TIMER_STOP(zz->ZTime, timer->vcycle, hgc->Communicator);
    ZOLTAN_TIMER_START(zz->ZTime, timer->coarsepart, hgc->Communicator);
  }

  /****** Coarse Partitioning ******/
  err = Zoltan_PHG_CoarsePartition (zz, hg, p, part_sizes, vcycle->Part, hgp);
  if (err != ZOLTAN_OK && err != ZOLTAN_WARN)
    goto End;

  if (do_timing) {
    ZOLTAN_TIMER_STOP(zz->ZTime, timer->coarsepart, hgc->Communicator);
    ZOLTAN_TIMER_START(zz->ZTime, timer->vcycle, hgc->Communicator);
  }

Refine:
  /* Remember the coarsest level; cleanup at End walks up from here. */
  del = vcycle;
  refine = 1;

  /****** Uncoarsening/Refinement ******/
  while (vcycle) {
    VCycle *finer = vcycle->finer;
    hg = vcycle->hg;

    /* Refinement is skipped on a level that was only a redistribution
       target (refine == 0) and on processors outside the current
       communicator (myProc < 0). */
    if (refine && hgc->myProc >= 0) {
      if (do_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->vcycle, hgc->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->refine, hgc->Communicator);
      }
      if (vcycle_timing) {
        if (vcycle->timer_refine < 0) {
          char str[80];
          sprintf(str, "VC Refinement %d", hg->info);
          vcycle->timer_refine = Zoltan_Timer_Init(vcycle->timer, 0, str);
        }
        ZOLTAN_TIMER_START(vcycle->timer, vcycle->timer_refine,
                           hgc->Communicator);
      }

      /* NOTE(review): the refinement status is stored in err but not
         tested here. */
      err = Zoltan_PHG_Refinement (zz, hg, p, part_sizes, vcycle->Part, hgp);

      if (do_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->refine, hgc->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->vcycle, hgc->Communicator);
      }
      if (vcycle_timing) {
        ZOLTAN_TIMER_STOP(vcycle->timer, vcycle->timer_refine,
                          hgc->Communicator);
      }

      if (hgp->output_level >= PHG_DEBUG_LIST)
        uprintf(hgc, "FINAL %3d |V|=%6d |E|=%6d #pins=%6d %d/%s/%s/%s p=%d bal=%.2f cutl=%.2f\n",
                hg->info, hg->nVtx, hg->nEdge, hg->nPins, hg->redl,
                hgp->redm_str, hgp->coarsepartition_str, hgp->refinement_str, p,
                Zoltan_PHG_Compute_Balance(zz, hg, part_sizes, 0, p, vcycle->Part),
                Zoltan_PHG_Compute_ConCut(hgc, hg, vcycle->Part, p, &err));

      if (hgp->output_level >= PHG_DEBUG_PLOT)
        Zoltan_PHG_Plot(zz->Proc, hg->nVtx, p, hg->vindex, hg->vedge,
                        vcycle->Part, "partitioned plot");
    }

    if (finer) {
      int *rbuffer;

      /* Project coarse partition to fine partition */
      if (finer->comm_plan) {
        refine = 1;

        if (do_timing) {
          ZOLTAN_TIMER_STOP(zz->ZTime, timer->vcycle, hgc->Communicator);
          ZOLTAN_TIMER_START(zz->ZTime, timer->project, hgc->Communicator);
        }
        if (vcycle_timing) {
          if (vcycle->timer_project < 0) {
            char str[80];
            sprintf(str, "VC Project Up %d", hg->info);
            vcycle->timer_project = Zoltan_Timer_Init(vcycle->timer, 0, str);
          }
          ZOLTAN_TIMER_START(vcycle->timer, vcycle->timer_project,
                             hgc->Communicator);
        }

        /* easy to assign partitions to internal matches */
        for (i = 0; i < finer->hg->nVtx; i++)
          if (finer->LevelMap[i] >= 0)   /* if considers only the local vertices */
            finer->Part[i] = vcycle->Part[finer->LevelMap[i]];

        /* now that the coarse partition assignments have been propagated */
        /* upward to the finer level for the local vertices, we need to  */
        /* fill the LevelData (matched pairs of a local vertex with a    */
        /* off processor vertex) with the partition assignment of the    */
        /* local vertex - can be done totally in the finer level!        */
        for (i = 0; i < finer->LevelCnt; i++) {
          ++i;         /* skip over off processor lno */
          finer->LevelData[i] = finer->Part[finer->LevelData[i]];
        }

        /* allocate rec buffer to exchange LevelData information */
        rbuffer = NULL;
        if (finer->LevelSndCnt > 0) {
          rbuffer = (int*) ZOLTAN_MALLOC (2 * finer->LevelSndCnt * sizeof(int));
          if (!rbuffer) {
            ZOLTAN_PRINT_ERROR (zz->Proc, yo, "Insufficient memory.");
            ZOLTAN_TRACE_EXIT(zz, yo);
            return ZOLTAN_MEMERR;
          }
        }

        /* get partition assignments from owners of externally matched vtxs */
        Zoltan_Comm_Resize (finer->comm_plan, NULL, COMM_TAG, &i);
        Zoltan_Comm_Do_Reverse (finer->comm_plan, COMM_TAG+1,
                                (char*) finer->LevelData, 2 * sizeof(int), NULL,
                                (char*) rbuffer);

        /* process data to assign partitions to external matches */
        for (i = 0; i < 2 * finer->LevelSndCnt;) {
          int lno, partition;
          lno       = rbuffer[i++];
          partition = rbuffer[i++];
          finer->Part[lno] = partition;
        }

        ZOLTAN_FREE (&rbuffer);
        Zoltan_Comm_Destroy (&finer->comm_plan);

        if (do_timing) {
          ZOLTAN_TIMER_STOP(zz->ZTime, timer->project, hgc->Communicator);
          ZOLTAN_TIMER_START(zz->ZTime, timer->vcycle, hgc->Communicator);
        }
        if (vcycle_timing) {
          ZOLTAN_TIMER_STOP(vcycle->timer, vcycle->timer_project,
                            hgc->Communicator);
        }
      } else {
        /* No matching plan on the finer level: this level was produced by
           a processor redistribution, so send the partition back. */
        int *sendbuf = NULL, size;
        refine = 0;

        /* ints local and partition numbers */
        if (finer->vlno) {
          sendbuf = (int*) ZOLTAN_MALLOC (2 * hg->nVtx * sizeof(int));
          if (!sendbuf) {
            ZOLTAN_PRINT_ERROR (zz->Proc, yo, "Insufficient memory.");
            ZOLTAN_TRACE_EXIT(zz, yo);
            return ZOLTAN_MEMERR;
          }

          for (i = 0; i < hg->nVtx; ++i) {
            sendbuf[2 * i] = finer->vlno[i];     /* assign local numbers */
            sendbuf[2 * i + 1] = vcycle->Part[i];/* assign partition numbers */
          }
        }

        ZOLTAN_FREE (&hgc);
        hgc = finer->hg->comm; /* updating hgc is required when the processors
                                  change */

        /* Create comm plan to unredistributed processors */
        err = Zoltan_Comm_Create(&finer->comm_plan, finer->vlno ? hg->nVtx : 0,
                                 finer->vdest, hgc->Communicator, COMM_TAG+2,
                                 &size);
        if (err != ZOLTAN_OK && err != ZOLTAN_WARN) {
          ZOLTAN_PRINT_ERROR(hgc->myProc, yo, "Zoltan_Comm_Create failed.");
          ZOLTAN_FREE (&sendbuf);          /* do not leak the send buffer */
          goto End;
        }

        /* allocate rec buffer to exchange sendbuf information */
        rbuffer = NULL;
        if (finer->hg->nVtx) {
          rbuffer = (int*) ZOLTAN_MALLOC (2 * finer->hg->nVtx * sizeof(int));

          if (!rbuffer) {
            ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Insufficient memory.");
            ZOLTAN_FREE (&sendbuf);        /* do not leak the send buffer */
            ZOLTAN_TRACE_EXIT(zz, yo);
            return ZOLTAN_MEMERR;
          }
        }

        /* Use plan to send partitions to the unredistributed processors */
        Zoltan_Comm_Do(finer->comm_plan, COMM_TAG+3, (char *) sendbuf,
                       2*sizeof(int), (char *) rbuffer);

        MPI_Bcast(rbuffer, 2*finer->hg->nVtx, MPI_INT, 0, hgc->col_comm);

        /* process data to assign partitions to unredistributed processors */
        for (i = 0; i < 2 * finer->hg->nVtx;) {
          int lno, partition;
          lno       = rbuffer[i++];
          partition = rbuffer[i++];
          finer->Part[lno] = partition;
        }

        if (finer->vlno)
          ZOLTAN_FREE (&sendbuf);

        ZOLTAN_FREE (&rbuffer);
        Zoltan_Comm_Destroy (&finer->comm_plan);
      }
    }

    vcycle = finer;
  }       /* while (vcycle) */

End:
  /* Walk from the coarsest recorded level up to the top, freeing each.
     The set of arrays a level owns depends on how it was created. */
  vcycle = del;
  while (vcycle) {
    if (vcycle_timing) {
      Zoltan_Timer_PrintAll(vcycle->timer, 0, hgc->Communicator, stdout);
      Zoltan_Timer_Destroy(&vcycle->timer);
    }
    if (vcycle->finer) {   /* cleanup by level */
      Zoltan_HG_HGraph_Free (vcycle->hg);

      if (vcycle->LevelData)
        Zoltan_Multifree (__FILE__, __LINE__, 4, &vcycle->Part,
                          &vcycle->LevelMap, &vcycle->LevelData, &vcycle->hg);
      else if (vcycle->vlno)
        Zoltan_Multifree (__FILE__, __LINE__, 5, &vcycle->Part, &vcycle->vdest,
                          &vcycle->vlno, &vcycle->LevelMap, &vcycle->hg);
      else
        Zoltan_Multifree (__FILE__, __LINE__, 3, &vcycle->Part,
                          &vcycle->LevelMap, &vcycle->hg);
    }
    else                   /* cleanup top level */
      Zoltan_Multifree (__FILE__, __LINE__, 2, &vcycle->LevelMap,
                        &vcycle->LevelData);
    del = vcycle;
    vcycle = vcycle->finer;
    ZOLTAN_FREE(&del);
  }

  /* Restore the geometric matching method captured before coarsening. */
  if (reset_geometric_matching) {
    strcpy(hgp->redm_str, reset_geometric_string);
    Zoltan_PHG_Set_Matching_Fn(hgp);
  }

  if (do_timing)
    ZOLTAN_TIMER_STOP(zz->ZTime, timer->vcycle, hgc->Communicator);
  ZOLTAN_TRACE_EXIT(zz, yo) ;
  return err;
}
/*
 * Zoltan_PHG_Gather_To_All_Procs
 *
 * Gathers the distributed hypergraph phg onto every processor so that the
 * coarsest-level partitioning can be done serially.
 *
 *   zz           Zoltan structure (used here for zz->Proc in messages).
 *   phg          Input:  local part of the distributed hypergraph.
 *   hgp          Input:  hypergraph parameters (UseFixedVtx, UsePrefPart).
 *   scomm        Input:  serial PHGComm stored in the gathered hypergraph.
 *   gathered_hg  Output: newly allocated gathered hypergraph; set to NULL
 *                        when a negative error code is returned.
 *
 * Returns ZOLTAN_OK on success, ZOLTAN_FATAL if called when
 * phg->comm->nProc == 1, or the code set by MEMORY_ERROR on allocation
 * failure (MEMORY_ERROR is defined elsewhere; it is expected to set ierr
 * and jump to End).
 *
 * Memory ownership of the column arrays (col_*):
 *   - nProc_y == 1: they alias phg's arrays and are never freed here;
 *   - nProc_y > 1 and nProc_x == 1: they are allocated here and handed
 *     over to the gathered hypergraph;
 *   - nProc_y > 1 and nProc_x > 1: they are temporaries freed at End.
 */
int Zoltan_PHG_Gather_To_All_Procs(
  ZZ *zz,
  HGraph *phg,           /* Input:  Local part of distributed hypergraph */
  PHGPartParams *hgp,    /* Input:  Hypergraph parameters */
  PHGComm *scomm,        /* Input:  Serial PHGComm for use by shg. */
  HGraph **gathered_hg   /* Output: combined hypergraph combined to proc */
)
{
/*
 * Function to gather distributed hypergraph onto each processor for
 * coarsest partitioning.
 * First hypergraph arrays for the hypergraph on a column of processors
 * are built using MPI_Allgathers down the processor columns.
 * These hypergraph arrays contain complete info about a subset of vertices.
 * Second the column hypergraphs are gathered along processor rows.
 * Each processor then has a complete description of the hypergraph.
 */
char *yo = "Zoltan_PHG_Gather_To_All_Procs";
int ierr = ZOLTAN_OK;
int i, tmp, sum;
int *each = NULL,
    *disp = NULL;       /* Size and displacement arrays for MPI_Allgatherv */
int *send_buf = NULL;   /* Buffer of values to be sent */
int send_size;          /* Size of buffer send_buf */
int *col_vedge = NULL;  /* vedge array for the proc-column hypergraph */
int *col_vindex = NULL; /* vindex array for the proc-column hypergraph */
int *col_hvertex = NULL;/* hvertex array for the proc-column hypergraph */
int *col_hindex = NULL; /* hindex array for the proc-column hypergraph */
int col_nVtx;           /* Number of vertices in processor column */
int col_nEdge;          /* Number of edges in processor column */
int col_nPin;           /* Number of pins in processor column */
int col_owned = 0;      /* Nonzero while the col_* arrays are allocated by
                           this routine and not yet handed over to shg */
int *recv_size = NULL;  /* nPins for each proc in col or row */

HGraph *shg;            /* Pointer to the serial hypergraph to be
                           returned by this function. */

int myProc_x = phg->comm->myProc_x;
int nProc_x = phg->comm->nProc_x;
int nProc_y = phg->comm->nProc_y;
int max_nProc_xy = MAX(nProc_x, nProc_y);

  if (phg->comm->nProc == 1) {
    ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Do not call this routine on one proc.");
    return ZOLTAN_FATAL;
  }

#ifdef KDDKDD_CHECK
  Zoltan_HG_Print(zz, phg, NULL, stdout, "GatherBefore");/* NULL parts for now;
                                                           add non-NULL later */
#endif

  /******************************************************************
   *  0. Allocate the hypergraph to be returned.
   *  Set values that we already know.
   ******************************************************************/

  shg = *gathered_hg = (HGraph *) ZOLTAN_MALLOC(sizeof(HGraph));
  if (!shg) MEMORY_ERROR;

  Zoltan_HG_HGraph_Init(shg);
  shg->nVtx = phg->dist_x[nProc_x];  /* TODO64 - can this exceed 2B? */
  shg->nEdge = phg->dist_y[nProc_y];

  shg->dist_x = (ZOLTAN_GNO_TYPE *) ZOLTAN_MALLOC(2 * sizeof(ZOLTAN_GNO_TYPE));
  shg->dist_y = (ZOLTAN_GNO_TYPE *) ZOLTAN_MALLOC(2 * sizeof(ZOLTAN_GNO_TYPE));
  if (!shg->dist_x || !shg->dist_y) MEMORY_ERROR;

  shg->dist_x[0] = shg->dist_y[0] = 0;
  shg->dist_x[1] = shg->nVtx;
  shg->dist_y[1] = shg->nEdge;

  shg->comm = scomm;

  shg->EdgeWeightDim = phg->EdgeWeightDim;
  shg->VtxWeightDim = phg->VtxWeightDim;
  if (shg->VtxWeightDim && shg->nVtx)
    shg->vwgt = (float *) ZOLTAN_MALLOC(shg->nVtx * shg->VtxWeightDim
                                                  * sizeof(float));
  if (shg->EdgeWeightDim && shg->nEdge)
    shg->ewgt = (float *) ZOLTAN_MALLOC(shg->nEdge * shg->EdgeWeightDim
                                                   * sizeof(float));

  /* Fixed vertices */
  shg->bisec_split = phg->bisec_split;
  if (hgp->UseFixedVtx)
    shg->fixed_part = (int *) ZOLTAN_MALLOC(shg->nVtx * sizeof(int));
  if (hgp->UsePrefPart)
    shg->pref_part = (int *) ZOLTAN_MALLOC(shg->nVtx * sizeof(int));

  /* Allocate arrays for use in gather operations */
  recv_size = (int *) ZOLTAN_MALLOC(3 * max_nProc_xy * sizeof(int));

  /* TODO64 - phg->dist_y[nProc_y] could exceed 2 Billion, NO? */

  send_size = MAX(phg->dist_x[myProc_x+1] - phg->dist_x[myProc_x],
                  phg->dist_y[nProc_y]);
  send_buf = (int *) ZOLTAN_MALLOC(send_size * sizeof(int));

  /* Check every allocation made above, including the fixed/preferred part
   * arrays, which are written unconditionally further down. */
  if ((shg->VtxWeightDim && shg->nVtx && !shg->vwgt) ||
      (shg->EdgeWeightDim && shg->nEdge && !shg->ewgt) ||
      (hgp->UseFixedVtx && shg->nVtx && !shg->fixed_part) ||
      (hgp->UsePrefPart && shg->nVtx && !shg->pref_part) ||
      !recv_size || (send_size && !send_buf))
    MEMORY_ERROR;

  /* recv_size, each and disp share one allocation of 3*max_nProc_xy ints;
   * derive the sub-arrays only after recv_size is known to be non-NULL. */
  each = recv_size + max_nProc_xy;
  disp = each + max_nProc_xy;

  /*************************************************************
   *  1. Gather all non-zeros for vertices in processor column *
   *************************************************************/

  if (nProc_y == 1) {
    /*
     * Don't need a gather; just set pointers appropriately for row-gather
     * in Step 2 below.
     */
    col_nVtx = phg->nVtx;
    col_nEdge = phg->nEdge;
    col_nPin = phg->nPins;
    col_vindex = phg->vindex;
    col_vedge = phg->vedge;
    col_hindex = phg->hindex;
    col_hvertex = phg->hvertex;

    for (i = 0; i < shg->EdgeWeightDim * shg->nEdge; i++)
      shg->ewgt[i] = phg->ewgt[i];
  }
  else {
    /* Gather local size info for each proc in column */

    MPI_Allgather(&(phg->nPins), 1, MPI_INT, recv_size, 1, MPI_INT,
                  phg->comm->col_comm);

    /* Compute number of vtx, edge, and nnz in column */
    col_nVtx = (int)(phg->dist_x[myProc_x+1] - phg->dist_x[myProc_x]);
    col_nEdge = phg->dist_y[nProc_y];  /* SCHEMEA */
    col_nPin = 0;
    for (i = 0; i < nProc_y; i++) {
      col_nPin += recv_size[i];
    }

    /* Allocate arrays for column hypergraph.  From here on this routine
     * owns the col_* arrays until they are freed or handed to shg. */
    col_owned = 1;
    col_hindex = (int *) ZOLTAN_CALLOC((col_nEdge+1), sizeof(int));
    col_hvertex = (int *) ZOLTAN_MALLOC(col_nPin * sizeof(int));

    col_vindex = (int *) ZOLTAN_CALLOC((col_nVtx+1), sizeof(int));
    col_vedge = (int *) ZOLTAN_MALLOC(col_nPin * sizeof(int));

    if (!col_vindex || !col_hindex ||
        (col_nPin && (!col_vedge || !col_hvertex)))
      MEMORY_ERROR;

    /* Gather hvertex data for all procs in column */
    /* SCHEMEA uses same vertex LNO on each proc in column. */
    /* SCHEMEB would require conversion from vertex LNO to GNO here. */

    disp[0] = 0;
    for (i = 1; i < nProc_y; i++)
      disp[i] = disp[i-1] + recv_size[i-1];

    MPI_Allgatherv(phg->hvertex, phg->nPins, MPI_INT,
                   col_hvertex, recv_size, disp, MPI_INT, phg->comm->col_comm);

    /* SCHEMEA uses same vertex LNO on each proc in column. */
    /* SCHEMEB would require conversion from vertex GNO to LNO here */

    /* Gather hindex data for all procs in column */

    for (i = 0; i < phg->nEdge; i++)
      send_buf[i] = phg->hindex[i+1] - phg->hindex[i];

    /* SCHEMEA can assume a recv for each edge;
     * SCHEMEB needs to gather the number of edges recv'd from each proc. */

    for (i = 0; i < nProc_y; i++)
      each[i] = phg->dist_y[i+1] - phg->dist_y[i];

    disp[0] = 0;  /* Can't use dist_y because it may not be sizeof(int) */
    for (i = 1; i < nProc_y; i++) {
      disp[i] = disp[i-1] + each[i-1];
    }

    /* SCHEMEA can use phg->dist_y for displacement array.
     * SCHEMEB requires separate displacement array. */

    MPI_Allgatherv(send_buf, phg->nEdge, MPI_INT,
                   col_hindex, each, disp, MPI_INT, phg->comm->col_comm);

    /* Perform prefix sum on col_hindex */
    sum = 0;
    for (i = 0; i < col_nEdge; i++) {
      tmp = col_hindex[i];
      col_hindex[i] = sum;
      sum += tmp;
    }
    col_hindex[col_nEdge] = sum;

    /* Sanity check */
    if (col_hindex[col_nEdge] != col_nPin) {
      printf("%d Sanity check failed:  "
             "col_hindex[col_nEdge] %d != col_nPin %d\n",
              zz->Proc, col_hindex[col_nEdge], col_nPin);
      exit(-1);
    }

    /* Gather edge weights, if any. */
    if (shg->EdgeWeightDim) {

      /* Can use nearly the same each array. */
      /* Need to compute new disp array. */

      disp[0] = 0;
      each[0] *= phg->EdgeWeightDim;
      for (i = 1; i < nProc_y; i++) {
        each[i] *= phg->EdgeWeightDim;
        disp[i] = disp[i-1] + each[i-1];
      }

      MPI_Allgatherv(phg->ewgt, phg->nEdge*phg->EdgeWeightDim, MPI_FLOAT,
                     shg->ewgt, each, disp, MPI_FLOAT, phg->comm->col_comm);
    }

    Zoltan_HG_Mirror(col_nEdge, col_hindex, col_hvertex,
                     col_nVtx, col_vindex, col_vedge);

  }  /* End column-gather */

  /*************************************************************
   *  2. Gather all non-zeros for edges in processor rows      *
   *  All processors in a processor column now have the same   *
   *  hypergraph; we now gather it across rows.                *
   *************************************************************/

  if (nProc_x == 1) {
    /*
     * Don't need a gather across the row; just set pointers appropriately
     * in shg.
     */
    shg->nPins = col_nPin;   /* the column graph already is the whole graph */
    shg->vindex = col_vindex;
    shg->vedge = col_vedge;
    shg->hindex = col_hindex;
    shg->hvertex = col_hvertex;
    col_owned = 0;           /* shg now references these arrays; do not free */

    /* Copy vwgt and fixed arrays so shg owns this memory */
    for (i = 0; i < shg->VtxWeightDim*shg->nVtx; i++)
      shg->vwgt[i] = phg->vwgt[i];
    if (hgp->UseFixedVtx)
      for (i = 0; i < shg->nVtx; i++)
        shg->fixed_part[i] = phg->fixed_part[i];
    if (hgp->UsePrefPart)
      for (i = 0; i < shg->nVtx; i++)
        shg->pref_part[i] = phg->pref_part[i];
  }
  else {

    /* Gather info about size within the row */

    MPI_Allgather(&col_nPin, 1, MPI_INT, recv_size, 1, MPI_INT,
                  phg->comm->row_comm);

    tmp = 0;
    for (i = 0; i < nProc_x; i++)
      tmp += recv_size[i];
    shg->nPins = tmp;

    shg->vindex = (int *) ZOLTAN_CALLOC((shg->nVtx+1), sizeof(int));
    shg->vedge = (int *) ZOLTAN_MALLOC(shg->nPins * sizeof(int));
    shg->hindex = (int *) ZOLTAN_CALLOC((shg->nEdge+1), sizeof(int));
    shg->hvertex = (int *) ZOLTAN_MALLOC(shg->nPins * sizeof(int));

    if (!shg->vindex || !shg->hindex ||
        (shg->nPins && (!shg->vedge || !shg->hvertex)))
      MEMORY_ERROR;

    /* Gather vedge data for all procs in row */
    /* SCHEMEA can send local edge numbers;
       SCHEMEB requires edge LNO to GNO conversion. */

    disp[0] = 0;
    for (i = 1; i < nProc_x; i++)
      disp[i] = disp[i-1] + recv_size[i-1];

    MPI_Allgatherv(col_vedge, col_nPin, MPI_INT,
                   shg->vedge, recv_size, disp, MPI_INT, phg->comm->row_comm);

    /* Gather vindex data for all procs in row */

    for (i = 0; i < col_nVtx; i++)
      send_buf[i] = col_vindex[i+1] - col_vindex[i];

    /* SCHEMEA can assume a recv for each vertex;
     * SCHEMEB would need to gather the number of vtxs recv'd from each proc. */

    for (i = 0; i < nProc_x; i++)
      each[i] = (int)(phg->dist_x[i+1] - phg->dist_x[i]);

    disp[0] = 0;  /* Can't use dist_x, may not be sizeof(int) */
    for (i = 1; i < nProc_x; i++)
      disp[i] = disp[i-1] + each[i-1];

    /* SCHEMEA can use phg->dist_x as displacement array;
     * SCHEMEB requires separate displacement array. */

    MPI_Allgatherv(send_buf, col_nVtx, MPI_INT,
                   shg->vindex, each, disp, MPI_INT, phg->comm->row_comm);

    /* Perform prefix sum on shg->vindex */
    sum = 0;
    for (i = 0; i < shg->nVtx; i++) {
      tmp = shg->vindex[i];
      shg->vindex[i] = sum;
      sum += tmp;
    }
    shg->vindex[shg->nVtx] = sum;

    /* Sanity check */
    if (shg->vindex[shg->nVtx] != shg->nPins) {
      printf("%d Sanity check failed:  "
             "shg->vindex %d != nPins %d\n",
              zz->Proc, shg->vindex[shg->nVtx], shg->nPins);
      exit(-1);
    }

    /* Gather fixed array, if any  */
    if (hgp->UseFixedVtx) {

#ifdef DEBUG_
      uprintf(phg->comm, "Debug in PHG_gather before gather. phg->fixed =");
      for (i=0; i<phg->nVtx; i++){
        printf(" %d ", phg->fixed_part[i]);
      }
      printf("\n");
#endif

      /* Can use the same each array. */
      /* Need to compute new disp array. */

      disp[0] = 0;
      for (i = 1; i < nProc_x; i++) {
        disp[i] = disp[i-1] + each[i-1];
      }

      /* fixed_part holds ints, so the MPI datatype must be MPI_INT. */
      MPI_Allgatherv(phg->fixed_part, phg->nVtx, MPI_INT,
                     shg->fixed_part, each, disp, MPI_INT,
                     phg->comm->row_comm);

#ifdef DEBUG_
      uprintf(phg->comm, "Debug in PHG_gather after gather. shg->fixed =");
      for (i=0; i<shg->nVtx; i++){
        printf(" %d ", shg->fixed_part[i]);
      }
      printf("\n");
#endif
    }

    /* Gather pref part array, if any  */
    if (hgp->UsePrefPart) {
      /* Can use the same each array. */
      /* Need to compute new disp array. */
      disp[0] = 0;
      for (i = 1; i < nProc_x; i++) {
        disp[i] = disp[i-1] + each[i-1];
      }

      /* pref_part holds ints, so the MPI datatype must be MPI_INT. */
      MPI_Allgatherv(phg->pref_part, phg->nVtx, MPI_INT,
                     shg->pref_part, each, disp, MPI_INT,
                     phg->comm->row_comm);
    }

    /* Gather vertex weights, if any. */
    if (shg->VtxWeightDim) {

      /* Can use nearly the same each array. */
      /* Need to compute new disp array. */

      disp[0] = 0;
      each[0] *= phg->VtxWeightDim;
      for (i = 1; i < nProc_x; i++) {
        each[i] *= phg->VtxWeightDim;
        disp[i] = disp[i-1] + each[i-1];
      }

      MPI_Allgatherv(phg->vwgt, phg->nVtx*phg->VtxWeightDim, MPI_FLOAT,
                     shg->vwgt, each, disp, MPI_FLOAT, phg->comm->row_comm);
    }

    Zoltan_HG_Mirror(shg->nVtx, shg->vindex, shg->vedge,
                     shg->nEdge, shg->hindex, shg->hvertex);

  }  /* End row gather */

#ifdef KDDKDD_CHECK
  Zoltan_HG_Print(zz, shg, NULL, stdout, "GatherAfter");/* NULL parts for now;
                                                          add non-NULL later */
  Zoltan_PHG_Plot_2D_Distrib(zz, phg);
  Zoltan_PHG_Plot_2D_Distrib(zz, shg);
#endif

End:

  if (ierr < 0) {
    /* On failure the caller gets no hypergraph back. */
    if (*gathered_hg)
      Zoltan_HG_HGraph_Free(*gathered_hg);
    ZOLTAN_FREE(gathered_hg);
  }

  /* each and disp live inside the recv_size allocation. */
  Zoltan_Multifree(__FILE__, __LINE__, 2, &send_buf,
                                          &recv_size);

  /* Free the column arrays only if they were allocated here and were not
   * handed over to the gathered hypergraph. */
  if (col_owned)
    Zoltan_Multifree(__FILE__, __LINE__, 4, &col_vedge,
                                            &col_vindex,
                                            &col_hvertex,
                                            &col_hindex);
  return ierr;
}
/*
 * Zoltan_PHG_CoarsePartition
 *
 * Computes a partition of the (distributed) coarse hypergraph phg into
 * numPart parts and stores it in part.
 *
 * Handled in order:
 *   - coarsepartition_str is "no"/"none": no partitioning; existing or
 *     preferred part numbers are mapped into range;
 *   - numPart == 1: every vertex goes to part 0;
 *   - no preferred parts and numPart >= global vertex count: each vertex
 *     gets its own part;
 *   - local coarse partitioning requested, or forced for large graphs;
 *   - otherwise: gather the hypergraph to every processor, compute several
 *     candidate partitions, and let pick_best select among them.
 *
 * Returns ZOLTAN_OK, ZOLTAN_WARN (out-of-range initial part numbers in the
 * "no"/"none" case), ZOLTAN_MEMERR, or a negative code propagated from a
 * called routine.  All temporaries allocated here are released at End on
 * every path, including error exits.
 */
int Zoltan_PHG_CoarsePartition(
  ZZ *zz,
  HGraph *phg,         /* Input:  coarse hypergraph -- distributed! */
  int numPart,         /* Input:  number of partitions to generate. */
  float *part_sizes,   /* Input:  array of size numPart listing target sizes
                                  (% of work) for the partitions */
  Partition part,      /* Input:  array of initial partition assignments.
                          Output: array of computed partition assignments. */
  PHGPartParams *hgp   /* Input:  parameters to use.  */
)
{
/*
 * Zoltan_PHG_CoarsePartition computes a partitioning of a hypergraph.
 * Typically, this routine is called at the bottom level in a
 * multilevel scheme (V-cycle).
 * It gathers the distributed hypergraph to each processor and computes
 * a decomposition of the serial hypergraph.
 * It computes a different partition on each processor
 * using different random numbers (and possibly also
 * different algorithms) and selects the best.
 */
char *yo = "Zoltan_PHG_CoarsePartition";
int ierr = ZOLTAN_OK;
int i, si, j;
static PHGComm scomm;          /* Serial communicator info */
static int first_time = 1;
HGraph *shg = NULL;            /* Serial hypergraph gathered from phg */
int *spart = NULL;             /* Partition vectors for shg. */
int *new_part = NULL;          /* Ptr to new partition vector. */
float *bestvals = NULL;        /* Best cut values found so far */
int worst, new_cand;
float bal, cut, worst_cut;
int fine_timing = (hgp->use_timers > 2);
struct phg_timer_indices *timer = Zoltan_PHG_LB_Data_timers(zz);
int local_coarse_part = hgp->LocalCoarsePartition;

/* Number of iterations to try coarse partitioning on each proc. */
/* 10 when p=1, and 1 when p is large. */
const int num_coarse_iter = 1 + 9/zz->Num_Proc;

  ZOLTAN_TRACE_ENTER(zz, yo);

  if (fine_timing) {
    if (timer->cpgather < 0)
      timer->cpgather = Zoltan_Timer_Init(zz->ZTime, 1, "CP Gather");
    if (timer->cprefine < 0)
      timer->cprefine = Zoltan_Timer_Init(zz->ZTime, 0, "CP Refine");
    if (timer->cpart < 0)
      timer->cpart = Zoltan_Timer_Init(zz->ZTime, 0, "CP Part");

    ZOLTAN_TIMER_START(zz->ZTime, timer->cpart, phg->comm->Communicator);
  }

  /* Force LocalCoarsePartition if large global graph */
#define LARGE_GRAPH_VTX   64000
#define LARGE_GRAPH_PINS 256000
  if (phg->dist_x[phg->comm->nProc_x] > LARGE_GRAPH_VTX){
    /* TODO: || (global_nPins > LARGE_GRAPH_PINS) */
    local_coarse_part = 1;
  }

  /* take care of all special cases first */

  if (!strcasecmp(hgp->coarsepartition_str, "no")
      || !strcasecmp(hgp->coarsepartition_str, "none")) {
    /* Do no coarse partitioning. */
    /* Do a sanity test and  mapping to parts [0,...,numPart-1] */
    int first = 1;
    PHGComm *hgc=phg->comm;

    Zoltan_Srand_Sync (Zoltan_Rand(NULL), &(hgc->RNGState_col),
                       hgc->col_comm);
    if (hgp->UsePrefPart) {
      for (i = 0; i < phg->nVtx; i++) {
        /* Impose fixed vertex/preferred part constraints. */
        if (phg->pref_part[i] < 0) {
          /* Free vertex in fixedvertex partitioning or repart */
          /* randomly assigned to a part */
          part[i] = Zoltan_Rand_InRange(&(hgc->RNGState_col), numPart);
        } else {
          if (phg->bisec_split < 0)
            /* direct k-way, use part numbers directly */
            part[i] = phg->pref_part[i];
          else
            /* recursive bisection, map to 0-1 part numbers */
            part[i] = (phg->pref_part[i] < phg->bisec_split ? 0 : 1);
        }
      }
    } else {
      for (i = 0; i < phg->nVtx; i++) {
        if (part[i] >= numPart || part[i]<0) {
          if (first) {
            ZOLTAN_PRINT_WARN(zz->Proc, yo, "Initial part number > numParts.");
            first = 0;
            ierr = ZOLTAN_WARN;
          }
          part[i] = ((part[i]<0) ? -part[i] : part[i]) % numPart;
        }
      }
    }
  }
  else if (numPart == 1) {
    /* everything goes in the one partition */
    for (i = 0; i < phg->nVtx; i++)
      part[i] = 0;
  }
  else if (!hgp->UsePrefPart && numPart >= phg->dist_x[phg->comm->nProc_x]) {
    /* more partitions than vertices, trivial answer */
    for (i = 0; i < phg->nVtx; i++)
      part[i] = phg->dist_x[phg->comm->myProc_x]+i;
  }
  else if (local_coarse_part) {
    /* Apply local partitioner to each column */
    ierr = local_coarse_partitioner(zz, phg, numPart, part_sizes, part, hgp,
                                    hgp->CoarsePartition);
  }
  else {
    /* Normal case:
     * Gather distributed HG to each processor;
     * compute different partitioning on each processor;
     * select the "best" result.
     */
    ZOLTAN_PHG_COARSEPARTITION_FN *CoarsePartition;

    /* Select different coarse partitioners for processors here. */

    CoarsePartition = hgp->CoarsePartition;
    if (CoarsePartition == NULL) { /* auto */
      /* Select a coarse partitioner from the array of coarse partitioners */
      CoarsePartition = CoarsePartitionFns[phg->comm->myProc %
                                           NUM_COARSEPARTITION_FNS];
    }

    if (phg->comm->nProc == 1) {
      /* Serial and parallel hgraph are the same. */
      shg = phg;
    }
    else {
      /* Set up a serial communication struct for gathered HG */
      if (first_time) {
        scomm.nProc_x = scomm.nProc_y = 1;
        scomm.myProc_x = scomm.myProc_y = 0;
        scomm.Communicator = MPI_COMM_SELF;
        scomm.row_comm = MPI_COMM_SELF;
        scomm.col_comm = MPI_COMM_SELF;
        scomm.myProc = 0;
        scomm.nProc = 1;
        first_time = 0;
      }
      scomm.RNGState = Zoltan_Rand(NULL);
      scomm.RNGState_row = Zoltan_Rand(NULL);
      scomm.RNGState_col = Zoltan_Rand(NULL);
      scomm.zz = zz;

      /*
       * Gather parallel hypergraph phg to each processor, creating
       * serial hypergraph shg.
       */
      if (fine_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->cpart, phg->comm->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->cpgather,
                           phg->comm->Communicator);
      }

      ierr = Zoltan_PHG_Gather_To_All_Procs(zz, phg, hgp, &scomm, &shg);
      if (ierr < 0) {
        ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Error returned from gather.");
        goto End;
      }

      if (fine_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->cpgather,
                          phg->comm->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->cpart, phg->comm->Communicator);
      }
    }

    /*
     * Allocate partition array spart for the serial hypergraph shg
     * and partition shg.
     */
    spart = (int *) ZOLTAN_CALLOC(shg->nVtx * (NUM_PART_KEEP+1),
                                  sizeof(int));
    /* bestvals is an array of floats: size it with sizeof(float). */
    bestvals = (float *) ZOLTAN_MALLOC((NUM_PART_KEEP+1)*sizeof(float));
    if ((!spart) || (!bestvals)) {
      ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Out of memory.");
      ierr = ZOLTAN_MEMERR;
      goto End;
    }

    /* Compute several coarse partitionings. */
    /* Keep the NUM_PART_KEEP best ones around. */
    /* Currently, only the best one is used. */

    /* Set RNG so different procs compute different parts. */
    Zoltan_Srand(Zoltan_Rand(NULL) + zz->Proc, NULL);

    new_cand = 0;
    new_part = spart;

    for (i=0; i< num_coarse_iter; i++){
      int savefmlooplimit=hgp->fm_loop_limit;

      /* Overwrite worst partition with new candidate. */
      ierr = CoarsePartition(zz, shg, numPart, part_sizes, new_part, hgp);
      if (ierr < 0) {
        ZOLTAN_PRINT_ERROR(zz->Proc, yo,
                           "Error returned from CoarsePartition.");
        goto End;
      }

      /* time refinement step in coarse partitioner */
      if (fine_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->cpart, phg->comm->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->cprefine,
                           phg->comm->Communicator);
      }

      /* UVCUVC: Refine new candidate: only one pass is enough. */
      hgp->fm_loop_limit = 1;
      Zoltan_PHG_Refinement(zz, shg, numPart, part_sizes, new_part, hgp);
      hgp->fm_loop_limit = savefmlooplimit;

      /* stop refinement timer */
      if (fine_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->cprefine,
                          phg->comm->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->cpart, phg->comm->Communicator);
      }

      /* Decide if candidate is in the top tier or not. */
      /* Our objective is a combination of cuts and balance */

      bal = Zoltan_PHG_Compute_Balance(zz, shg, part_sizes, 0,
                                       numPart, new_part);
      cut = Zoltan_PHG_Compute_ConCut(shg->comm, shg, new_part, numPart,
                                      &ierr);

      /* Use ratio-cut as our objective. There are many other options! */
      bestvals[new_cand] = cut/(MAX(2.-bal, 0.0001)); /* avoid divide-by-0 */

      if (ierr < 0) {
        ZOLTAN_PRINT_ERROR(zz->Proc, yo,
                           "Error returned from Zoltan_PHG_Compute_ConCut.");
        goto End;
      }
      if (i<NUM_PART_KEEP)
        new_cand = i+1;
      else {
        /* find worst partition vector, to overwrite it */
        /* future optimization: keep bestvals sorted */
        worst = 0;
        worst_cut = bestvals[0];
        for (j=1; j<NUM_PART_KEEP+1; j++){
          if (worst_cut < bestvals[j]){
            worst_cut = bestvals[j];
            worst = j;
          }
        }
        new_cand = worst;
      }
      new_part = spart+new_cand*(shg->nVtx);
    }
    /* Copy last partition vector such that all the best ones
       are contiguous starting at spart.                     */
    for (i=0; i<shg->nVtx; i++){
      new_part[i] = spart[NUM_PART_KEEP*(shg->nVtx)+i];
    }
    /* Also update bestvals */
    bestvals[new_cand] = bestvals[NUM_PART_KEEP];

    /* Evaluate and select the best. */
    /* For now, only pick the best one, in the future we pick the k best. */

    ierr = pick_best(zz, hgp, phg->comm, shg, numPart,
                     MIN(NUM_PART_KEEP, num_coarse_iter), spart,
                     bestvals);
    if (ierr < 0) {
      ZOLTAN_PRINT_ERROR(zz->Proc, yo,
                         "Error returned from pick_best.");
      goto End;
    }

    if (phg->comm->nProc > 1) {
      /* Map gathered partition back to 2D distribution */
      for (i = 0; i < phg->nVtx; i++) {
        /* KDDKDD  Assume vertices in serial HG are ordered by GNO of phg */
        si = VTX_LNO_TO_GNO(phg, i);
        part[i] = spart[si];
      }
    }
    else { /* single processor */
      for (i = 0; i < phg->nVtx; i++)
        part[i] = spart[i];
    }
  }

End:
  /* Release temporaries on every exit path.  The pointers start out NULL,
   * so this is also reached safely by the special-case branches above and
   * by early error exits. */
  ZOLTAN_FREE(&spart);
  ZOLTAN_FREE(&bestvals);
  if (shg != NULL && shg != phg) {
    /* shg was created by the gather; when nProc == 1 it aliases phg,
     * which belongs to the caller and must not be freed here. */
    Zoltan_HG_HGraph_Free(shg);
    ZOLTAN_FREE(&shg);
  }

  if (fine_timing)
    ZOLTAN_TIMER_STOP(zz->ZTime, timer->cpart, phg->comm->Communicator);

  ZOLTAN_TRACE_EXIT(zz, yo);
  return ierr;
}