Пример #1
0
int main(int argc, char *argv[])
{
    int rc, i, ngids, maxcol, ncolors;
    float ver;
    struct Zoltan_Struct *zz=NULL;
#ifdef ZOLTANV31
    int numGidEntries, numLidEntries;
#else
    ZOLTAN_GRAPH_EVAL graph;
#endif
    int *color;
    ZOLTAN_ID_PTR gid_list;
    UZData guz, *uz=&guz;
    int msg_tag = 9999;
    int nlvtx, next, maxdeg=0;
    double times[9]={0.,0.,0.,0.,0.,0.,0.,0.}; /* Used for timing measurements */
    double gtimes[9]={0.,0.,0.,0.,0.,0.,0.,0.}; /* Used for timing measurements */
    
    /******************************************************************
     ** Initialize MPI and Zoltan
     ******************************************************************/

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &uz->myRank);
    MPI_Comm_size(MPI_COMM_WORLD, &uz->numProcs);

    MPI_Barrier(MPI_COMM_WORLD);
    times[0] = u_wseconds();
    
    rc = Zoltan_Initialize(argc, argv, &ver);
    if (rc != ZOLTAN_OK){
        fprintf(stderr, "Sorry Zoltan initialize failed...\n");
        goto End;
    }
    zz = Zoltan_Create(MPI_COMM_WORLD);

    if (argc<3 && !uz->myRank) {
        fprintf(stderr, "usage: %s [meshR] [meshC] [X-point stencil] [procR] [procC] [ws-beta] [<ZoltanParam>=<Val>] ...\n\n", argv[0]);
        fprintf(stderr, "ws-beta: is the probablity of adding an edge to a vertex to generate Watts-Strogatz graphs\n");
        fprintf(stderr, "Valid values for Stencil are 5, 7 and 9\n");
        fprintf(stderr, "Zoltan Coloring Parameters and values are\n");
        fprintf(stderr, "\tDISTANCE        : 1 or 2\n");
        fprintf(stderr, "\tSUPERSTEP_SIZE  : suggested >= 100\n"); 
        fprintf(stderr, "\tCOMM_PATTERN    : S or A\n");
        fprintf(stderr, "\tCOLOR_ORDER     : I, B, U\n");
        fprintf(stderr, "\tCOLORING_METHOD : F (for now)\n");
        fprintf(stderr, "\n");
    }

    uz->procR = uz->procC = 0;
    uz->meshR = uz->meshC = 1024;
    uz->stencil = 9;

    if (argc>1)
        uz->meshR = atoi(argv[1]);
    if (argc>2)
        uz->meshC = atoi(argv[2]);
    if (argc>3)
        uz->stencil = atoi(argv[3]);
    if (uz->stencil!=5 && uz->stencil!=7 && uz->stencil!=9) {
        fprintf(stderr, "\t invalid stencil value. Valid values are 5, 7 and 9. Assumed 9.\n");
        uz->stencil = 9;
    }
    --uz->stencil;

    if (argc>4)
        uz->procR = atoi(argv[4]);
    if (argc>5)
        uz->procC = atoi(argv[5]);
    if (uz->procR <= 0 || uz->procC <= 0)
        computeProcMesh(uz);
    
    if (uz->procR*uz->procC!=uz->numProcs) {
        fprintf(stderr, "#Procs=%d but requested %dx%d Proc Mesh Partitioning...\n", uz->numProcs, uz->procR, uz->procC);
        goto End;
    }

    if (argc>6)
        uz->beta = atof(argv[6]);
    else
        uz->beta = 0.0;
    
    /* compute which part of mesh I will compute */
    uz->myR = uz->myRank / uz->procC;
    uz->myC = uz->myRank % uz->procC;

    uz->_sr = uz->myR * (uz->meshR / uz->procR);
    uz->_er = (uz->myR+1) * (uz->meshR / uz->procR);
    if (uz->_er>uz->meshR)
        uz->_er = uz->meshR;
    uz->_sc = uz->myC * (uz->meshC / uz->procC);
    uz->_ec = (uz->myC+1) * (uz->meshC / uz->procC);
    if (uz->_ec>uz->meshC)
        uz->_ec = uz->meshC;


    if ( (uz->meshR % uz->procR) !=0 || (uz->meshC % uz->procC)!=0) {
        printf("Mesh dimensions are not divisible with proc mesh.\nRequested mesh is %dx%d and proc mesh is %d x %d\n", uz->meshR, uz->meshC, uz->procR, uz->procC);
        exit(1);
    }
    nlvtx= (uz->_er-uz->_sr) * (uz->_ec-uz->_sc);

    if (uz->myRank==0)
        printf("Running %s on %d x %d processor mesh, generating %d-point %d x %d mesh with beta=%.3lf\n", argv[0], uz->procR, uz->procC, uz->stencil+1, uz->meshR, uz->meshC, uz->beta);

    times[1] = u_wseconds();    
    uz->numredge = 0;
    uz->redgeto = NULL;
    if (uz->beta>0) { /* create random edges for WS graph */
        int ngvtx=uz->meshC*uz->meshR, trsh=(int) (uz->beta*100.0);
        int ierr=0;
        int *edges=NULL, *redges=NULL, *proclist=NULL, nedge;
        ZOLTAN_COMM_OBJ *plan;
            
        uz->redgeto = (int *) malloc(nlvtx*sizeof(int));
        for (i=0; i<nlvtx; ++i) {
            int rv = Zoltan_Rand_InRange(NULL, 100);
            if ( rv < trsh) {
                if ((uz->redgeto[i] = Zoltan_Rand_InRange(NULL,  ngvtx))==gIDfLID(i)) /* is it a self edge */
                    uz->redgeto[i] = -1;
                else 
                    ++uz->numredge;
            } else
                uz->redgeto[i] = -1;
        }

        edges = (int *) malloc(sizeof(int)*2*uz->numredge);
        proclist = (int *) malloc(sizeof(int)*uz->numredge);
        next = 0;
        for (i=0; i<nlvtx; ++i)
            if (uz->redgeto[i]>0) {
                edges[2*next] = uz->redgeto[i];
                edges[2*next+1] = gIDfLID(i);
                proclist[next] = pIDfGID(uz->redgeto[i]);
                ++next;
            }

        ierr = Zoltan_Comm_Create(&plan, uz->numredge, proclist, MPI_COMM_WORLD,
                                  msg_tag, &nedge);
        
        redges = (int *) malloc(sizeof(int)*2*nedge);
        
        --msg_tag;
        ierr |= Zoltan_Comm_Do(plan, msg_tag, (char *) edges, 2*sizeof(int),
                               (char *) redges);
        ierr |= Zoltan_Comm_Destroy(&plan);
        free(proclist);
        free(edges);
                
        if (ierr) {
            printf("error while communicating edges!\n");
            exit(1);
        }
        xadj = (int *) calloc(1+nlvtx, sizeof(int));
        adj = (int *) malloc(sizeof(int)*nedge);
        for (i=0; i<nedge; ++i)  {
            if (redges[2*i] < gID(uz->_sr, uz->_sc) || redges[2*i] >= gID(uz->_er, uz->_ec)) {
                printf("[%d/%d] received gid=%d doesn't blong to processor range [%d, %d) should go to proc %d\n", uz->myRank, uz->numProcs, redges[2*i],  gID(uz->_sr, uz->_sc), gID(uz->_er, uz->_ec), pIDfGID(redges[2*i]));
            }
                
            ++xadj[lIDfGID(redges[2*i])];
        }
        xadj[nlvtx] = nedge;
        maxdeg = xadj[0];
        for (i=1; i<nlvtx; ++i) {
            maxdeg = xadj[i]>maxdeg ? xadj[i] : maxdeg;
            xadj[i] += xadj[i-1];
        }
        
        for (i=0; i<nedge; ++i) {
            int u = lIDfGID(redges[2*i]);
            int v = redges[2*i+1];
            adj[--xadj[u]] = v;
        }
        free(redges);                    
    }
    maxdeg += uz->stencil+1;
    adjTemp = (int *) malloc(sizeof(int)*2*maxdeg);
    times[2] = u_wseconds();
    
    /*
      printf("My rank %d/%d at proc-mesh loc (%d, %d) generating [%d, %d) x [%d, %d) + %d random edges TotEdge=%d\n", uz->myRank, uz->numProcs, uz->myR, uz->myC, uz->_sr, uz->_er, uz->_sc, uz->_ec, uz->numredge, xadj[nlvtx]);  */
    printStats("Number of Vertices  ", nlvtx, uz->myRank, uz->numProcs);
    if (xadj)
        printStats("Number of Rand Edges", xadj[nlvtx], uz->myRank, uz->numProcs);

    
    /* General parameters */
#ifndef ZOLTANV31
    Zoltan_Set_Param(zz, "GRAPH_BUILD_TYPE", "FAST_NO_DUP");
#endif

    /* General parameters */
    Zoltan_Set_Param(zz, "DEBUG_LEVEL", "3");
    Zoltan_Set_Param(zz, "NUM_GID_ENTRIES", "1"); 
    Zoltan_Set_Param(zz, "NUM_LID_ENTRIES", "1");
    Zoltan_Set_Param(zz, "OBJ_WEIGHT_DIM", "0");


    /* coloring parameters */
    Zoltan_Set_Param(zz, "SUPERSTEP_SIZE", "500"); /* let's make S=500 default */
    for (i=7; i<argc; ++i) {
        char param[256], *eq;

        if (!uz->myRank)
            printf("processing argv[%d]='%s'\n", i, argv[i]);
        strncpy(param, argv[i], sizeof(param));
        eq = strchr(param, '=');
        if (!eq) {
            fprintf(stderr, "invalid argument '%s', Zoltan Paramters should be in the format <ZoltanParam>=<Val>\n", param);
            goto End;
        }
        *eq = 0;
        Zoltan_Set_Param(zz, param, eq+1);
    }


#if 0    
    /* Graph parameters */
    Zoltan_Set_Param(zz, "CHECK_GRAPH", "2");
#endif

    /* set call backs */
    Zoltan_Set_Num_Obj_Fn(zz, get_number_of_objects, uz);
    Zoltan_Set_Obj_List_Fn(zz, get_object_list, uz);
    Zoltan_Set_Num_Edges_Multi_Fn(zz, get_num_edges_list, uz);
    Zoltan_Set_Edge_List_Multi_Fn(zz, get_edge_list, uz);

#if 0    
#ifndef ZOLTANV31
    Zoltan_LB_Eval_Graph(zz, 0, &graph);

    if (!uz->myRank) {
        printf("EdgeCut   Min=%8.0f  Max=%8.0f  Sum=%8.0f\n", graph.cuts[EVAL_GLOBAL_MIN], graph.cuts[EVAL_GLOBAL_MAX], graph.cuts[EVAL_GLOBAL_SUM]);
        printf("#Vertices Min=%8.0f  Max=%8.0f  Sum=%8.0f imbal=%.2f\n", graph.nobj[EVAL_GLOBAL_MIN], graph.nobj[EVAL_GLOBAL_MAX], graph.nobj[EVAL_GLOBAL_SUM], graph.obj_imbalance);        
    }
#endif
#endif

    /* now color */
    ngids = get_number_of_objects(uz, &rc);

    gid_list = (ZOLTAN_ID_PTR) malloc(sizeof(ZOLTAN_ID_TYPE) * ngids);
#ifndef ZOLTANV31
    next = 0;
    for (i=uz->_sr; i<uz->_er; ++i) {
        int j;
        for (j=uz->_sc; j<uz->_ec; ++j) {
            gid_list[next++] = i*uz->meshC + j;
        }
    }    
#endif
    color = (int *) malloc(sizeof(int) * ngids);    

    MPI_Barrier(MPI_COMM_WORLD);
    times[3] = u_wseconds();    
#ifdef ZOLTANV31
    rc = Zoltan_Color(zz, /* input (all remaining fields are output) */
                      &numGidEntries,  /* Number of integers used for a global ID */
                      &numLidEntries,  /* Number of integers used for a local ID */
                      ngids,           /* #objects to color in this proc */
                      gid_list,        /* global ids of colored vertices */
                      NULL,            /* we ignore local ids */
                      color);          /* result color */    
#else    
    rc = Zoltan_Color(zz, /* input (all remaining fields are output) */
                      1,  /* Number of integers used for a global ID */
                      ngids,           /* #objects to color in this proc */
                      gid_list,        /* global ids of colored vertices */
                      color);          /* result color */
#endif
    MPI_Barrier(MPI_COMM_WORLD);
    times[4] = u_wseconds();
    MPI_Reduce(times, gtimes, 5, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);


    if (rc != ZOLTAN_OK) 
        fprintf(stderr, "Zoltan_Color failed with return code %d...\n", rc);

    for (maxcol=i=0; i<ngids; ++i)
        if (color[i] > maxcol)
            maxcol = color[i];
    MPI_Reduce(&maxcol, &ncolors, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
    if (uz->myRank==0) {
        struct rusage usage;
                
        printf("%s setup             Proc-0: %8.2lf   Max: %8.2lf\n", argv[0], times[1]-times[0], gtimes[1]-gtimes[0]);
        printf("%s gen rand edges    Proc-0: %8.2lf   Max: %8.2lf\n", argv[0], times[2]-times[1], gtimes[2]-gtimes[1]);
        printf("%s set gids          Proc-0: %8.2lf   Max: %8.2lf\n", argv[0], times[3]-times[2], gtimes[3]-gtimes[2]);
        printf("%s Zoltan_Color call Proc-0: %8.2lf   Max: %8.2lf\n", argv[0], times[4]-times[3], gtimes[4]-gtimes[3]);
        printf("%s Coloring Time    : %.2lf   # Colors used : %d\n", argv[0], gtimes[4]-gtimes[0], ncolors);
        getrusage(RUSAGE_SELF, &usage);
        printf("%s maxrss=%ld minflt=%ld majflt=%ld nswap=%ld\n", argv[0], usage.ru_maxrss, usage.ru_minflt, usage.ru_majflt, usage.ru_nswap);                
    }
    
#ifdef _DEBUG
    saveColor(argv[0], uz, (int *) gid_list, color, ngids);
#endif

    /******************************************************************
     ** Clean up
     ******************************************************************/

    if (gid_list)
        free(gid_list);
    if (color)
        free(color);
    if (xadj)
        free(xadj);
    if (adj)
        free(adj);
    if (adjTemp)
        free(adjTemp);
    if (uz->redgeto)
        free(uz->redgeto);

End:    
    Zoltan_Destroy(&zz);
    MPI_Finalize();

    return 0;
}
Пример #2
0
int Zoltan_Random(
  ZZ *zz,                       /* The Zoltan structure.                     */
  float *part_sizes,            /* Input:  Array of size zz->LB.Num_Global_Parts
                                   * zz->Obj_Weight_Dim
                                   containing the percentage of work to be
                                   assigned to each partition.               */
  int *num_import,              /* Return -1. Random uses only export lists. */
  ZOLTAN_ID_PTR *import_global_ids, /* Not used. */
  ZOLTAN_ID_PTR *import_local_ids,  /* Not used. */
  int **import_procs,           /* Not used. */
  int **import_to_part,         /* Not used. */
  int *num_export,              /* Output: Number of objects to export. */
  ZOLTAN_ID_PTR *export_global_ids, /* Output: GIDs to export. */
  ZOLTAN_ID_PTR *export_local_ids,  /* Output: LIDs to export. */
  int **export_procs,           /* Output: Processsors to export to. */
  int **export_to_part          /* Output: Partitions to export to. */
)
{
  int ierr = ZOLTAN_OK;
  int i, count, num_obj;
  int max_export;
  double rand_frac = 1.0;       /* Default is to move all objects. */
  ZOLTAN_ID_PTR global_ids = NULL;
  ZOLTAN_ID_PTR local_ids = NULL; 
  int *parts = NULL;
  float *dummy = NULL;
  static char *yo = "Zoltan_Random";
  static int first_time = 1;

  ZOLTAN_TRACE_ENTER(zz, yo);

  /* Synchronize the random number generator. 
   * This synchronization is needed only for sanity in our nightly testing. 
   * If some other operation (eg., Zoltan_LB_Eval) changes the status of 
   * the random number generator, the answers here will change.  They won't
   * be wrong, but they will be different from our accepted answers.
   */
  if (first_time) {
    Zoltan_Srand(zz->Seed, NULL);
    Zoltan_Rand(NULL);
    first_time=0;
  }

  /* No import lists computed. */
  *num_import = -1;

  /* Get parameter values. */
  Zoltan_Bind_Param(Random_params, "RANDOM_MOVE_FRACTION", (void *) &rand_frac);
  Zoltan_Assign_Param_Vals(zz->Params, Random_params, zz->Debug_Level, 
                           zz->Proc, zz->Debug_Proc);

  /* Get list of local objects. */
  ierr = Zoltan_Get_Obj_List(zz, &num_obj, &global_ids, &local_ids, 0,
                             &dummy, &parts);

  /* Bound number of objects to export. */
  max_export = 1.5*rand_frac*num_obj;

  /* Allocate export lists. */
  *export_global_ids = *export_local_ids = NULL;
  *export_procs = *export_to_part = NULL;
  if (max_export > 0) {
    if (!Zoltan_Special_Malloc(zz, (void **)export_global_ids, max_export,
                               ZOLTAN_SPECIAL_MALLOC_GID)
     || !Zoltan_Special_Malloc(zz, (void **)export_local_ids, max_export,
                               ZOLTAN_SPECIAL_MALLOC_LID)
     || !Zoltan_Special_Malloc(zz, (void **)export_procs, max_export,
                               ZOLTAN_SPECIAL_MALLOC_INT)
     || !Zoltan_Special_Malloc(zz, (void **)export_to_part, max_export,
                               ZOLTAN_SPECIAL_MALLOC_INT)) {
      ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error.");
      ierr = ZOLTAN_MEMERR;
      goto End;
    }
  }

  /* Randomly assign ids to procs. */
  count=0;
  for (i=0; i<num_obj; i++){
    /* Randomly select some objects to move (export) */
    if ((count<max_export) && (Zoltan_Rand(NULL)<rand_frac*ZOLTAN_RAND_MAX)){
      /* export_global_ids[count] = global_ids[i]; */
      ZOLTAN_SET_GID(zz, &((*export_global_ids)[count*zz->Num_GID]),
                     &global_ids[i*zz->Num_GID]);
      if (local_ids)
        /* export_local_ids[count] = local_ids[i]; */
        ZOLTAN_SET_LID(zz, &((*export_local_ids)[count*zz->Num_LID]),
                       &local_ids[i*zz->Num_LID]);
      /* Randomly pick new partition number. */
      (*export_to_part)[count] = Zoltan_Rand_InRange(NULL, zz->LB.Num_Global_Parts);
      /* Processor number is derived from partition number. */
      (*export_procs)[count] = Zoltan_LB_Part_To_Proc(zz, 
                     (*export_to_part)[count], &global_ids[i*zz->Num_GID]);

      /* printf("Debug: Export gid %u to part %d and proc %d.\n", (*export_global_ids)[count], (*export_to_part)[count], (*export_procs)[count]); */

      ++count;
    }
  }
  (*num_export) = count;

End:
  /* Free local memory, but not export lists. */
  ZOLTAN_FREE(&global_ids);
  ZOLTAN_FREE(&local_ids);
  ZOLTAN_FREE(&parts);

  ZOLTAN_TRACE_EXIT(zz, yo);
  return ierr;
}
Пример #3
0
int Zoltan_PHG_CoarsePartition(
  ZZ *zz, 
  HGraph *phg,         /* Input:  coarse hypergraph -- distributed! */
  int numPart,         /* Input:  number of partitions to generate. */
  float *part_sizes,   /* Input:  array of size numPart listing target sizes
                                  (% of work) for the partitions */
  Partition part,      /* Input:  array of initial partition assignments.
                          Output: array of computed partition assignments.   */
  PHGPartParams *hgp   /* Input:  parameters to use.  */
)
{
/* 
 * Zoltan_PHG_CoarsePartition computes a partitioning of a hypergraph.
 * Typically, this routine is called at the bottom level in a
 * multilevel scheme (V-cycle).
 * It gathers the distributed hypergraph to each processor and computes
 * a decomposition of the serial hypergraph.  
 * It computes a different partition on each processor
 * using different random numbers (and possibly also
 * different algorithms) and selects the best.
 */
char *yo = "Zoltan_PHG_CoarsePartition";
int ierr = ZOLTAN_OK;
int i, si, j;
static PHGComm scomm;          /* Serial communicator info */
static int first_time = 1;
HGraph *shg = NULL;            /* Serial hypergraph gathered from phg */
int *spart = NULL;             /* Partition vectors for shg. */
int *new_part = NULL;          /* Ptr to new partition vector. */
float *bestvals = NULL;        /* Best cut values found so far */
int worst, new_cand;
float bal, cut, worst_cut;
int fine_timing = (hgp->use_timers > 2);
struct phg_timer_indices *timer = Zoltan_PHG_LB_Data_timers(zz);
int local_coarse_part = hgp->LocalCoarsePartition;

/* Number of iterations to try coarse partitioning on each proc. */
/* 10 when p=1, and 1 when p is large. */
const int num_coarse_iter = 1 + 9/zz->Num_Proc; 


  ZOLTAN_TRACE_ENTER(zz, yo);

  if (fine_timing) {
    if (timer->cpgather < 0)
      timer->cpgather = Zoltan_Timer_Init(zz->ZTime, 1, "CP Gather");
    if (timer->cprefine < 0)
      timer->cprefine = Zoltan_Timer_Init(zz->ZTime, 0, "CP Refine");
    if (timer->cpart < 0)
      timer->cpart = Zoltan_Timer_Init(zz->ZTime, 0, "CP Part");

    ZOLTAN_TIMER_START(zz->ZTime, timer->cpart, phg->comm->Communicator);
  }


  /* Force LocalCoarsePartition if large global graph */
#define LARGE_GRAPH_VTX   64000
#define LARGE_GRAPH_PINS 256000
  if (phg->dist_x[phg->comm->nProc_x] > LARGE_GRAPH_VTX){
    /* TODO: || (global_nPins > LARGE_GRAPH_PINS) */
    local_coarse_part = 1;
  }

  /* take care of all special cases first */

  if (!strcasecmp(hgp->coarsepartition_str, "no")
      || !strcasecmp(hgp->coarsepartition_str, "none")) {
    /* Do no coarse partitioning. */
    /* Do a sanity test and  mapping to parts [0,...,numPart-1] */
    int first = 1;
    PHGComm *hgc=phg->comm;    

    Zoltan_Srand_Sync (Zoltan_Rand(NULL), &(hgc->RNGState_col), hgc->col_comm);
    if (hgp->UsePrefPart) {
        for (i = 0; i < phg->nVtx; i++) {
            /* Impose fixed vertex/preferred part constraints. */
            if (phg->pref_part[i] < 0) { /* Free vertex in fixedvertex partitioning or repart */
                /* randomly assigned to a part */
                part[i] = Zoltan_Rand_InRange(&(hgc->RNGState_col), numPart);
            } else {
                if (phg->bisec_split < 0)
                    /* direct k-way, use part numbers directly */
                    part[i] = phg->pref_part[i];
                else
                    /* recursive bisection, map to 0-1 part numbers */
                    part[i] = (phg->pref_part[i] < phg->bisec_split ? 0 : 1);
            }            
        }
    } else {
        for (i = 0; i < phg->nVtx; i++) {
            if (part[i] >= numPart || part[i]<0) {
                if (first) {
                    ZOLTAN_PRINT_WARN(zz->Proc, yo, "Initial part number > numParts.");
                    first = 0;
                    ierr = ZOLTAN_WARN;
                }
                part[i] = ((part[i]<0) ? -part[i] : part[i]) % numPart;
            }        
        }
    }
  }
  else if (numPart == 1) {            
    /* everything goes in the one partition */
    for (i =  0; i < phg->nVtx; i++)
      part[i] = 0;
  }
  else if (!hgp->UsePrefPart && numPart >= phg->dist_x[phg->comm->nProc_x]) { 
    /* more partitions than vertices, trivial answer */
    for (i = 0; i < phg->nVtx; i++)
      part[i] = phg->dist_x[phg->comm->myProc_x]+i;
  }
  else if (local_coarse_part) {
    /* Apply local partitioner to each column */
    ierr = local_coarse_partitioner(zz, phg, numPart, part_sizes, part, hgp,
                                    hgp->CoarsePartition);
  }
  else {
    /* Normal case:
     * Gather distributed HG to each processor;
     * compute different partitioning on each processor;
     * select the "best" result.
     */
    ZOLTAN_PHG_COARSEPARTITION_FN *CoarsePartition;

    /* Select different coarse partitioners for processors here. */

    CoarsePartition = hgp->CoarsePartition;
    if (CoarsePartition == NULL) { /* auto */
      /* Select a coarse partitioner from the array of coarse partitioners */
      CoarsePartition = CoarsePartitionFns[phg->comm->myProc % 
                                           NUM_COARSEPARTITION_FNS];
    }


    if (phg->comm->nProc == 1) {
      /* Serial and parallel hgraph are the same. */
      shg = phg;
    }
    else {
      /* Set up a serial communication struct for gathered HG */

      if (first_time) {
        scomm.nProc_x = scomm.nProc_y = 1;
        scomm.myProc_x = scomm.myProc_y = 0;
        scomm.Communicator = MPI_COMM_SELF;
        scomm.row_comm = MPI_COMM_SELF;
        scomm.col_comm = MPI_COMM_SELF;
        scomm.myProc = 0;
        scomm.nProc = 1;
        first_time = 0;
      }
      scomm.RNGState = Zoltan_Rand(NULL);
      scomm.RNGState_row = Zoltan_Rand(NULL);
      scomm.RNGState_col = Zoltan_Rand(NULL);
      scomm.zz = zz;

      /* 
       * Gather parallel hypergraph phg to each processor, creating
       * serial hypergraph shg.
       */
      if (fine_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->cpart, phg->comm->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->cpgather, phg->comm->Communicator);
      }

      ierr = Zoltan_PHG_Gather_To_All_Procs(zz, phg, hgp, &scomm, &shg);
      if (ierr < 0) {
        ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Error returned from gather.");
        goto End;
      }

      if (fine_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->cpgather, phg->comm->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->cpart, phg->comm->Communicator);
      }

    }

    /* 
     * Allocate partition array spart for the serial hypergraph shg
     * and partition shg.
     */
    spart = (int *) ZOLTAN_CALLOC(shg->nVtx * (NUM_PART_KEEP+1),
                                    sizeof(int));
    bestvals = (float *) ZOLTAN_MALLOC((NUM_PART_KEEP+1)*sizeof(int)); 
    if ((!spart) || (!bestvals)) {
      ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Out of memory.");
      ierr = ZOLTAN_MEMERR;
      goto End;
    }
    
    /* Compute several coarse partitionings. */
    /* Keep the NUM_PART_KEEP best ones around. */
    /* Currently, only the best one is used. */

    /* Set RNG so different procs compute different parts. */
    Zoltan_Srand(Zoltan_Rand(NULL) + zz->Proc, NULL);

    new_cand = 0;
    new_part = spart;

    for (i=0; i< num_coarse_iter; i++){
      int savefmlooplimit=hgp->fm_loop_limit;
        
      /* Overwrite worst partition with new candidate. */
      ierr = CoarsePartition(zz, shg, numPart, part_sizes, 
               new_part, hgp);
      if (ierr < 0) {
        ZOLTAN_PRINT_ERROR(zz->Proc, yo, 
                         "Error returned from CoarsePartition.");
        goto End;
      }

      /* time refinement step in coarse partitioner */
      if (fine_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->cpart, phg->comm->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->cprefine, phg->comm->Communicator);
      }

      /* UVCUVC: Refine new candidate: only one pass is enough. */
      hgp->fm_loop_limit = 1;
      Zoltan_PHG_Refinement(zz, shg, numPart, part_sizes, new_part, hgp);
      hgp->fm_loop_limit = savefmlooplimit;
      
      /* stop refinement timer */
      if (fine_timing) {
        ZOLTAN_TIMER_STOP(zz->ZTime, timer->cprefine, phg->comm->Communicator);
        ZOLTAN_TIMER_START(zz->ZTime, timer->cpart, phg->comm->Communicator);
      }

      /* Decide if candidate is in the top tier or not. */
      /* Our objective is a combination of cuts and balance */

      bal = Zoltan_PHG_Compute_Balance(zz, shg, part_sizes, 0, 
                                       numPart, new_part); 
      cut = Zoltan_PHG_Compute_ConCut(shg->comm, shg, new_part, numPart, &ierr);
      
      /* Use ratio-cut as our objective. There are many other options! */
      bestvals[new_cand] = cut/(MAX(2.-bal, 0.0001)); /* avoid divide-by-0 */

      if (ierr < 0) {
        ZOLTAN_PRINT_ERROR(zz->Proc, yo, 
                         "Error returned from Zoltan_PHG_Compute_ConCut.");
        goto End;
      }
      if (i<NUM_PART_KEEP)
        new_cand = i+1;
      else {
        /* find worst partition vector, to overwrite it */
        /* future optimization: keep bestvals sorted */
        worst = 0;
        worst_cut = bestvals[0];
        for (j=1; j<NUM_PART_KEEP+1; j++){
          if (worst_cut < bestvals[j]){
            worst_cut = bestvals[j];
            worst = j;
          }
        }
        new_cand = worst;
      }
      new_part = spart+new_cand*(shg->nVtx);
    }
    /* Copy last partition vector such that all the best ones
       are contiguous starting at spart.                     */
    for (i=0; i<shg->nVtx; i++){
      new_part[i] = spart[NUM_PART_KEEP*(shg->nVtx)+i];
    }
    /* Also update bestvals */
    bestvals[new_cand] = bestvals[NUM_PART_KEEP];

    /* Evaluate and select the best. */
    /* For now, only pick the best one, in the future we pick the k best. */

    ierr = pick_best(zz, hgp, phg->comm, shg, numPart, 
              MIN(NUM_PART_KEEP, num_coarse_iter), spart,
              bestvals);
    if (ierr < 0) {
      ZOLTAN_PRINT_ERROR(zz->Proc, yo, 
                        "Error returned from pick_best.");
      goto End;
    }
  
    if (phg->comm->nProc > 1) {
      /* Map gathered partition back to 2D distribution */
      for (i = 0; i < phg->nVtx; i++) {
        /* KDDKDD  Assume vertices in serial HG are ordered by GNO of phg */
        si = VTX_LNO_TO_GNO(phg, i);
        part[i] = spart[si];
      }

      Zoltan_HG_HGraph_Free(shg);
      ZOLTAN_FREE(&shg);
    } 
    else { /* single processor */
      for (i = 0; i < phg->nVtx; i++)
        part[i] = spart[i];
    }
    ZOLTAN_FREE(&spart);
    ZOLTAN_FREE(&bestvals);
  }
  
End:
  if (fine_timing) 
    ZOLTAN_TIMER_STOP(zz->ZTime, timer->cpart, phg->comm->Communicator);

  ZOLTAN_TRACE_EXIT(zz, yo);
  return ierr;
}