Esempi in C++ (Cpp) per PrintTimingInfo

Esempio n. 1

0

Mostra file

File: gkmetis.c Progetto: KnoooW/gpgpu-sim

/***********************************************************************************
* This function is the entry point of the parallel ordering algorithm.
* This function assumes that the graph is already nice partitioned among the
* processors and then proceeds to perform recursive bisection.
************************************************************************************/
void ParMETIS_V3_PartGeom(idxtype *vtxdist, int *ndims, float *xyz, idxtype *part, MPI_Comm *comm)
{
  int i, npes, mype, nvtxs, firstvtx, dbglvl;
  idxtype *xadj, *adjncy;
  CtrlType ctrl;
  WorkSpaceType wspace;
  GraphType *graph;
  int zeroflg = 0;

  MPI_Comm_size(*comm, &npes);
  MPI_Comm_rank(*comm, &mype);

  if (npes == 1) {
    idxset(vtxdist[mype+1]-vtxdist[mype], 0, part);
    return;
  }

  /* Setup a fake graph to allow the rest of the code to work unchanged */
  dbglvl = 0;

  nvtxs = vtxdist[mype+1]-vtxdist[mype];
  firstvtx = vtxdist[mype];
  xadj = idxmalloc(nvtxs+1, "ParMETIS_PartGeom: xadj");
  adjncy = idxmalloc(nvtxs, "ParMETIS_PartGeom: adjncy");
  for (i=0; i<nvtxs; i++) {
    xadj[i] = i;
    adjncy[i] = firstvtx + (i+1)%nvtxs;
  }
  xadj[nvtxs] = nvtxs;

  /* Proceed with the rest of the code */
  SetUpCtrl(&ctrl, npes, dbglvl, *comm);
  ctrl.seed      = mype;
  ctrl.CoarsenTo = amin(vtxdist[npes]+1, 25*npes);

  graph = Moc_SetUpGraph(&ctrl, 1, vtxdist, xadj, NULL, adjncy, NULL, &zeroflg);

  PreAllocateMemory(&ctrl, graph, &wspace);

  /*=======================================================
   * Compute the initial geometric partitioning
   =======================================================*/
  IFSET(ctrl.dbglvl, DBG_TIME, InitTimers(&ctrl));
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.TotalTmr));

  Coordinate_Partition(&ctrl, graph, *ndims, xyz, 0, &wspace);

  idxcopy(graph->nvtxs, graph->where, part);

  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.TotalTmr));
  IFSET(ctrl.dbglvl, DBG_TIME, PrintTimingInfo(&ctrl));

  FreeInitialGraphAndRemap(graph, 0);
  FreeWSpace(&wspace);
  FreeCtrl(&ctrl);

  GKfree((void **)&xadj, (void **)&adjncy, LTERM);
}

Esempio n. 2

0

Mostra file

File: ometis.c Progetto: KnoooW/gpgpu-sim

/***********************************************************************************
* This function is the entry point of the parallel ordering algorithm.
* This function assumes that the graph is already nice partitioned among the 
* processors and then proceeds to perform recursive bisection.
************************************************************************************/
void ParMETIS_V3_NodeND(idxtype *vtxdist, idxtype *xadj, idxtype *adjncy, int *numflag,
              int *options, idxtype *order, idxtype *sizes, MPI_Comm *comm)
{
  int i, j;
  int ltvwgts[MAXNCON];
  int nparts, npes, mype, wgtflag = 0, seed = GLOBAL_SEED;
  CtrlType ctrl;
  WorkSpaceType wspace;
  GraphType *graph, *mgraph;
  idxtype *morder;
  int minnvtxs;

  MPI_Comm_size(*comm, &npes);
  MPI_Comm_rank(*comm, &mype);
  nparts = npes;

  if (!ispow2(npes)) {
    if (mype == 0)
      printf("Error: The number of processors must be a power of 2!\n");
    return;
  }

  if (vtxdist[npes] < (int)((float)(npes*npes)*1.2)) {
    if (mype == 0)
      printf("Error: Too many processors for this many vertices.\n");
    return;
  }

  minnvtxs = vtxdist[1]-vtxdist[0];
  for (i=0; i<npes; i++)
    minnvtxs = (minnvtxs < vtxdist[i+1]-vtxdist[i]) ? minnvtxs : vtxdist[i+1]-vtxdist[i];

  if (minnvtxs < (int)((float)npes*1.1)) {
    if (mype == 0)
      printf("Error: vertices are not distributed equally.\n");
    return;
  }
 

  if (*numflag == 1) 
    ChangeNumbering(vtxdist, xadj, adjncy, order, npes, mype, 1);

  SetUpCtrl(&ctrl, nparts, options[PMV3_OPTION_DBGLVL], *comm);
  ctrl.CoarsenTo = amin(vtxdist[npes]+1, 25*npes);

  ctrl.CoarsenTo = amin(vtxdist[npes]+1, 25*amax(npes, nparts));
  ctrl.seed = mype;
  ctrl.sync = seed;
  ctrl.partType = STATIC_PARTITION;
  ctrl.ps_relation = -1;
  ctrl.tpwgts = fsmalloc(nparts, 1.0/(float)(nparts), "tpwgts");
  ctrl.ubvec[0] = 1.03;

  graph = Moc_SetUpGraph(&ctrl, 1, vtxdist, xadj, NULL, adjncy, NULL, &wgtflag);

  PreAllocateMemory(&ctrl, graph, &wspace);

  /*=======================================================
   * Compute the initial k-way partitioning 
   =======================================================*/
  IFSET(ctrl.dbglvl, DBG_TIME, InitTimers(&ctrl));
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.TotalTmr));

  Moc_Global_Partition(&ctrl, graph, &wspace);

  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.TotalTmr));
  IFSET(ctrl.dbglvl, DBG_TIME, PrintTimingInfo(&ctrl));

  /*=======================================================
   * Move the graph according to the partitioning
   =======================================================*/
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.MoveTmr));

  MALLOC_CHECK(NULL);
  graph->ncon = 1;
  mgraph = Moc_MoveGraph(&ctrl, graph, &wspace);
  MALLOC_CHECK(NULL);

  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.MoveTmr));

  /*=======================================================
   * Now compute an ordering of the moved graph
   =======================================================*/
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.TotalTmr));

  FreeWSpace(&wspace);
  PreAllocateMemory(&ctrl, mgraph, &wspace);

  ctrl.ipart = ISEP_NODE;
  ctrl.CoarsenTo = amin(vtxdist[npes]+1, amax(20*npes, 1000));

  /* compute tvwgts */
  for (j=0; j<mgraph->ncon; j++)
    ltvwgts[j] = 0;

  for (i=0; i<mgraph->nvtxs; i++)
    for (j=0; j<mgraph->ncon; j++)
      ltvwgts[j] += mgraph->vwgt[i*mgraph->ncon+j];

  for (j=0; j<mgraph->ncon; j++)
    ctrl.tvwgts[j] = GlobalSESum(&ctrl, ltvwgts[j]);

  mgraph->nvwgt = fmalloc(mgraph->nvtxs*mgraph->ncon, "mgraph->nvwgt");
  for (i=0; i<mgraph->nvtxs; i++)
    for (j=0; j<mgraph->ncon; j++)
      mgraph->nvwgt[i*mgraph->ncon+j] = (float)(mgraph->vwgt[i*mgraph->ncon+j]) / (float)(ctrl.tvwgts[j]);


  morder = idxmalloc(mgraph->nvtxs, "PAROMETIS: morder");
  MultilevelOrder(&ctrl, mgraph, morder, sizes, &wspace);

  MALLOC_CHECK(NULL);

  /* Invert the ordering back to the original graph */
  ProjectInfoBack(&ctrl, graph, order, morder, &wspace);

  MALLOC_CHECK(NULL);

  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.TotalTmr));
  IFSET(ctrl.dbglvl, DBG_TIME, PrintTimingInfo(&ctrl));
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));

  free(ctrl.tpwgts);
  free(morder);
  FreeGraph(mgraph);
  FreeInitialGraphAndRemap(graph, 0);
  FreeWSpace(&wspace);
  FreeCtrl(&ctrl);

  if (*numflag == 1) 
    ChangeNumbering(vtxdist, xadj, adjncy, order, npes, mype, 0);

  MALLOC_CHECK(NULL);
}

Esempio n. 3

0

Mostra file

File: gkmetis.c Progetto: KnoooW/gpgpu-sim

/***********************************************************************************
* This function is the entry point of the parallel kmetis algorithm that uses
* coordinates to compute an initial graph distribution.
************************************************************************************/
void ParMETIS_V3_PartGeomKway(idxtype *vtxdist, idxtype *xadj, idxtype *adjncy,
              idxtype *vwgt, idxtype *adjwgt, int *wgtflag, int *numflag, int *ndims, 
	      float *xyz, int *ncon, int *nparts, float *tpwgts, float *ubvec, 
	      int *options, int *edgecut, idxtype *part, MPI_Comm *comm)
{
  int h, i, j;
  int nvtxs = -1, npes, mype;
  int uwgtflag, cut, gcut, maxnvtxs;
  int ltvwgts[MAXNCON];
  int moptions[10];
  CtrlType ctrl;
  idxtype *uvwgt;
  WorkSpaceType wspace;
  GraphType *graph, *mgraph;
  float avg, maximb, balance, *mytpwgts;
  int seed, dbglvl = 0;
  int iwgtflag, inumflag, incon, inparts, ioptions[10];
  float *itpwgts, iubvec[MAXNCON];

  MPI_Comm_size(*comm, &npes);
  MPI_Comm_rank(*comm, &mype);

  /********************************/
  /* Try and take care bad inputs */
  /********************************/
  if (options != NULL && options[0] == 1)
    dbglvl = options[PMV3_OPTION_DBGLVL];

  CheckInputs(STATIC_PARTITION, npes, dbglvl, wgtflag, &iwgtflag, numflag, &inumflag,
              ncon, &incon, nparts, &inparts, tpwgts, &itpwgts, ubvec, iubvec, 
	      NULL, NULL, options, ioptions, part, comm);


  /*********************************/
  /* Take care the nparts = 1 case */
  /*********************************/
  if (inparts <= 1) {
    idxset(vtxdist[mype+1]-vtxdist[mype], 0, part);
    *edgecut = 0;
    return;
  }

  /******************************/
  /* Take care of npes = 1 case */
  /******************************/
  if (npes == 1 && inparts > 1) {
    moptions[0] = 0;
    nvtxs = vtxdist[1];

    if (incon == 1) {
      METIS_WPartGraphKway(&nvtxs, xadj, adjncy, vwgt, adjwgt, &iwgtflag, &inumflag, 
            &inparts, itpwgts, moptions, edgecut, part);
    }
    else {
      /* ADD: this is because METIS does not support tpwgts for all constraints */
      mytpwgts = fmalloc(inparts, "mytpwgts");
      for (i=0; i<inparts; i++)
        mytpwgts[i] = itpwgts[i*incon];

      moptions[7] = -1;
      METIS_mCPartGraphRecursive2(&nvtxs, &incon, xadj, adjncy, vwgt, adjwgt, &iwgtflag, 
            &inumflag, &inparts, mytpwgts, moptions, edgecut, part);

      free(mytpwgts);
    }

    return;
  }


  if (inumflag == 1)
    ChangeNumbering(vtxdist, xadj, adjncy, part, npes, mype, 1);

  /*****************************/
  /* Set up control structures */
  /*****************************/
  if (ioptions[0] == 1) {
    dbglvl = ioptions[PMV3_OPTION_DBGLVL];
    seed = ioptions[PMV3_OPTION_SEED];
  }
  else {
    dbglvl = GLOBAL_DBGLVL;
    seed = GLOBAL_SEED;
  }
  SetUpCtrl(&ctrl, npes, dbglvl, *comm);
  ctrl.CoarsenTo = amin(vtxdist[npes]+1, 25*incon*amax(npes, inparts));
  ctrl.seed = (seed == 0) ? mype : seed*mype;
  ctrl.sync = GlobalSEMax(&ctrl, seed);
  ctrl.partType = STATIC_PARTITION;
  ctrl.ps_relation = -1;
  ctrl.tpwgts = itpwgts;
  scopy(incon, iubvec, ctrl.ubvec);

  uwgtflag = iwgtflag|2;
  uvwgt = idxsmalloc(vtxdist[mype+1]-vtxdist[mype], 1, "uvwgt");
  graph = Moc_SetUpGraph(&ctrl, 1, vtxdist, xadj, uvwgt, adjncy, adjwgt, &uwgtflag);
  free(graph->nvwgt); graph->nvwgt = NULL;

  PreAllocateMemory(&ctrl, graph, &wspace);

  /*=================================================================
   * Compute the initial npes-way partitioning geometric partitioning
   =================================================================*/
  IFSET(ctrl.dbglvl, DBG_TIME, InitTimers(&ctrl));
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.TotalTmr));

  Coordinate_Partition(&ctrl, graph, *ndims, xyz, 1, &wspace);

  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.TotalTmr));
  IFSET(ctrl.dbglvl, DBG_TIME, PrintTimingInfo(&ctrl));

  /*=================================================================
   * Move the graph according to the partitioning
   =================================================================*/
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.MoveTmr));

  free(uvwgt);
  graph->vwgt = ((iwgtflag&2) != 0) ? vwgt : idxsmalloc(graph->nvtxs*incon, 1, "vwgt");
  graph->ncon = incon;
  j = ctrl.nparts;
  ctrl.nparts = ctrl.npes;
  mgraph = Moc_MoveGraph(&ctrl, graph, &wspace);
  ctrl.nparts = j;

  /**********************************************************/
  /* Do the same functionality as Moc_SetUpGraph for mgraph */
  /**********************************************************/
  /* compute tvwgts */
  for (j=0; j<incon; j++)
    ltvwgts[j] = 0;

  for (i=0; i<graph->nvtxs; i++)
    for (j=0; j<incon; j++)
      ltvwgts[j] += mgraph->vwgt[i*incon+j];

  for (j=0; j<incon; j++)
    ctrl.tvwgts[j] = GlobalSESum(&ctrl, ltvwgts[j]);

  /* check for zero wgt constraints */
  for (i=0; i<incon; i++) {
    /* ADD: take care of the case in which tvwgts is zero */
    if (ctrl.tvwgts[i] == 0) {
      if (ctrl.mype == 0) printf("ERROR: sum weight for constraint %d is zero\n", i);
      MPI_Finalize();
      exit(-1);
    }
  }

  /* compute nvwgt */
  mgraph->nvwgt = fmalloc(mgraph->nvtxs*incon, "mgraph->nvwgt");
  for (i=0; i<mgraph->nvtxs; i++)
    for (j=0; j<incon; j++)
      mgraph->nvwgt[i*incon+j] = (float)(mgraph->vwgt[i*incon+j]) / (float)(ctrl.tvwgts[j]);


  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.MoveTmr));

  if (ctrl.dbglvl&DBG_INFO) {
    cut = 0;
    for (i=0; i<graph->nvtxs; i++)
      for (j=graph->xadj[i]; j<graph->xadj[i+1]; j++)
        if (graph->where[i] != graph->where[graph->adjncy[j]])
          cut += graph->adjwgt[j];
    gcut = GlobalSESum(&ctrl, cut)/2;
    maxnvtxs = GlobalSEMax(&ctrl, mgraph->nvtxs);
    balance = (float)(maxnvtxs)/((float)(graph->gnvtxs)/(float)(npes));
    rprintf(&ctrl, "XYZ Cut: %6d \tBalance: %6.3f [%d %d %d]\n",
      gcut, balance, maxnvtxs, graph->gnvtxs, npes);

  }

  /*=================================================================
   * Set up the newly moved graph
   =================================================================*/
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.TotalTmr));

  ctrl.nparts = inparts;
  FreeWSpace(&wspace);
  PreAllocateMemory(&ctrl, mgraph, &wspace);

  /*=======================================================
   * Now compute the partition of the moved graph
   =======================================================*/
  if (vtxdist[npes] < SMALLGRAPH || vtxdist[npes] < npes*20 || GlobalSESum(&ctrl, mgraph->nedges) == 0) {
    IFSET(ctrl.dbglvl, DBG_INFO, rprintf(&ctrl, "Partitioning a graph of size %d serially\n", vtxdist[npes]));
    PartitionSmallGraph(&ctrl, mgraph, &wspace);
  }
  else {
    Moc_Global_Partition(&ctrl, mgraph, &wspace);
  }
  ParallelReMapGraph(&ctrl, mgraph, &wspace);

  /* Invert the ordering back to the original graph */
  ctrl.nparts = npes;
  ProjectInfoBack(&ctrl, graph, part, mgraph->where, &wspace);

  *edgecut = mgraph->mincut;

  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.TotalTmr));

  /*******************/
  /* Print out stats */
  /*******************/
  IFSET(ctrl.dbglvl, DBG_TIME, PrintTimingInfo(&ctrl));
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));

  if (ctrl.dbglvl&DBG_INFO) {
    rprintf(&ctrl, "Final %d-way CUT: %6d \tBalance: ", inparts, mgraph->mincut);
    avg = 0.0;
    for (h=0; h<incon; h++) {
      maximb = 0.0;
      for (i=0; i<inparts; i++)
        maximb = amax(maximb, mgraph->gnpwgts[i*incon+h]/itpwgts[i*incon+h]);
      avg += maximb;
      rprintf(&ctrl, "%.3f ", maximb);
    }
    rprintf(&ctrl, "  avg: %.3f\n", avg/(float)incon);
  }

  GKfree((void **)&itpwgts, LTERM);
  FreeGraph(mgraph);
  FreeInitialGraphAndRemap(graph, iwgtflag);
  FreeWSpace(&wspace);
  FreeCtrl(&ctrl);

  if (inumflag == 1)
    ChangeNumbering(vtxdist, xadj, adjncy, part, npes, mype, 0);

}

Esempio n. 4

0

Mostra file

File: rmetis.c Progetto: ADTG-VSSC/SU2

/***********************************************************************************
* This function is the entry point of the parallel multilevel local diffusion
* algorithm. It uses parallel undirected diffusion followed by adaptive k-way 
* refinement. This function utilizes local coarsening.
************************************************************************************/
int ParMETIS_V3_RefineKway(idx_t *vtxdist, idx_t *xadj, idx_t *adjncy, idx_t *vwgt, 
        idx_t *adjwgt, idx_t *wgtflag, idx_t *numflag, idx_t *ncon, idx_t *nparts, 
        real_t *tpwgts, real_t *ubvec, idx_t *options, idx_t *edgecut, idx_t *part, 
        MPI_Comm *comm)
{
  idx_t npes, mype, status;
  ctrl_t *ctrl=NULL;
  graph_t *graph=NULL;
  size_t curmem;


  /* Check the input parameters and return if an error */
  status = CheckInputsPartKway(vtxdist, xadj, adjncy, vwgt, adjwgt, wgtflag, 
               numflag, ncon, nparts, tpwgts, ubvec, options, edgecut, part, comm);
  if (GlobalSEMinComm(*comm, status) == 0) 
    return METIS_ERROR;

  status = METIS_OK;
  gk_malloc_init();
  curmem = gk_GetCurMemoryUsed();

  /* Setup ctrl */
  ctrl = SetupCtrl(PARMETIS_OP_RMETIS, options, *ncon, *nparts, tpwgts, ubvec, *comm);
  npes = ctrl->npes;
  mype = ctrl->mype;


  /* Take care the nparts == 1 case */
  if (*nparts == 1) {
    iset(vtxdist[mype+1]-vtxdist[mype], (*numflag == 0 ? 0 : 1), part); 
    *edgecut = 0;
    goto DONE;
  }


  /* setup the graph */
  if (*numflag > 0) 
    ChangeNumbering(vtxdist, xadj, adjncy, part, npes, mype, 1);

  graph = SetupGraph(ctrl, *ncon, vtxdist, xadj, vwgt, NULL, adjncy, adjwgt, *wgtflag);

  if (ctrl->ps_relation == PARMETIS_PSR_COUPLED)
    iset(graph->nvtxs, mype, graph->home);
  else
    icopy(graph->nvtxs, part, graph->home);


  /* Allocate workspace */
  AllocateWSpace(ctrl, 10*graph->nvtxs);


  /* Partition and Remap */
  STARTTIMER(ctrl, ctrl->TotalTmr);

  ctrl->CoarsenTo = gk_min(vtxdist[npes]+1, 50*(*ncon)*gk_max(npes, *nparts));

  Adaptive_Partition(ctrl, graph);
  ParallelReMapGraph(ctrl, graph);

  icopy(graph->nvtxs, graph->where, part);
  *edgecut = graph->mincut;

  STOPTIMER(ctrl, ctrl->TotalTmr);

  /* Take care of output */
  IFSET(ctrl->dbglvl, DBG_TIME, PrintTimingInfo(ctrl));
  IFSET(ctrl->dbglvl, DBG_TIME, gkMPI_Barrier(ctrl->gcomm));
  IFSET(ctrl->dbglvl, DBG_INFO, PrintPostPartInfo(ctrl, graph, 1));

  FreeInitialGraphAndRemap(graph);

  if (*numflag > 0)
    ChangeNumbering(vtxdist, xadj, adjncy, part, npes, mype, 0);

DONE:
  FreeCtrl(&ctrl);
  if (gk_GetCurMemoryUsed() - curmem > 0) {
    printf("ParMETIS appears to have a memory leak of %zdbytes. Report this.\n",
        (ssize_t)(gk_GetCurMemoryUsed() - curmem));
  }
  gk_malloc_cleanup(0);

  return (int)status;
}

Esempio n. 5

0

Mostra file

File: arith_utilization.cpp Progetto: tony1223yu/GPGPU_MicroBenchmark_Suite

int main(int argc, char *argv[])
{
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue command_queue;
    cl_program program;
    cl_kernel kernel;
    cl_mem buffer;
    cl_int error;
    cl_event event;
    cl_ulong startTime, endTime;
    size_t globalSize[1], localSize[1], warpSize;
    FILE* fptr;
    unsigned long long start, end;

    void* hostData = NULL;

    /* Parse options */
    CommandParser(argc, argv);
    HostDataCreation(hostData);

    GetPlatformAndDevice(platform, device);
    fptr = fopen(g_opencl_ctrl.powerFile, "a");

    /* Create context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
    CHECK_CL_ERROR(error);

    /* Create command queue */
#ifdef USE_CL_2_0_API
    {
        cl_queue_properties property[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
        command_queue = clCreateCommandQueueWithProperties(context, device, property, &error);
    }
#else
    {
        command_queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &error);
    }
#endif
    CHECK_CL_ERROR(error);

    /* Create program */
    CreateAndBuildProgram(program, context, device, strdup(g_opencl_ctrl.fileName));

    /* Create kernels */
    kernel = clCreateKernel(program, g_opencl_ctrl.kernelName, &error);
    CHECK_CL_ERROR(error);

    error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &warpSize, NULL);
    CHECK_CL_ERROR(error);
    fprintf(stderr, "Preferred work group size: %lu\n", warpSize);

#if 0
    fprintf(stderr, "\nData before process:\n");
    switch (g_opencl_ctrl.dataType)
    {
        case TYPE_INT:
            {
                int *intptr = (int *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%d ", intptr[i]);
                fprintf(stderr, "\n");
            }
            break;
        case TYPE_FLOAT:
            {
                float *fltptr = (float *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%f ", fltptr[i]);
                fprintf(stderr, "\n");
            }
            break;
       case TYPE_DOUBLE:
            {
                double *dblptr = (double *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%lf ", dblptr[i]);
                fprintf(stderr, "\n");
            }
            break;
    }
#endif

    /* Create buffers */
    buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, g_opencl_ctrl.dataByte, hostData, &error);
    CHECK_CL_ERROR(error);

    /* Execute kernels */
    error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel, 1, sizeof(long), &g_opencl_ctrl.iteration);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel, 2, sizeof(int), &g_opencl_ctrl.interval);
    CHECK_CL_ERROR(error);

    start = PrintTimingInfo(fptr);

    globalSize[0] = g_opencl_ctrl.global_size;
    localSize[0] = g_opencl_ctrl.local_size;
    error = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalSize, localSize, 0, NULL, &event);
    CHECK_CL_ERROR(error);
    error = clFinish(command_queue);
    CHECK_CL_ERROR(error);

    end = PrintTimingInfo(fptr);
    fclose(fptr);

    error = clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, g_opencl_ctrl.dataByte, hostData, 0, NULL, NULL);
    CHECK_CL_ERROR(error);

#if 0
    fprintf(stderr, "\nData after process:\n");
    switch (g_opencl_ctrl.dataType)
    {
        case TYPE_INT:
            {
                int *intptr = (int *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%d ", intptr[i]);
                fprintf(stderr, "\n");
            }
            break;
        case TYPE_FLOAT:
            {
                float *fltptr = (float *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%f ", fltptr[i]);
                fprintf(stderr, "\n");
            }
            break;
       case TYPE_DOUBLE:
            {
                double *dblptr = (double *)(hostData);
                for (int i = 0 ; i < DATA_SIZE * g_opencl_ctrl.global_size ; i ++)
                    fprintf(stderr, "%lf ", dblptr[i]);
                fprintf(stderr, "\n");
            }
            break;
    }
#endif

    /* Event profiling */
    error = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL);
    CHECK_CL_ERROR(error);
    error = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(endTime), &endTime, NULL);
    CHECK_CL_ERROR(error);
    fprintf(stderr, "\n['%s' execution time] %llu ns\n", g_opencl_ctrl.kernelName, (end - start) * 1000);
    fprintf(stdout, "%llu\n", (end - start) * 1000);

    /* Read the output */

    /* Release object */
    clReleaseKernel(kernel);
    clReleaseMemObject(buffer);
    clReleaseEvent(event);
    clReleaseProgram(program);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
    free(hostData);

    return 0;
}

Esempio n. 6

0

Mostra file

File: matrix.cpp Progetto: tony1223yu/GPGPU_MicroBenchmark_Suite

int main(int argc, char *argv[])
{
    FILE* g_fptr;
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue command_queue;
    cl_program program;
    cl_kernel kernel1, kernel2;
    cl_mem inputBufferA;
    cl_int error;
    size_t globalSize[2], localSize[2];

    struct timeval startTime, endTime;

    void* inputMatrixA = NULL;
    /* Parse options */
    CommandParser(argc, argv);

    g_fptr = fopen(g_opencl_ctrl.powerFile, "a");
    if (!g_fptr)
        exit(1);

    HostDataCreation(inputMatrixA);

    GetPlatformAndDevice(platform, device);

    /* Create context */
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
    CHECK_CL_ERROR(error);

    /* Create command queue */
    command_queue = clCreateCommandQueue(context, device, 0, &error);
    CHECK_CL_ERROR(error);

    /* Create program */
    CreateAndBuildProgram(program, context, device, strdup(CL_FILE_NAME));

    /* Create kernels */
    kernel1 = clCreateKernel(program, "Generate", &error);
    CHECK_CL_ERROR(error);
    kernel2 = clCreateKernel(program, "Access", &error);
    CHECK_CL_ERROR(error);

    /* Create buffers */
    inputBufferA = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, g_opencl_ctrl.inputByteA, inputMatrixA, &error);
    CHECK_CL_ERROR(error);

    /* Execute kernels */
    error = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &inputBufferA);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel1, 1, sizeof(int), &g_opencl_ctrl.dataSizeW);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel1, 2, sizeof(int), &g_opencl_ctrl.dataSizeH);
    CHECK_CL_ERROR(error);

    error = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &inputBufferA);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel2, 1, sizeof(int), &g_opencl_ctrl.dataSizeW);
    CHECK_CL_ERROR(error);
    error = clSetKernelArg(kernel2, 2, sizeof(int), &g_opencl_ctrl.iteration);
    CHECK_CL_ERROR(error);


    globalSize[0] = g_opencl_ctrl.dataSizeW;
    globalSize[1] = g_opencl_ctrl.dataSizeH;
    localSize[0] = g_opencl_ctrl.local_size1;
    localSize[1] = g_opencl_ctrl.local_size2;

    fprintf(stderr, "global size: %lu %lu\n", globalSize[0], globalSize[1]);
    fprintf(stderr, "local size: %lu %lu\n", localSize[0], localSize[1]);

    error = clEnqueueNDRangeKernel(command_queue, kernel1, 2, NULL, globalSize, localSize, 0, NULL, NULL);
    CHECK_CL_ERROR(error);
    error = clFinish(command_queue);
    CHECK_CL_ERROR(error);
 
    PrintTimingInfo(g_fptr);

    if (g_opencl_ctrl.timing)
        gettimeofday(&startTime, NULL);

    error = clEnqueueNDRangeKernel(command_queue, kernel2, 2, NULL, globalSize, localSize, 0, NULL, NULL);
    CHECK_CL_ERROR(error);
    error = clFinish(command_queue);
    CHECK_CL_ERROR(error);
    PrintTimingInfo(g_fptr);

    if (g_opencl_ctrl.timing)
        gettimeofday(&endTime, NULL);

    fclose(g_fptr);

    /* Read the output */
    error = clEnqueueReadBuffer(command_queue, inputBufferA, CL_TRUE, 0, g_opencl_ctrl.inputByteA, inputMatrixA, 0, NULL, NULL);
    CHECK_CL_ERROR(error);

    /* Release object */
    clReleaseKernel(kernel1);
    clReleaseKernel(kernel2);
    clReleaseMemObject(inputBufferA);
    clReleaseProgram(program);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
    free(inputMatrixA);

    if (g_opencl_ctrl.timing)
    {
        unsigned long long start, end;
        start = startTime.tv_sec * 1000000 + startTime.tv_usec;
        end = endTime.tv_sec * 1000000 + endTime.tv_usec;

        fprintf(stderr, "Kernel execution time: %llu ms\n", (end - start) / 1000);
        fprintf(stdout, "%llu\n", (end - start) * 1000);
    }

    fprintf(stderr, "DONE.\n");

    return 0;
}

Esempio n. 7

0

Mostra file

File: gkmetis.c Progetto: certik/libmesh

/***********************************************************************************
* This function is the entry point of the parallel kmetis algorithm that uses
* coordinates to compute an initial graph distribution.
************************************************************************************/
int ParMETIS_V3_PartGeomKway(idx_t *vtxdist, idx_t *xadj, idx_t *adjncy,
        idx_t *vwgt, idx_t *adjwgt, idx_t *wgtflag, idx_t *numflag, idx_t *ndims, 
	real_t *xyz, idx_t *ncon, idx_t *nparts, real_t *tpwgts, real_t *ubvec, 
	idx_t *options, idx_t *edgecut, idx_t *part, MPI_Comm *comm)
{
  idx_t h, i, j, npes, mype, status, nvtxs, seed, dbglvl;
  idx_t cut, gcut, maxnvtxs;
  idx_t moptions[METIS_NOPTIONS];
  ctrl_t *ctrl;
  graph_t *graph, *mgraph;
  real_t balance;
  size_t curmem;

  /* Check the input parameters and return if an error */
  status = CheckInputsPartGeomKway(vtxdist, xadj, adjncy, vwgt, adjwgt, wgtflag,
                numflag, ndims, xyz, ncon, nparts, tpwgts, ubvec, options, 
                edgecut, part, comm);
  if (GlobalSEMinComm(*comm, status) == 0)
    return METIS_ERROR;

  status = METIS_OK;
  gk_malloc_init();
  curmem = gk_GetCurMemoryUsed();

  /* Setup the ctrl */
  ctrl = SetupCtrl(PARMETIS_OP_GKMETIS, options, *ncon, *nparts, tpwgts, ubvec, *comm);
  npes = ctrl->npes;
  mype = ctrl->mype;

  /* Take care the nparts == 1 case */
  if (*nparts == 1) {
    iset(vtxdist[mype+1]-vtxdist[mype], (*numflag == 0 ? 0 : 1), part);
    *edgecut = 0;
    goto DONE;
  }


  /* Take care of npes == 1 case */
  if (npes == 1) {
    nvtxs = vtxdist[1] - vtxdist[0];  /* subtraction is required when numflag==1 */

    METIS_SetDefaultOptions(moptions);
    moptions[METIS_OPTION_NUMBERING] = *numflag;

    status = METIS_PartGraphKway(&nvtxs, ncon, xadj, adjncy, vwgt, NULL, adjwgt, 
                 nparts, tpwgts, ubvec, moptions, edgecut, part);

    goto DONE;
  }


  /* Setup the graph */
  if (*numflag > 0)
    ChangeNumbering(vtxdist, xadj, adjncy, part, npes, mype, 1);

  graph = SetupGraph(ctrl, *ncon, vtxdist, xadj, vwgt, NULL, adjncy, adjwgt, *wgtflag);
  gk_free((void **)&graph->nvwgt, LTERM); 


  /* Allocate the workspace */
  AllocateWSpace(ctrl, 10*graph->nvtxs);


  /* Compute the initial npes-way partitioning geometric partitioning */
  STARTTIMER(ctrl, ctrl->TotalTmr);

  Coordinate_Partition(ctrl, graph, *ndims, xyz, 1);

  STOPTIMER(ctrl, ctrl->TotalTmr);


  /* Move the graph according to the partitioning */
  STARTTIMER(ctrl, ctrl->MoveTmr);

  ctrl->nparts = npes;
  mgraph = MoveGraph(ctrl, graph);
  ctrl->nparts = *nparts;

  SetupGraph_nvwgts(ctrl, mgraph); /* compute nvwgts for the moved graph */

  if (ctrl->dbglvl&DBG_INFO) {
    CommInterfaceData(ctrl, graph, graph->where, graph->where+graph->nvtxs);
    for (cut=0, i=0; i<graph->nvtxs; i++) {
      for (j=graph->xadj[i]; j<graph->xadj[i+1]; j++) {
        if (graph->where[i] != graph->where[graph->adjncy[j]])
          cut += graph->adjwgt[j];
      }
    }
    gcut     = GlobalSESum(ctrl, cut)/2;
    maxnvtxs = GlobalSEMax(ctrl, mgraph->nvtxs);
    balance  = (real_t)(maxnvtxs)/((real_t)(graph->gnvtxs)/(real_t)(npes));
    rprintf(ctrl, "XYZ Cut: %6"PRIDX" \tBalance: %6.3"PRREAL" [%"PRIDX" %"PRIDX" %"PRIDX"]\n",
       gcut, balance, maxnvtxs, graph->gnvtxs, npes);
  }

  STOPTIMER(ctrl, ctrl->MoveTmr);


  /* Compute the partition of the moved graph */
  STARTTIMER(ctrl, ctrl->TotalTmr);

  ctrl->CoarsenTo = gk_min(vtxdist[npes]+1, 25*(*ncon)*gk_max(npes, *nparts));

  if (vtxdist[npes] < SMALLGRAPH 
      || vtxdist[npes] < npes*20 
      || GlobalSESum(ctrl, mgraph->nedges) == 0) { /* serially */
    IFSET(ctrl->dbglvl, DBG_INFO, 
        rprintf(ctrl, "Partitioning a graph of size %"PRIDX" serially\n", vtxdist[npes]));
    PartitionSmallGraph(ctrl, mgraph);
  }
  else { /* in parallel */
    Global_Partition(ctrl, mgraph);
  }

  ParallelReMapGraph(ctrl, mgraph);

  /* Invert the ordering back to the original graph */
  ctrl->nparts = npes;
  ProjectInfoBack(ctrl, graph, part, mgraph->where);
  ctrl->nparts = *nparts;

  *edgecut = mgraph->mincut;

  STOPTIMER(ctrl, ctrl->TotalTmr);


  /* Print some stats */
  IFSET(ctrl->dbglvl, DBG_TIME, PrintTimingInfo(ctrl));
  IFSET(ctrl->dbglvl, DBG_TIME, gkMPI_Barrier(ctrl->gcomm));
  IFSET(ctrl->dbglvl, DBG_INFO, PrintPostPartInfo(ctrl, mgraph, 0));

  FreeGraph(mgraph);
  FreeInitialGraphAndRemap(graph);

  if (*numflag > 0)
    ChangeNumbering(vtxdist, xadj, adjncy, part, npes, mype, 0);

DONE:
  FreeCtrl(&ctrl);
  if (gk_GetCurMemoryUsed() - curmem > 0) {
    printf("ParMETIS appears to have a memory leak of %zdbytes. Report this.\n",
        (ssize_t)(gk_GetCurMemoryUsed() - curmem));
  }
  gk_malloc_cleanup(0);

  return (int)status;
}

Esempio n. 8

0

Mostra file

File: gkmetis.c Progetto: certik/libmesh

/***********************************************************************************
* This function is the entry point of the parallel ordering algorithm.
* This function assumes that the graph is already nice partitioned among the
* processors and then proceeds to perform recursive bisection.
************************************************************************************/
int ParMETIS_V3_PartGeom(idx_t *vtxdist, idx_t *ndims, real_t *xyz, idx_t *part, 
         MPI_Comm *comm)
{
  idx_t i, nvtxs, firstvtx, npes, mype, status;
  idx_t *xadj, *adjncy;
  ctrl_t *ctrl=NULL;
  graph_t *graph=NULL;
  size_t curmem;


  /* Check the input parameters and return if an error */
  status = CheckInputsPartGeom(vtxdist, ndims, xyz, part, comm);
  if (GlobalSEMinComm(*comm, status) == 0)
    return METIS_ERROR;

  status = METIS_OK;
  gk_malloc_init();
  curmem = gk_GetCurMemoryUsed();

  /* Setup the ctrl */
  ctrl = SetupCtrl(PARMETIS_OP_GMETIS, NULL, 1, 1, NULL, NULL, *comm);
  /*ctrl->dbglvl=15;*/
  npes = ctrl->npes;
  mype = ctrl->mype;


  /* Trivial case when npes == 1 */
  if (npes == 1) {
    iset(vtxdist[mype+1]-vtxdist[mype], 0, part);
    goto DONE;
  }


  /* Setup a fake graph to allow the rest of the code to work unchanged */
  nvtxs    = vtxdist[mype+1]-vtxdist[mype];
  firstvtx = vtxdist[mype];
  xadj     = imalloc(nvtxs+1, "ParMETIS_PartGeom: xadj");
  adjncy   = imalloc(nvtxs, "ParMETIS_PartGeom: adjncy");
  for (i=0; i<nvtxs; i++) {
    xadj[i] = i;
    adjncy[i] = firstvtx + (i+1)%nvtxs;
  }
  xadj[nvtxs] = nvtxs;

  graph = SetupGraph(ctrl, 1, vtxdist, xadj, NULL, NULL, adjncy, NULL, 0);


  /* Allocate workspace memory */
  AllocateWSpace(ctrl, 5*graph->nvtxs);


  /* Compute the initial geometric partitioning */
  STARTTIMER(ctrl, ctrl->TotalTmr);

  Coordinate_Partition(ctrl, graph, *ndims, xyz, 0);
  icopy(graph->nvtxs, graph->where, part);

  STOPTIMER(ctrl, ctrl->TotalTmr);
  IFSET(ctrl->dbglvl, DBG_TIME, PrintTimingInfo(ctrl));


  gk_free((void **)&xadj, (void **)&adjncy, LTERM);
  FreeInitialGraphAndRemap(graph);


DONE:
  FreeCtrl(&ctrl);
  if (gk_GetCurMemoryUsed() - curmem > 0) {
    printf("ParMETIS appears to have a memory leak of %zdbytes. Report this.\n",
        (ssize_t)(gk_GetCurMemoryUsed() - curmem));
  }
  gk_malloc_cleanup(0);

  return (int)status;
}

Esempio n. 9

0

Mostra file

File: pspases.c Progetto: ADTG-VSSC/SU2

/***********************************************************************************
* This function is the entry point of the serial ordering algorithm.
************************************************************************************/
int ParMETIS_SerialNodeND(idx_t *vtxdist, idx_t *xadj, idx_t *adjncy, 
        idx_t *numflag, idx_t *options, idx_t *order, idx_t *sizes, 
        MPI_Comm *comm)
{
  idx_t i, npes, mype;
  ctrl_t *ctrl=NULL;
  graph_t *agraph=NULL;
  idx_t *perm=NULL, *iperm=NULL;
  idx_t *sendcount, *displs;

  /* Setup the ctrl */
  ctrl = SetupCtrl(PARMETIS_OP_OMETIS, options, 1, 1, NULL, NULL, *comm);
  npes = ctrl->npes;
  mype = ctrl->mype;

  if (!ispow2(npes)) {
    if (mype == 0)
      printf("Error: The number of processors must be a power of 2!\n");
    FreeCtrl(&ctrl);
    return METIS_ERROR;
  }


  if (*numflag > 0) 
    ChangeNumbering(vtxdist, xadj, adjncy, order, npes, mype, 1);

  STARTTIMER(ctrl, ctrl->TotalTmr);
  STARTTIMER(ctrl, ctrl->MoveTmr);

  agraph = AssembleEntireGraph(ctrl, vtxdist, xadj, adjncy);

  STOPTIMER(ctrl, ctrl->MoveTmr);

  if (mype == 0) {
    perm  = imalloc(agraph->nvtxs, "PAROMETISS: perm");
    iperm = imalloc(agraph->nvtxs, "PAROMETISS: iperm");

    METIS_NodeNDP(agraph->nvtxs, agraph->xadj, agraph->adjncy, 
        agraph->vwgt, npes, NULL, perm, iperm, sizes);
  }

  STARTTIMER(ctrl, ctrl->MoveTmr);

  /* Broadcast the sizes array */
  gkMPI_Bcast((void *)sizes, 2*npes, IDX_T, 0, ctrl->gcomm);

  /* Scatter the iperm */
  sendcount = imalloc(npes, "PAROMETISS: sendcount");
  displs = imalloc(npes, "PAROMETISS: displs");
  for (i=0; i<npes; i++) {
    sendcount[i] = vtxdist[i+1]-vtxdist[i];
    displs[i] = vtxdist[i];
  }

  gkMPI_Scatterv((void *)iperm, sendcount, displs, IDX_T, (void *)order, 
      vtxdist[mype+1]-vtxdist[mype], IDX_T, 0, ctrl->gcomm);

  STOPTIMER(ctrl, ctrl->MoveTmr);
  STOPTIMER(ctrl, ctrl->TotalTmr);
  IFSET(ctrl->dbglvl, DBG_TIME, PrintTimingInfo(ctrl));
  IFSET(ctrl->dbglvl, DBG_TIME, gkMPI_Barrier(ctrl->gcomm));

  gk_free((void **)&agraph->xadj, &agraph->adjncy, &perm, &iperm, 
      &sendcount, &displs, &agraph, LTERM);

  if (*numflag > 0) 
    ChangeNumbering(vtxdist, xadj, adjncy, order, npes, mype, 0);

  goto DONE;

DONE:
  FreeCtrl(&ctrl);
  return METIS_OK;
}

Esempio n. 10

0

Mostra file

File: rmetis.c Progetto: KnoooW/gpgpu-sim

/***********************************************************************************
* This function is the entry point of the parallel multilevel local diffusion
* algorithm. It uses parallel undirected diffusion followed by adaptive k-way 
* refinement. This function utilizes local coarsening.
************************************************************************************/
void ParMETIS_V3_RefineKway(idxtype *vtxdist, idxtype *xadj, idxtype *adjncy,
              idxtype *vwgt, idxtype *adjwgt, int *wgtflag, int *numflag, int *ncon, 
	      int *nparts, float *tpwgts, float *ubvec, int *options, int *edgecut, 
	      idxtype *part, MPI_Comm *comm)
{
  int h, i;
  int npes, mype;
  CtrlType ctrl;
  WorkSpaceType wspace;
  GraphType *graph;
  int tewgt, tvsize, nmoved, maxin, maxout;
  float gtewgt, gtvsize, avg, maximb;
  int ps_relation, seed, dbglvl = 0;
  int iwgtflag, inumflag, incon, inparts, ioptions[10];
  float *itpwgts, iubvec[MAXNCON];

  MPI_Comm_size(*comm, &npes);
  MPI_Comm_rank(*comm, &mype);

  /********************************/
  /* Try and take care bad inputs */
  /********************************/
  if (options != NULL && options[0] == 1)
    dbglvl = options[PMV3_OPTION_DBGLVL];
  CheckInputs(REFINE_PARTITION, npes, dbglvl, wgtflag, &iwgtflag, numflag, &inumflag,
              ncon, &incon, nparts, &inparts, tpwgts, &itpwgts, ubvec, iubvec, 
              NULL, NULL, options, ioptions, part, comm);

  /* ADD: take care of disconnected graph */
  /* ADD: take care of highly unbalanced vtxdist */
  /*********************************/
  /* Take care the nparts = 1 case */
  /*********************************/
  if (inparts <= 1) {
    idxset(vtxdist[mype+1]-vtxdist[mype], 0, part); 
    *edgecut = 0;
    return;
  }

  /**************************/
  /* Set up data structures */
  /**************************/
  if (inumflag == 1) 
    ChangeNumbering(vtxdist, xadj, adjncy, part, npes, mype, 1);

  /*****************************/
  /* Set up control structures */
  /*****************************/
  if (ioptions[0] == 1) {
    dbglvl = ioptions[PMV3_OPTION_DBGLVL];
    seed = ioptions[PMV3_OPTION_SEED];
    ps_relation = (npes == inparts) ? ioptions[PMV3_OPTION_PSR] : DISCOUPLED;
  }
  else {
    dbglvl = GLOBAL_DBGLVL;
    seed = GLOBAL_SEED;
    ps_relation = (npes == inparts) ? COUPLED : DISCOUPLED;
  }

  SetUpCtrl(&ctrl, inparts, dbglvl, *comm);
  ctrl.CoarsenTo = amin(vtxdist[npes]+1, 50*incon*amax(npes, inparts));
  ctrl.ipc_factor = 1000.0;
  ctrl.redist_factor = 1.0;
  ctrl.redist_base = 1.0;
  ctrl.seed = (seed == 0) ? mype : seed*mype;
  ctrl.sync = GlobalSEMax(&ctrl, seed);
  ctrl.partType = REFINE_PARTITION;
  ctrl.ps_relation = ps_relation;
  ctrl.tpwgts = itpwgts;

  graph = Moc_SetUpGraph(&ctrl, incon, vtxdist, xadj, vwgt, adjncy, adjwgt, &iwgtflag);
  graph->vsize = idxsmalloc(graph->nvtxs, 1, "vsize");

  graph->home = idxmalloc(graph->nvtxs, "home");
  if (ctrl.ps_relation == COUPLED)
    idxset(graph->nvtxs, mype, graph->home);
  else
    idxcopy(graph->nvtxs, part, graph->home);

  tewgt   = idxsum(graph->nedges, graph->adjwgt);
  tvsize  = idxsum(graph->nvtxs, graph->vsize);
  gtewgt  = (float) GlobalSESum(&ctrl, tewgt) + 1.0/graph->gnvtxs;
  gtvsize = (float) GlobalSESum(&ctrl, tvsize) + 1.0/graph->gnvtxs;
  ctrl.edge_size_ratio = gtewgt/gtvsize;
  scopy(incon, iubvec, ctrl.ubvec);

  PreAllocateMemory(&ctrl, graph, &wspace);

  /***********************/
  /* Partition and Remap */
  /***********************/
  IFSET(ctrl.dbglvl, DBG_TIME, InitTimers(&ctrl));
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, starttimer(ctrl.TotalTmr));

  Adaptive_Partition(&ctrl, graph, &wspace);
  ParallelReMapGraph(&ctrl, graph, &wspace);

  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));
  IFSET(ctrl.dbglvl, DBG_TIME, stoptimer(ctrl.TotalTmr));

  idxcopy(graph->nvtxs, graph->where, part);
  if (edgecut != NULL)
    *edgecut = graph->mincut;

  /***********************/
  /* Take care of output */
  /***********************/
  IFSET(ctrl.dbglvl, DBG_TIME, PrintTimingInfo(&ctrl));
  IFSET(ctrl.dbglvl, DBG_TIME, MPI_Barrier(ctrl.gcomm));

  if (ctrl.dbglvl&DBG_INFO) {
    Mc_ComputeMoveStatistics(&ctrl, graph, &nmoved, &maxin, &maxout);
    rprintf(&ctrl, "Final %3d-way Cut: %6d \tBalance: ", inparts, graph->mincut);
    avg = 0.0;
    for (h=0; h<incon; h++) {
      maximb = 0.0;
      for (i=0; i<inparts; i++)
        maximb = amax(maximb, graph->gnpwgts[i*incon+h]/itpwgts[i*incon+h]);
      avg += maximb;
      rprintf(&ctrl, "%.3f ", maximb);
    }
    rprintf(&ctrl, "\nNMoved: %d %d %d %d\n", nmoved, maxin, maxout, maxin+maxout);
  }

  /*************************************/
  /* Free memory, renumber, and return */
  /*************************************/
  GKfree((void **)&graph->lnpwgts, (void **)&graph->gnpwgts, (void **)&graph->nvwgt, (void **)(&graph->home), (void **)(&graph->vsize), LTERM);

  GKfree((void **)&itpwgts, LTERM);
  FreeInitialGraphAndRemap(graph, iwgtflag);
  FreeWSpace(&wspace);
  FreeCtrl(&ctrl);

  if (inumflag == 1)
    ChangeNumbering(vtxdist, xadj, adjncy, part, npes, mype, 0);

  return;
}