/*
 * Synchronise layer `j` across the `n` networks in `nets`.
 *
 * Sequence, as performed below:
 *   1. pull_weights() on layer j of nets[0], which acts as the base layer;
 *   2. for every other network: pull_weights() on its layer j, then
 *      merge_weights(that layer, base);
 *   3. scale_weights(base, 1./n);
 *   4. distribute_weights(layer j of every network, base).
 * cuda_set_device() is called with the owning network's gpu_index before
 * each per-network step.
 *
 * NOTE(review): the net effect is presumably "average the replicas and push
 * the average back", but that depends on what merge_weights and
 * distribute_weights do - confirm against their definitions.
 */
void sync_layer(network *nets, int n, int j)
{
    layer base = nets[0].layers[j];
    int k;

    /* Base replica: nets[0]. */
    cuda_set_device(nets[0].gpu_index);
    pull_weights(base);

    /* Fold every remaining replica into the base. */
    k = 1;
    while (k < n) {
        cuda_set_device(nets[k].gpu_index);
        layer replica = nets[k].layers[j];
        pull_weights(replica);
        merge_weights(replica, base);
        ++k;
    }

    scale_weights(base, 1./n);

    /* Hand the scaled base to every network, nets[0] included. */
    k = 0;
    while (k < n) {
        cuda_set_device(nets[k].gpu_index);
        layer target = nets[k].layers[j];
        distribute_weights(target, base);
        ++k;
    }
}
// Partition the graph held in `model` with ParMETIS and store the resulting
// part assignment in `solution` via solution->setParts().
//
// Outline:
//   1. Fetch vertex/edge lists, weights and the vertex distribution from
//      `model` and convert them to ParMETIS index types (pm_idx_t).
//   2. When running on more than one rank, compact vtxdist so that ranks
//      owning no vertices are skipped and build a sub-communicator of the
//      remaining ranks.
//   3. On ranks inside that communicator, call ParMETIS_V3_PartKway, or
//      ParMETIS_V3_RefineKway when the "partitioning_approach" parameter is
//      "repartition" or "maximize_overlap".
//   4. Convert the returned part list to part_t and hand it to `solution`.
//
// Parameters read from env->getParameters():
//   "imbalance_tolerance"   (double, default 1.1)
//   "partitioning_approach" (string)
//
// The return codes of the ParMETIS calls are not inspected here.
void AlgParMETIS<Adapter>::partition(
  const RCP<PartitioningSolution<Adapter> > &solution
)
{
  HELLO;

  size_t numGlobalParts = solution->getTargetGlobalNumberOfParts();

  int np = problemComm->getSize();

  // Get vertex info
  ArrayView<const gno_t> vtxgnos;
  ArrayView<StridedData<lno_t, scalar_t> > vwgts;
  int nVwgt = model->getNumWeightsPerVertex();
  size_t nVtx = model->getVertexList(vtxgnos, vwgts);
  // pm_nVtx is not used below.
  // NOTE(review): the conversion may exist only as a range check of nVtx
  // against pm_idx_t - confirm against TPL_Traits before removing it.
  pm_idx_t pm_nVtx;
  TPL_Traits<pm_idx_t,size_t>::ASSIGN_TPL_T(pm_nVtx, nVtx);

  pm_idx_t *pm_vwgts = NULL;
  if (nVwgt) {
    pm_vwgts = new pm_idx_t[nVtx*nVwgt];
    scale_weights(nVtx, vwgts, pm_vwgts);
  }

  // Get edge info
  ArrayView<const gno_t> adjgnos;
  ArrayView<const lno_t> offsets;
  ArrayView<StridedData<lno_t, scalar_t> > ewgts;
  int nEwgt = model->getNumWeightsPerEdge();
  size_t nEdge = model->getEdgeList(adjgnos, offsets, ewgts);

  pm_idx_t *pm_ewgts = NULL;
  if (nEwgt) {
    pm_ewgts = new pm_idx_t[nEdge*nEwgt]; 
    scale_weights(nEdge, ewgts, pm_ewgts);
  }

  // Convert index types for edges, if needed
  pm_idx_t *pm_offsets;  
  TPL_Traits<pm_idx_t,const lno_t>::ASSIGN_TPL_T_ARRAY(&pm_offsets, offsets);
  pm_idx_t *pm_adjs;  
  pm_idx_t pm_dummy_adj;
  if (nEdge)
    TPL_Traits<pm_idx_t,const gno_t>::ASSIGN_TPL_T_ARRAY(&pm_adjs, adjgnos);
  else
    pm_adjs = &pm_dummy_adj;  // ParMETIS does not like NULL pm_adjs;
    

  // Build vtxdist
  pm_idx_t *pm_vtxdist;
  ArrayView<size_t> vtxdist; 
  model->getVertexDist(vtxdist);
  TPL_Traits<pm_idx_t,size_t>::ASSIGN_TPL_T_ARRAY(&pm_vtxdist, vtxdist);

  // ParMETIS does not like processors having no vertices.
  // Inspect vtxdist and remove from communicator procs that have no vertices
  RCP<Comm<int> > subcomm;
  MPI_Comm mpicomm;  // Note:  mpicomm is valid only while subcomm is in scope

  if (np > 1) {
    // Compact pm_vtxdist in place: entry nKeep receives the start offset of
    // the nKeep-th rank that owns at least one vertex.
    int nKeep = 0;
    Array<int> keepRanks(np);
    for (int i = 0; i < np; i++) {
      if ((pm_vtxdist[i+1] - pm_vtxdist[i]) > 0) {
        keepRanks[nKeep] = i;
        pm_vtxdist[nKeep] = pm_vtxdist[i];
        nKeep++;
      }
    }
    pm_vtxdist[nKeep] = pm_vtxdist[np];
    if (nKeep < np) {
      subcomm = problemComm->createSubcommunicator(keepRanks.view(0,nKeep));
      if (subcomm != Teuchos::null) 
        mpicomm = Teuchos::getRawMpiComm(*subcomm);
      else 
        mpicomm = MPI_COMM_NULL;
    }
    else {
      mpicomm = Teuchos::getRawMpiComm(*problemComm);
    }
  }
  else {
    mpicomm = Teuchos::getRawMpiComm(*problemComm);
  }

  // Create array for ParMETIS to return results in.
  pm_idx_t *pm_partList = NULL;
  if (nVtx) pm_partList = new pm_idx_t[nVtx];

  if (mpicomm != MPI_COMM_NULL) {
    // If in ParMETIS' communicator (i.e., have vertices), call ParMETIS

    // Get target part sizes; layout is pm_partsizes[part*pm_nCon + dim].
    pm_idx_t pm_nCon = (nVwgt == 0 ? 1 : pm_idx_t(nVwgt));
    pm_real_t *pm_partsizes = new pm_real_t[numGlobalParts*pm_nCon];
    for (pm_idx_t dim = 0; dim < pm_nCon; dim++) {
      if (!solution->criteriaHasUniformPartSizes(dim))
        for (size_t i=0; i<numGlobalParts; i++)
          pm_partsizes[i*pm_nCon+dim] = 
                       pm_real_t(solution->getCriteriaPartSize(dim,i));
      else
        for (size_t i=0; i<numGlobalParts; i++)
          pm_partsizes[i*pm_nCon+dim] = pm_real_t(1.)/pm_real_t(numGlobalParts);
    }

    // Get imbalance tolerances
    double tolerance = 1.1;
    const Teuchos::ParameterList &pl = env->getParameters();
    const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
    if (pe) tolerance = pe->getValue<double>(&tolerance);

    // The same tolerance is applied to every constraint.
    pm_real_t *pm_imbTols = new pm_real_t[pm_nCon];
    for (pm_idx_t dim = 0; dim < pm_nCon; dim++)
      pm_imbTols[dim] = pm_real_t(tolerance);

    std::string parmetis_method("PARTKWAY");
    pe = pl.getEntryPtr("partitioning_approach");
    if (pe){
      std::string approach;
      approach = pe->getValue<std::string>(&approach);
      if ((approach == "repartition") || (approach == "maximize_overlap"))
        parmetis_method = "REFINE_KWAY";
      // TODO:  AdaptiveRepart
    }

    // Other ParMETIS parameters?
    pm_idx_t pm_wgtflag = 2*(nVwgt > 0) + (nEwgt > 0);
    pm_idx_t pm_numflag = 0;
    pm_idx_t pm_edgecut = -1;
    pm_idx_t pm_options[METIS_NOPTIONS];
    // Zero everything first, THEN set the non-default entries.  Setting
    // pm_options[0] before a zeroing loop that starts at index 0 would
    // silently reset it to 0 and make ParMETIS ignore pm_options[2].
    for (int i = 0; i < METIS_NOPTIONS; i++) 
      pm_options[i] = 0; // Default options
    pm_options[0] = 1;   // Use non-default options for some ParMETIS options
    pm_options[2] = 15;  // Matches default value used in Zoltan
  
    pm_idx_t pm_nPart;
    TPL_Traits<pm_idx_t,size_t>::ASSIGN_TPL_T(pm_nPart, numGlobalParts);

    if (parmetis_method == "PARTKWAY") {
      ParMETIS_V3_PartKway(pm_vtxdist, pm_offsets, pm_adjs, pm_vwgts, pm_ewgts,
                           &pm_wgtflag, &pm_numflag, &pm_nCon, &pm_nPart,
                           pm_partsizes, pm_imbTols, pm_options,
                           &pm_edgecut, pm_partList, &mpicomm);
    }
    else if (parmetis_method == "ADAPTIVE_REPART") {
      // Not selected by any code path above; kept as a placeholder.
      // Get object sizes:  pm_vsize
      std::cout << "NOT READY FOR ADAPTIVE_REPART YET; NEED VSIZE" << std::endl;
      exit(-1);
      //pm_real_t itr = 100.;  // Same default as in Zoltan
      //ParMETIS_V3_AdaptiveRepart(pm_vtxdist, pm_offsets, pm_adjs, pm_vwgts,
      //                           pm_vsize, pm_ewgts, &pm_wgtflag,
      //                           &pm_numflag, &pm_nCon, &pm_nPart,
      //                           pm_partsizes, pm_imbTols,
      //                           &itr, pm_options,
      //                           &pm_edgecut, pm_partList, &mpicomm);
    }
    else if (parmetis_method == "REFINE_KWAY") {
      ParMETIS_V3_RefineKway(pm_vtxdist, pm_offsets, pm_adjs, pm_vwgts,
                             pm_ewgts,
                             &pm_wgtflag, &pm_numflag, &pm_nCon, &pm_nPart,
                             pm_partsizes, pm_imbTols,
                             pm_options, &pm_edgecut, pm_partList, &mpicomm);
    }

    // Clean up 
    delete [] pm_partsizes;
    delete [] pm_imbTols;
  }

  // Load answer into the solution.

  ArrayRCP<part_t> partList;
  if (nVtx) {
    if (TPL_Traits<pm_idx_t, part_t>::OK_TO_CAST_TPL_T()) {
      // partList takes ownership of pm_partList (owns-memory flag is true).
      partList = ArrayRCP<part_t>((part_t *)pm_partList, 0, nVtx, true);
    }
    else {
      // TODO Probably should have a TPL_Traits function to do the following
      partList = ArrayRCP<part_t>(new part_t[nVtx], 0, nVtx, true);
      for (size_t i = 0; i < nVtx; i++) {
        partList[i] = part_t(pm_partList[i]);
      }
      delete [] pm_partList;
    }
  }

  solution->setParts(partList);

  env->memory("Zoltan2-ParMETIS: After creating solution");

  // Clean up copies made due to differing data sizes.
  TPL_Traits<pm_idx_t,size_t>::DELETE_TPL_T_ARRAY(&pm_vtxdist);
  TPL_Traits<pm_idx_t,const lno_t>::DELETE_TPL_T_ARRAY(&pm_offsets);
  if (nEdge)
    TPL_Traits<pm_idx_t,const gno_t>::DELETE_TPL_T_ARRAY(&pm_adjs);

  if (nVwgt) delete [] pm_vwgts;
  if (nEwgt) delete [] pm_ewgts;
}
// Partition the graph held in `model` with PT-Scotch and store the resulting
// part assignment in `solution` via solution->setParts().
//
// With HAVE_ZOLTAN2_MPI defined:
//   1. Build a SCOTCH_Dgraph from the model's vertex/edge lists (only the
//      first vertex weight and first edge weight are passed to Scotch).
//   2. Build a weighted complete-graph target architecture whose per-part
//      goal sizes come from the solution's part sizes for criterion 0.
//   3. Call SCOTCH_dgraphMap and copy (or alias) the result into the
//      ArrayRCP handed to `solution`.
// Each Scotch call's return code is checked through
// env->globalInputAssertion().
//
// Without HAVE_ZOLTAN2_MPI every vertex is assigned to part 0; Scotch is not
// called.
void AlgPTScotch<Adapter>::partition(
  const RCP<PartitioningSolution<Adapter> > &solution
)
{
  HELLO;

  size_t numGlobalParts = solution->getTargetGlobalNumberOfParts();

  SCOTCH_Num partnbr=0;
  TPL_Traits<SCOTCH_Num, size_t>::ASSIGN_TPL_T(partnbr, numGlobalParts, env);

#ifdef HAVE_ZOLTAN2_MPI
  int ierr = 0;
  int me = problemComm->getRank();

  const SCOTCH_Num  baseval = 0;  // Base value for array indexing.
                                  // GraphModel returns GNOs from base 0.

  SCOTCH_Strat stratstr;          // Strategy string
                                  // TODO:  Set from parameters
  SCOTCH_stratInit(&stratstr);

  // Allocate and initialize PTScotch Graph data structure.
  // NOTE(review): `mpicomm` is not declared in this function; presumably a
  // class member - confirm.
  SCOTCH_Dgraph *gr = SCOTCH_dgraphAlloc();  // Scotch distributed graph
  ierr = SCOTCH_dgraphInit(gr, mpicomm);

  env->globalInputAssertion(__FILE__, __LINE__, "SCOTCH_dgraphInit", 
    !ierr, BASIC_ASSERTION, problemComm);

  // Get vertex info
  ArrayView<const gno_t> vtxID;
  ArrayView<StridedData<lno_t, scalar_t> > xyz;
  ArrayView<StridedData<lno_t, scalar_t> > vwgts;
  size_t nVtx = model->getVertexList(vtxID, xyz, vwgts);
  SCOTCH_Num vertlocnbr=0;
  TPL_Traits<SCOTCH_Num, size_t>::ASSIGN_TPL_T(vertlocnbr, nVtx, env);
  SCOTCH_Num vertlocmax = vertlocnbr; // Assumes no holes in global nums.

  // Get edge info
  ArrayView<const gno_t> edgeIds;
  ArrayView<const int>   procIds;
  ArrayView<const lno_t> offsets;
  ArrayView<StridedData<lno_t, scalar_t> > ewgts;

  size_t nEdge = model->getEdgeList(edgeIds, procIds, offsets, ewgts);

  SCOTCH_Num edgelocnbr=0;
  TPL_Traits<SCOTCH_Num, size_t>::ASSIGN_TPL_T(edgelocnbr, nEdge, env);
  const SCOTCH_Num edgelocsize = edgelocnbr;  // Assumes adj array is compact.

  SCOTCH_Num *vertloctab;  // starting adj/vtx
  TPL_Traits<SCOTCH_Num, lno_t>::ASSIGN_TPL_T_ARRAY(&vertloctab, offsets, env);

  SCOTCH_Num *edgeloctab;  // adjacencies
  TPL_Traits<SCOTCH_Num, gno_t>::ASSIGN_TPL_T_ARRAY(&edgeloctab, edgeIds, env);

  // We don't use these arrays, but we need them as arguments to Scotch.
  SCOTCH_Num *vendloctab = NULL;  // Assume consecutive storage for adj
  SCOTCH_Num *vlblloctab = NULL;  // Vertex label array
  SCOTCH_Num *edgegsttab = NULL;  // Array for ghost vertices

  // Get weight info.
  SCOTCH_Num *velotab = NULL;  // Vertex weights
  SCOTCH_Num *edlotab = NULL;  // Edge weights

  int nVwgts = model->getNumWeightsPerVertex();
  int nEwgts = model->getNumWeightsPerEdge();
  if (nVwgts > 1 && me == 0) {
    std::cerr << "Warning:  NumWeightsPerVertex is " << nVwgts 
              << " but Scotch allows only one weight. "
              << " Zoltan2 will use only the first weight per vertex."
              << std::endl;
  }
  if (nEwgts > 1 && me == 0) {
    std::cerr << "Warning:  NumWeightsPerEdge is " << nEwgts 
              << " but Scotch allows only one weight. "
              << " Zoltan2 will use only the first weight per edge."
              << std::endl;
  }

  if (nVwgts) {
    velotab = new SCOTCH_Num[nVtx+1];  // +1 since Scotch wants all procs 
                                       // to have non-NULL arrays
    scale_weights(nVtx, vwgts[0], velotab);
  }

  if (nEwgts) {
    edlotab = new SCOTCH_Num[nEdge+1];  // +1 since Scotch wants all procs 
                                         // to have non-NULL arrays
    scale_weights(nEdge, ewgts[0], edlotab);
  }

  // Build PTScotch distributed data structure
  ierr = SCOTCH_dgraphBuild(gr, baseval, vertlocnbr, vertlocmax,
                            vertloctab, vendloctab, velotab, vlblloctab,
                            edgelocnbr, edgelocsize,
                            edgeloctab, edgegsttab, edlotab);

  env->globalInputAssertion(__FILE__, __LINE__, "SCOTCH_dgraphBuild", 
    !ierr, BASIC_ASSERTION, problemComm);

  // Create array for Scotch to return results in.
  ArrayRCP<part_t> partList(new part_t[nVtx], 0, nVtx,true);
  SCOTCH_Num *partloctab = NULL;
  if (nVtx && (sizeof(SCOTCH_Num) == sizeof(part_t))) {
    // Can write directly into the solution's memory
    partloctab = (SCOTCH_Num *) partList.getRawPtr();
  }
  else {
    // Can't use solution memory directly; will have to copy later.
    // Note:  Scotch does not like NULL arrays, so add 1 to always have non-null.
    //        ParMETIS has this same "feature."  See Zoltan bug 4299.
    partloctab = new SCOTCH_Num[nVtx+1];
  }

  // Get target part sizes (criterion 0 only)
  float *partsizes = new float[numGlobalParts];
  if (!solution->criteriaHasUniformPartSizes(0))
    for (size_t i=0; i<numGlobalParts; i++)
      partsizes[i] = solution->getCriteriaPartSize(0, i);
  else
    for (size_t i=0; i<numGlobalParts; i++)
      partsizes[i] = 1.0 / float(numGlobalParts);

  // Allocate and initialize PTScotch target architecture data structure
  SCOTCH_Arch archdat;
  SCOTCH_archInit(&archdat);

  SCOTCH_Num velosum = 0;
  SCOTCH_dgraphSize (gr, &velosum, NULL, NULL, NULL);
  SCOTCH_Num *goalsizes = new SCOTCH_Num[partnbr];
  // TODO: The goalsizes are set as in Zoltan; not sure it is correct there 
  // or here.
  // It appears velosum is global NUMBER of vertices, not global total 
  // vertex weight.  I think we should use the latter.
  // Fix this when we add vertex weights.
  for (SCOTCH_Num i = 0; i < partnbr; i++)
    goalsizes[i] = SCOTCH_Num(ceil(velosum * partsizes[i]));
  delete [] partsizes;

  SCOTCH_archCmpltw(&archdat, partnbr, goalsizes);

  // Call partitioning; result returned in partloctab.
  ierr = SCOTCH_dgraphMap(gr, &archdat, &stratstr, partloctab);

  env->globalInputAssertion(__FILE__, __LINE__, "SCOTCH_dgraphMap", 
    !ierr, BASIC_ASSERTION, problemComm);

  SCOTCH_archExit(&archdat);
  delete [] goalsizes;

  // TODO - metrics

  // `me` was already obtained from problemComm at the top of this branch.
  // A second `int me` declaration guarded by SHOW_ZOLTAN2_SCOTCH_MEMORY used
  // to sit here; it redeclared the variable in the same scope and broke the
  // build whenever that macro was defined, so it has been removed.

#ifdef HAVE_SCOTCH_ZOLTAN2_GETMEMORYMAX
  if (me == 0){
    size_t scotchBytes = SCOTCH_getMemoryMax();
    std::cout << "Rank " << me << ": Maximum bytes used by Scotch: ";
    std::cout << scotchBytes << std::endl;
  }
#endif

  // Clean up PTScotch
  SCOTCH_dgraphExit(gr);
  free(gr);
  SCOTCH_stratExit(&stratstr);

  // Load answer into the solution.

  // Mirrors the allocation decision above: a separate partloctab buffer
  // exists exactly when the sizes differ or nVtx == 0.
  if ((sizeof(SCOTCH_Num) != sizeof(part_t)) || (nVtx == 0)) {
    for (size_t i = 0; i < nVtx; i++) partList[i] = partloctab[i];
    delete [] partloctab;
  }

  solution->setParts(partList);

  env->memory("Zoltan2-Scotch: After creating solution");

  // Clean up copies made due to differing data sizes.
  TPL_Traits<SCOTCH_Num, lno_t>::DELETE_TPL_T_ARRAY(&vertloctab);
  TPL_Traits<SCOTCH_Num, gno_t>::DELETE_TPL_T_ARRAY(&edgeloctab);

  if (nVwgts) delete [] velotab;
  if (nEwgts) delete [] edlotab;

#else // DO NOT HAVE_MPI

  // TODO:  Handle serial case with calls to Scotch.
  // TODO:  For now, assign everything to rank 0 and assume only one part.
  // TODO:  Can probably use the code above for loading solution,
  // TODO:  instead of duplicating it here.
  // TODO
  // TODO:  Actual logic should call Scotch when number of processes == 1.
  ArrayView<const gno_t> vtxID;
  ArrayView<StridedData<lno_t, scalar_t> > xyz;
  ArrayView<StridedData<lno_t, scalar_t> > vwgts;
  size_t nVtx = model->getVertexList(vtxID, xyz, vwgts);

  ArrayRCP<part_t> partList(new part_t[nVtx], 0, nVtx, true);
  for (size_t i = 0; i < nVtx; i++) partList[i] = 0;

  solution->setParts(partList);

#endif // DO NOT HAVE_MPI
}
// Partition the graph held in `model` with ParMETIS and store the resulting
// part assignment in `solution` via solution->setParts().
//
// Outline:
//   1. Fetch vertex/edge lists, weights and the vertex distribution from
//      `model` and convert them to ParMETIS index types (pm_idx_t).
//   2. When running on more than one rank, compact vtxdist so that ranks
//      owning no vertices are skipped and build a sub-communicator of the
//      remaining ranks.
//   3. On ranks inside that communicator, call ParMETIS_V3_PartKway by
//      default.  When "partitioning_approach" is "repartition" or
//      "maximize_overlap", call ParMETIS_V3_AdaptiveRepart if np > 1 and
//      ParMETIS_V3_RefineKway otherwise.
//   4. Convert the returned part list to part_t and hand it to `solution`.
//
// Parameters read from env->getParameters():
//   "imbalance_tolerance"   (double, default 1.1; values <= 1.0 are replaced
//                            by 1.01 with a warning printed on rank 0)
//   "partitioning_approach" (string)
//
// Throws std::runtime_error, after all local cleanup has run, when the
// ParMETIS call returned something other than METIS_OK.
void AlgParMETIS<Adapter>::partition(
    const RCP<PartitioningSolution<Adapter> > &solution
)
{
    HELLO;

    size_t numGlobalParts = solution->getTargetGlobalNumberOfParts();

    int me = problemComm->getRank();
    int np = problemComm->getSize();

    // Get vertex info
    ArrayView<const gno_t> vtxgnos;
    ArrayView<StridedData<lno_t, scalar_t> > vwgts;
    int nVwgt = model->getNumWeightsPerVertex();
    size_t nVtx = model->getVertexList(vtxgnos, vwgts);
    // pm_nVtx is not used below.
    // NOTE(review): the conversion may exist only as a range check of nVtx
    // against pm_idx_t - confirm against TPL_Traits before removing it.
    pm_idx_t pm_nVtx;
    TPL_Traits<pm_idx_t,size_t>::ASSIGN(pm_nVtx, nVtx);

    pm_idx_t *pm_vwgts = NULL;
    if (nVwgt) {
        pm_vwgts = new pm_idx_t[nVtx*nVwgt];
        scale_weights(nVtx, vwgts, pm_vwgts);
    }

    // Get edge info
    ArrayView<const gno_t> adjgnos;
    ArrayView<const lno_t> offsets;
    ArrayView<StridedData<lno_t, scalar_t> > ewgts;
    int nEwgt = model->getNumWeightsPerEdge();
    size_t nEdge = model->getEdgeList(adjgnos, offsets, ewgts);

    pm_idx_t *pm_ewgts = NULL;
    if (nEwgt) {
        pm_ewgts = new pm_idx_t[nEdge*nEwgt];
        scale_weights(nEdge, ewgts, pm_ewgts);
    }

    // Convert index types for edges, if needed
    pm_idx_t *pm_offsets;
    TPL_Traits<pm_idx_t,const lno_t>::ASSIGN_ARRAY(&pm_offsets, offsets);
    pm_idx_t *pm_adjs;
    pm_idx_t pm_dummy_adj;
    if (nEdge)
        TPL_Traits<pm_idx_t,const gno_t>::ASSIGN_ARRAY(&pm_adjs, adjgnos);
    else
        pm_adjs = &pm_dummy_adj;  // ParMETIS does not like NULL pm_adjs;


    // Build vtxdist
    pm_idx_t *pm_vtxdist;
    ArrayView<size_t> vtxdist;
    model->getVertexDist(vtxdist);
    TPL_Traits<pm_idx_t,size_t>::ASSIGN_ARRAY(&pm_vtxdist, vtxdist);

    // ParMETIS does not like processors having no vertices.
    // Inspect vtxdist and remove from communicator procs that have no vertices
    RCP<Comm<int> > subcomm;
    MPI_Comm mpicomm;  // Note:  mpicomm is valid only while subcomm is in scope

    if (np > 1) {
        // Compact pm_vtxdist in place: entry nKeep receives the start offset
        // of the nKeep-th rank that owns at least one vertex.
        int nKeep = 0;
        Array<int> keepRanks(np);
        for (int i = 0; i < np; i++) {
            if ((pm_vtxdist[i+1] - pm_vtxdist[i]) > 0) {
                keepRanks[nKeep] = i;
                pm_vtxdist[nKeep] = pm_vtxdist[i];
                nKeep++;
            }
        }
        pm_vtxdist[nKeep] = pm_vtxdist[np];
        if (nKeep < np) {
            subcomm = problemComm->createSubcommunicator(keepRanks.view(0,nKeep));
            if (subcomm != Teuchos::null)
                mpicomm = Teuchos::getRawMpiComm(*subcomm);
            else
                mpicomm = MPI_COMM_NULL;
        }
        else {
            mpicomm = Teuchos::getRawMpiComm(*problemComm);
        }
    }
    else {
        mpicomm = Teuchos::getRawMpiComm(*problemComm);
    }

    // Create array for ParMETIS to return results in.
    pm_idx_t *pm_partList = NULL;
    if (nVtx) pm_partList = new pm_idx_t[nVtx];
    for (size_t i = 0; i < nVtx; i++) pm_partList[i] = 0;
    int pm_return = METIS_OK;

    if (mpicomm != MPI_COMM_NULL) {
        // If in ParMETIS' communicator (i.e., have vertices), call ParMETIS

        // Get target part sizes; layout is pm_partsizes[part*pm_nCon + dim].
        pm_idx_t pm_nCon = (nVwgt == 0 ? 1 : pm_idx_t(nVwgt));
        pm_real_t *pm_partsizes = new pm_real_t[numGlobalParts*pm_nCon];
        for (pm_idx_t dim = 0; dim < pm_nCon; dim++) {
            if (!solution->criteriaHasUniformPartSizes(dim))
                for (size_t i=0; i<numGlobalParts; i++)
                    pm_partsizes[i*pm_nCon+dim] =
                        pm_real_t(solution->getCriteriaPartSize(dim,i));
            else
                for (size_t i=0; i<numGlobalParts; i++)
                    pm_partsizes[i*pm_nCon+dim] = pm_real_t(1.)/pm_real_t(numGlobalParts);
        }

        // Get imbalance tolerances
        double tolerance = 1.1;
        const Teuchos::ParameterList &pl = env->getParameters();
        const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
        if (pe) tolerance = pe->getValue<double>(&tolerance);

        // ParMETIS requires tolerance to be greater than 1.0;
        // fudge it if condition is not met
        if (tolerance <= 1.0) {
            if (me == 0)
                std::cerr << "Warning:  ParMETIS requires imbalance_tolerance > 1.0; "
                          << "to comply, Zoltan2 reset imbalance_tolerance to 1.01."
                          << std::endl;
            tolerance = 1.01;
        }

        // The same tolerance is applied to every constraint.
        pm_real_t *pm_imbTols = new pm_real_t[pm_nCon];
        for (pm_idx_t dim = 0; dim < pm_nCon; dim++)
            pm_imbTols[dim] = pm_real_t(tolerance);

        std::string parmetis_method("PARTKWAY");
        pe = pl.getEntryPtr("partitioning_approach");
        if (pe) {
            std::string approach;
            approach = pe->getValue<std::string>(&approach);
            if ((approach == "repartition") || (approach == "maximize_overlap")) {
                if (np > 1)
                    // ParMETIS_V3_AdaptiveRepart requires two or more processors
                    parmetis_method = "ADAPTIVE_REPART";
                else
                    parmetis_method = "REFINE_KWAY";
            }
        }

        // Other ParMETIS parameters?
        pm_idx_t pm_wgtflag = 2*(nVwgt > 0) + (nEwgt > 0);
        pm_idx_t pm_numflag = 0;
        pm_idx_t pm_edgecut = -1;
        pm_idx_t pm_options[METIS_NOPTIONS];
        // Zero everything first, THEN set the non-default entries.  Setting
        // pm_options[0] before a zeroing loop that starts at index 0 would
        // silently reset it to 0 and make ParMETIS ignore pm_options[2].
        for (int i = 0; i < METIS_NOPTIONS; i++)
            pm_options[i] = 0; // Default options
        pm_options[0] = 1;   // Use non-default options for some ParMETIS options
        pm_options[2] = 15;  // Matches default value used in Zoltan
        // NOTE(review): with pm_options[0] != 0, ParMETIS_V3_AdaptiveRepart
        // presumably also reads pm_options[3] (sub-domain/processor coupling),
        // which is left at 0 here - confirm against the ParMETIS manual.

        pm_idx_t pm_nPart;
        TPL_Traits<pm_idx_t,size_t>::ASSIGN(pm_nPart, numGlobalParts);

        if (parmetis_method == "PARTKWAY") {
            pm_return = ParMETIS_V3_PartKway(pm_vtxdist, pm_offsets, pm_adjs,
                                             pm_vwgts, pm_ewgts, &pm_wgtflag,
                                             &pm_numflag, &pm_nCon, &pm_nPart,
                                             pm_partsizes, pm_imbTols, pm_options,
                                             &pm_edgecut, pm_partList, &mpicomm);
        }
        else if (parmetis_method == "ADAPTIVE_REPART") {
            // Get object sizes:  pm_vsize
            // TODO:  get pm_vsize info from input adapter or graph model
            // TODO:  This is just a placeholder
            pm_idx_t *pm_vsize = new pm_idx_t[nVtx];
            for (size_t i = 0; i < nVtx; i++) pm_vsize[i] = 1;

            pm_real_t itr = 100.;  // Same default as in Zoltan
            pm_return = ParMETIS_V3_AdaptiveRepart(pm_vtxdist, pm_offsets, pm_adjs,
                                                   pm_vwgts,
                                                   pm_vsize, pm_ewgts, &pm_wgtflag,
                                                   &pm_numflag, &pm_nCon, &pm_nPart,
                                                   pm_partsizes, pm_imbTols,
                                                   &itr, pm_options, &pm_edgecut,
                                                   pm_partList, &mpicomm);
            delete [] pm_vsize;
        }
        else if (parmetis_method == "REFINE_KWAY") {
            pm_return = ParMETIS_V3_RefineKway(pm_vtxdist, pm_offsets, pm_adjs,
                                               pm_vwgts, pm_ewgts, &pm_wgtflag,
                                               &pm_numflag, &pm_nCon, &pm_nPart,
                                               pm_partsizes, pm_imbTols, pm_options,
                                               &pm_edgecut, pm_partList, &mpicomm);
        }

        // Clean up
        delete [] pm_partsizes;
        delete [] pm_imbTols;
    }

    // Load answer into the solution.

    ArrayRCP<part_t> partList;
    if (nVtx)
        TPL_Traits<part_t, pm_idx_t>::SAVE_ARRAYRCP(&partList, pm_partList, nVtx);
    TPL_Traits<pm_idx_t, part_t>::DELETE_ARRAY(&pm_partList);

    solution->setParts(partList);

    env->memory("Zoltan2-ParMETIS: After creating solution");

    // Clean up copies made due to differing data sizes.
    TPL_Traits<pm_idx_t,size_t>::DELETE_ARRAY(&pm_vtxdist);
    TPL_Traits<pm_idx_t,const lno_t>::DELETE_ARRAY(&pm_offsets);
    if (nEdge)
        TPL_Traits<pm_idx_t,const gno_t>::DELETE_ARRAY(&pm_adjs);

    if (nVwgt) delete [] pm_vwgts;
    if (nEwgt) delete [] pm_ewgts;

    // Report failure only after every local buffer has been released.
    if (pm_return != METIS_OK) {
        throw std::runtime_error(
            "\nParMETIS returned an error; no valid partition generated.\n"
            "Look for 'PARMETIS ERROR' in your output for more details.\n");
    }
}
Exemple #5
0
/* Write the "-logP" and "bits/token" columns for one pass to summaryfp.
 * Does nothing unless summaryfp is non-NULL and debuglevel >= 1000. */
static void
io_report_neglog_prob(FILE *summaryfp, int debuglevel,
		      FLOAT neglog_prob, FLOAT yieldweights)
{
  if (!summaryfp || debuglevel < 1000)
    return;

  if (debuglevel < 5000)
    fprintf(summaryfp, "\t%g\t%g\n", neglog_prob,
	    neglog_prob/(log(2)*(yieldweights)));
  else
    fprintf(summaryfp, "-logP = %g, bits/token = %g.\n", neglog_prob,
	    neglog_prob/(log(2)*(yieldweights)));
  fflush(summaryfp);
}

/* Dump the rule counts to tracefp (debuglevel >= 10000) and the grammar to
 * summaryfp (5000 <= debuglevel < 10000), in that order. */
static void
io_dump_rule_state(grammar g, const si_t si, FILE *tracefp, FILE *summaryfp,
		   int debuglevel, FLOAT minruleprob, FLOAT *rule_counts)
{
  if (tracefp && debuglevel >= 10000) {
    write_rule_values(tracefp, g, si, rule_counts, 0);
    fprintf(tracefp, "\n");
    fflush(tracefp);
  }

  if (summaryfp && debuglevel >= 5000 && debuglevel < 10000)
    write_grammar(summaryfp, g, si, minruleprob);
}

/* Iteratively re-estimate the rule weights of grammar g.
 *
 * One call to expected_rule_counts() is made before the loop.  Each loop
 * pass then applies, in order: add_bias, set_rule_weights (with the
 * VariationalBayes flag), prune_grammar, optional jitter_weights (when
 * jitter != 0), set_rule_weights(g, g->weights, 0), and - while
 * iteration < nanneal - scale_weights(g, 1.0/temperature), where
 * temperature runs geometrically from annealstart towards annealstop.
 * expected_rule_counts() is then called again.
 *
 * The loop stops once the rule count is unchanged from the previous pass,
 * at least minits passes have run, and either maxits > 0 passes have run or
 * the relative decrease of the value returned by expected_rule_counts()
 * falls below stoptol.
 *
 * Progress is written to summaryfp / tracefp according to debuglevel.
 *
 * Returns the last value returned by expected_rule_counts() divided by
 * log(2) * sum of yield weights (labelled "bits/token" in the summary).
 */
FLOAT
inside_outside(grammar g, const si_t si, FILE *yieldfp, 
	       FILE *tracefp, FILE *summaryfp, int debuglevel,
	       int maxsentlen, int minits, int maxits,
	       FLOAT stoptol, FLOAT minruleprob,
	       FLOAT jitter, int VariationalBayes, FLOAT wordscale,
	       FLOAT annealstart, FLOAT annealstop, int nanneal,
	       int weighted_yields_flag)
{
  FLOAT *expected = CALLOC(g->nrules, sizeof(FLOAT));
  FLOAT prev_neglog;
  FLOAT cur_neglog;
  FLOAT yield_mass;
  FLOAT temperature = annealstart;
  size_t nrules_now = g->nrules;
  size_t nrules_before;
  int   iteration = 0;
  const int verbose = summaryfp && debuglevel >= 1000;

  /* Header plus the first three columns for iteration 0. */
  if (verbose) {
    if (debuglevel >= 5000)
      fprintf(summaryfp, "# Iteration %d, temperature = %g, %d rules, ",
	      iteration, temperature, (int) nrules_now);
    else
      fprintf(summaryfp, "# Iteration\ttemperature\tnrules\t-logP\tbits/token\n%d\t%g\t%d", 
	      iteration, temperature, (int) nrules_now);
    fflush(summaryfp);
  }

  prev_neglog = expected_rule_counts(g, si, yieldfp, tracefp, 
				     summaryfp, debuglevel,
				     maxsentlen, minruleprob, wordscale,
				     expected, &yield_mass,
				     weighted_yields_flag);

  io_report_neglog_prob(summaryfp, debuglevel, prev_neglog, yield_mass);
  io_dump_rule_state(g, si, tracefp, summaryfp, debuglevel, minruleprob,
		     expected);

  for (;;) {
    ++iteration;

    /* Turn the counts from the previous pass into new rule weights. */
    add_bias(g, expected);
    set_rule_weights(g, expected, VariationalBayes);
    prune_grammar(g, si, minruleprob);
    if (jitter != 0) 
      jitter_weights(g, jitter);
    set_rule_weights(g, g->weights, 0);

    if (iteration >= nanneal)
      temperature = 1.0;
    else {
      temperature = annealstart*pow(annealstop/annealstart, (iteration-1.0)/(nanneal-1.0));
      scale_weights(g, 1.0/temperature);
    }

    nrules_before = nrules_now;
    nrules_now = g->nrules;

    if (verbose) {
      if (debuglevel >= 5000)
	fprintf(summaryfp, "# Iteration %d, temperature %g, %d rules, ",
		iteration, temperature, (int) nrules_now);
      else
	fprintf(summaryfp, "%d\t%g\t%d", iteration, temperature, (int) nrules_now);
      fflush(summaryfp);
    }

    cur_neglog = expected_rule_counts(g, si, yieldfp, tracefp, summaryfp, debuglevel,
				      maxsentlen, minruleprob, wordscale,
				      expected, &yield_mass, weighted_yields_flag);

    io_report_neglog_prob(summaryfp, debuglevel, cur_neglog, yield_mass);
    io_dump_rule_state(g, si, tracefp, summaryfp, debuglevel, minruleprob,
		       expected);

    /* Stopping is only considered once the rule set is stable and the
     * minimum number of passes has run. */
    if (nrules_now == nrules_before && iteration >= minits) {
      if (maxits > 0 && iteration >= maxits)
	break;
      if ((prev_neglog-cur_neglog)/fabs(cur_neglog) < stoptol)
	break;
    }

    prev_neglog = cur_neglog;
  }

  FREE(expected);

  return(cur_neglog/(log(2)*yield_mass));
}
// Partition the graph held in `model` with ParMETIS_V3_PartKway and store
// the resulting part assignment in `solution` via solution->setParts().
//
// The vertex distribution is built here by gathering every rank's local
// vertex count and prefix-summing it.  The imbalance tolerance is fixed at
// 1.05 for every constraint and default ParMETIS options are used.  The
// return code of the ParMETIS call is not inspected.
void AlgParMETIS<Adapter>::partition(
  const RCP<PartitioningSolution<Adapter> > &solution
)
{
  HELLO;

  size_t nGlobalParts = solution->getTargetGlobalNumberOfParts();

  int nRanks = problemComm->getSize();

  // ---- Vertices ----------------------------------------------------------
  ArrayView<const gno_t> vertexIds;
  ArrayView<StridedData<lno_t, scalar_t> > coords;
  ArrayView<StridedData<lno_t, scalar_t> > vertexWeights;
  int nVwgt = model->getNumWeightsPerVertex();
  size_t nVtx = model->getVertexList(vertexIds, coords, vertexWeights);
  pm_idx_t localVtxCount;
  TPL_Traits<pm_idx_t,size_t>::ASSIGN_TPL_T(localVtxCount, nVtx, env);

  pm_idx_t *tplVertexWeights = NULL;
  if (nVwgt) {
    tplVertexWeights = new pm_idx_t[nVtx*nVwgt];
    scale_weights(nVtx, vertexWeights, tplVertexWeights);
  }

  // ---- Edges -------------------------------------------------------------
  ArrayView<const gno_t> neighborIds;
  ArrayView<const int>   neighborRanks;
  ArrayView<const lno_t> adjOffsets;
  ArrayView<StridedData<lno_t, scalar_t> > edgeWeights;
  int nEwgt = model->getNumWeightsPerEdge();
  size_t nEdge = model->getEdgeList(neighborIds, neighborRanks, adjOffsets,
                                    edgeWeights);

  pm_idx_t *tplEdgeWeights = NULL;
  if (nEwgt) {
    tplEdgeWeights = new pm_idx_t[nEdge*nEwgt];
    scale_weights(nEdge, edgeWeights, tplEdgeWeights);
  }

  // Index arrays in the type ParMETIS expects.
  pm_idx_t *tplOffsets;
  TPL_Traits<pm_idx_t,lno_t>::ASSIGN_TPL_T_ARRAY(&tplOffsets, adjOffsets, env);
  pm_idx_t *tplAdjacency;
  TPL_Traits<pm_idx_t,gno_t>::ASSIGN_TPL_T_ARRAY(&tplAdjacency, neighborIds,
                                                 env);

  // ---- Vertex distribution -----------------------------------------------
  // Slot r+1 first receives rank r's local vertex count; the running sum
  // below turns the counts into cumulative offsets.
  pm_idx_t *tplVtxDist = new pm_idx_t[nRanks+1];
  tplVtxDist[0] = 0;
  Teuchos::gatherAll(*problemComm, 1, &localVtxCount, nRanks,
                     &(tplVtxDist[1]));
  for (int r = 1; r < nRanks; ++r)
    tplVtxDist[r+1] += tplVtxDist[r];

  // Result buffer for ParMETIS.
  // Note:  ParMETIS does not like NULL arrays,
  //        so add 1 to always have non-null.
  //        See Zoltan bug 4299.
  pm_idx_t *tplParts = new pm_idx_t[nVtx+1];

  // ---- Target part sizes and imbalance tolerances ------------------------
  pm_idx_t nConstraints = 1;
  if (nVwgt != 0)
    nConstraints = pm_idx_t(nVwgt);

  // Layout: targetSizes[part*nConstraints + dim].
  pm_real_t *targetSizes = new pm_real_t[nGlobalParts*nConstraints];
  for (pm_idx_t dim = 0; dim < nConstraints; ++dim) {
    const bool uniform = solution->criteriaHasUniformPartSizes(dim);
    for (size_t p = 0; p < nGlobalParts; ++p) {
      if (uniform)
        targetSizes[p*nConstraints+dim] =
          pm_real_t(1.) / pm_real_t(nGlobalParts);
      else
        targetSizes[p*nConstraints+dim] =
          pm_real_t(solution->getCriteriaPartSize(dim,p));
    }
  }

  pm_real_t *imbalanceTols = new pm_real_t[nConstraints];
  for (pm_idx_t dim = 0; dim < nConstraints; ++dim)
    imbalanceTols[dim] = 1.05;  // TODO:  GET THE PARAMETER

  std::string method("PARTKWAY");
  pm_idx_t weightFlag = (nVwgt > 0 ? 2 : 0) + (nEwgt > 0 ? 1 : 0);
  pm_idx_t numberingFlag = 0;

  pm_idx_t nPartsTpl;
  TPL_Traits<pm_idx_t,size_t>::ASSIGN_TPL_T(nPartsTpl, nGlobalParts, env);

  // ---- Dispatch ----------------------------------------------------------
  if (method == "PARTKWAY") {

    pm_idx_t edgeCut = -1;
    // [0] = 0: use default options; [1] debug level and [2] seed are
    // ignored while [0] == 0.
    pm_idx_t tplOptions[3] = { 0, 0, 0 };

    ParMETIS_V3_PartKway(tplVtxDist, tplOffsets, tplAdjacency,
                         tplVertexWeights, tplEdgeWeights,
                         &weightFlag, &numberingFlag, &nConstraints,
                         &nPartsTpl, targetSizes, imbalanceTols, tplOptions,
                         &edgeCut, tplParts, &mpicomm);
  }
  else if (method == "ADAPTIVE_REPART") {
    // Would need object sizes.
    std::cout << "NOT READY FOR ADAPTIVE_REPART YET" << std::endl;
    exit(-1);
  }
  else if (method == "PART_GEOM") {
    // Would need coordinate info, too.
    std::cout << "NOT READY FOR PART_GEOM YET" << std::endl;
    exit(-1);
  }

  // Release the ParMETIS-only inputs.
  delete [] tplVtxDist;
  delete [] targetSizes;
  delete [] imbalanceTols;

  // ---- Load answer into the solution -------------------------------------
  ArrayRCP<part_t> partList;
  if (!TPL_Traits<pm_idx_t, part_t>::OK_TO_CAST_TPL_T()) {
    // TODO Probably should have a TPL_Traits function to do the following
    partList = ArrayRCP<part_t>(new part_t[nVtx], 0, nVtx, true);
    for (size_t v = 0; v < nVtx; ++v)
      partList[v] = part_t(tplParts[v]);
    delete [] tplParts;
  }
  else {
    // Same representation: partList takes ownership of the ParMETIS buffer.
    partList = ArrayRCP<part_t>((part_t *)tplParts, 0, nVtx, true);
  }

  solution->setParts(partList);

  env->memory("Zoltan2-ParMETIS: After creating solution");

  // Clean up copies made due to differing data sizes.
  TPL_Traits<pm_idx_t,lno_t>::DELETE_TPL_T_ARRAY(&tplOffsets);
  TPL_Traits<pm_idx_t,gno_t>::DELETE_TPL_T_ARRAY(&tplAdjacency);

  if (nVwgt) delete [] tplVertexWeights;
  if (nEwgt) delete [] tplEdgeWeights;
}