Example #1
static void SendSpanningChildren(int size, char *msg, int rankToAssign, int startNode) {
#if CMK_BROADCAST_SPANNING_TREE
    int i, oldRank;
    char *newmsg;

    oldRank = CMI_DEST_RANK(msg);
    /* set the destination rank once here so we do not reassign it on every iteration of the loop below */
    CMI_DEST_RANK(msg) = rankToAssign;
    /* first send msgs to other nodes */
    CmiAssert(startNode >=0 &&  startNode<CmiNumNodes());
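    /* Illustrative example (assuming BROADCAST_SPANNING_FACTOR == 4): with 9
     * nodes rooted at startNode, relative node 0 sends to relative nodes 1-4,
     * relative node 1 sends to 5-8, and every other node falls out of the loop
     * immediately because BROADCAST_SPANNING_FACTOR*nd + 1 already exceeds
     * CmiNumNodes()-1. */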
    for (i=1; i<=BROADCAST_SPANNING_FACTOR; i++) {
        int nd = CmiMyNode()-startNode;
        if (nd<0) nd+=CmiNumNodes();
        nd = BROADCAST_SPANNING_FACTOR*nd + i;
        if (nd > CmiNumNodes() - 1) break;
        nd += startNode;
        nd = nd%CmiNumNodes();
        CmiAssert(nd>=0 && nd!=CmiMyNode());
#if CMK_BROADCAST_USE_CMIREFERENCE
        CmiReference(msg);
        CmiSendNetworkFunc(CmiNodeFirst(nd), size, msg, BCAST_SYNC);
#else
        newmsg = CopyMsg(msg, size);
        CmiSendNetworkFunc(CmiNodeFirst(nd), size, newmsg, BCAST_SYNC);
#endif
    }
    CMI_DEST_RANK(msg) = oldRank;
#endif
}
Example #2
void Communicate::sendMessage(int PE, void *msg, int size)
{
  if ( CmiMyPe() ) NAMD_bug("Communicate::sendMessage not from Pe 0");

  while ( CkpvAccess(CsmAcks) < nchildren ) {
    CmiDeliverMsgs(0);
  }
  CkpvAccess(CsmAcks) = 0;

  CmiSetHandler(msg, CsmHandlerIndex);
  switch(PE) {
    case ALL:
      NAMD_bug("Unexpected Communicate::sendMessage(ALL,...)");
      //CmiSyncBroadcastAll(size, (char *)msg);
      break;
    case ALLBUTME:
      //CmiSyncBroadcast(size, (char *)msg);
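      /* Rather than broadcasting, send only to this node's children in a
       * binary tree over nodes (nodes 1 and 2 for the root); getMessage() on
       * each receiving node then forwards the message to its own children. */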
      if ( CmiNumNodes() > 2 ) {
        CmiSyncSend(CmiNodeFirst(2),size,(char*)msg);
      }
      if ( CmiNumNodes() > 1 ) {
        CmiSyncSend(CmiNodeFirst(1),size,(char*)msg);
      }
      break;
    default:
      NAMD_bug("Unexpected Communicate::sendMessage(PEL,...)");
      //CmiSyncSend(PE, size, (char *)msg);
      break;
  }
}
Example #3
void *Communicate::getMessage(int PE, int tag)
{
  if ( CmiMyRank() ) NAMD_bug("Communicate::getMessage called on non-rank-zero Pe\n");

  int itag[2], rtag[2];
  void *msg;

  itag[0] = (PE==(-1)) ? (CmmWildCard) : PE;
  itag[1] = (tag==(-1)) ? (CmmWildCard) : tag;
  while((msg=CmmGet(CkpvAccess(CsmMessages),2,itag,rtag))==0) {
    CmiDeliverMsgs(0);
  }

  char *ackmsg = (char *) CmiAlloc(CmiMsgHeaderSizeBytes);
  CmiSetHandler(ackmsg, CsmAckHandlerIndex);
  CmiSyncSend(CmiNodeFirst((CmiMyNode()-1)/2), CmiMsgHeaderSizeBytes, ackmsg);

  while ( CkpvAccess(CsmAcks) < nchildren ) {
    CmiDeliverMsgs(0);
  }
  CkpvAccess(CsmAcks) = 0;

  int size = SIZEFIELD(msg);
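  /* Forward the message down the implicit binary tree: node n's children are
     nodes 2n+1 and 2n+2 (the ack above went to the parent node (n-1)/2). */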
  for ( int i = 2; i >= 1; --i ) {
    int node = CmiMyNode() * 2 + i;
    if ( node < CmiNumNodes() ) {
      CmiSyncSend(CmiNodeFirst(node),size,(char*)msg);
    }
  }

  return msg;
}
Example #4
void BGQTorusManager::populateLocalNodes() {
  if(CmiNumPartitions() == 1) return;

  CmiLock(bgq_lock);
  if(bgq_isLocalSet) {
    CmiUnlock(bgq_lock);
    return;
  }
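  /* Build a table mapping every global node id to the node's index within
     this partition; nodes outside the partition keep the value -1. */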

  if(bgq_localNodes == NULL)
    bgq_localNodes = (int *)malloc(CmiNumNodesGlobal()*sizeof(int));

  CmiAssert(bgq_localNodes != NULL);

  for(int i = 0; i < CmiNumNodesGlobal(); i++)
    bgq_localNodes[i] = -1;

  for(int i = 0; i < CmiNumNodes(); i++) {
    int a, b, c, d, e, t;
    int global;

    rankToCoordinates(CmiNodeFirst(i), a, b, c, d, e, t);
    global = CmiNodeOf(coordinatesToRank(a, b, c, d, e, t));

    bgq_localNodes[global] = i;
  }

  bgq_isLocalSet = 1;

  CmiUnlock(bgq_lock);
}
Example #5
static CmiCommHandle MachineSendFuncForLAPI(int destNode, int size, char *msg, int mode) {
    scompl_hndlr_t *shdlr = NULL;
    void *sinfo = NULL;

    if (mode==P2P_SYNC) {
        shdlr = ReleaseMsg;
        sinfo = (void *)msg;
    } else if (mode==P2P_ASYNC) {
        shdlr = DeliveredMsg;
        sinfo = malloc(sizeof(int));
        *((int *)sinfo) = 1;
    }
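    /* For P2P_SYNC the send-completion handler simply releases the message
       itself; for P2P_ASYNC it receives a freshly malloc'd counter (set to 1,
       presumably cleared by DeliveredMsg), which is also what this function
       returns below as the CmiCommHandle. */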

    CMI_MSG_SIZE(msg) = size;

#if ENSURE_MSG_PAIRORDER
#if CMK_NODE_QUEUE_AVAILABLE
    if (CMI_DEST_RANK(msg) == DGRAM_NODEMESSAGE) {
        lapiSendFn(destNode, size, msg, shdlr, sinfo);
        return sinfo;
    }
#endif
    int destPE = CmiNodeFirst(destNode)+CMI_DEST_RANK(msg);
    CMI_MSG_SRCPE(msg) = CmiMyPe();
    /* Note: This could be executed on comm threads, where CmiMyPe() >= CmiNumPes() */
    CMI_MSG_SEQNO(msg) = getNextMsgSeqNo(CpvAccess(p2pMsgSeqInfo).nextMsgSeqNo, destPE);
    setNextMsgSeqNo(CpvAccess(p2pMsgSeqInfo).nextMsgSeqNo, destPE, CMI_MSG_SEQNO(msg));
#endif

    lapiSendFn(destNode, size, msg, shdlr, sinfo);
    return sinfo;
}
Example #6
static void SendHyperCube(int size,  char *msg, int rankToAssign, int startNode) {
#if CMK_BROADCAST_HYPERCUBE
    int i, cnt, tmp, relDist, oldRank;
    const int dims=CmiNodesDim;

    oldRank = CMI_DEST_RANK(msg);
    /* set the destination rank once here so we do not reassign it on every iteration of the loop below */
    CMI_DEST_RANK(msg) = rankToAssign;

    /* first send msgs to other nodes */
    relDist = CmiMyNode()-startNode;
    if (relDist < 0) relDist += CmiNumNodes();

    /* Sending scheme example: say we have 9 nodes, and the msg is sent from 0
     * The overall sending steps will be as follows:
     * 0-->8, 0-->4, 0-->2, 0-->1
     *               4-->6, 4-->5
     *                      2-->3
     *                      6-->7
     * In general, a node whose id relative to the root is N = A + 2^B, where B
     * is the position of the lowest set bit of N (counting from 0 on the
     * right), forwards the message to relative nodes N+2^(B-1), N+2^(B-2),
     * ..., N+1, in that order.
     * On node 0 (the root), the value "B" is taken to be CmiNodesDim.
     */
    /* Calculate 2^B */
    if(relDist==0) cnt = 1<<dims;
    else cnt = relDist & ((~relDist)+1);
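    /* Illustration: relDist & ((~relDist)+1) equals relDist & -relDist, i.e.
     * the lowest set bit 2^B.  For relDist = 6 (binary 110) cnt starts at 2,
     * so the loop below sends only to relative node 7; for relDist = 4 it
     * starts at 4 and sends to relative nodes 6 and 5, matching the 9-node
     * example above. */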
    /*CmiPrintf("ND[%d]: send bcast msg with cnt=%d\n", CmiMyNode(), cnt);*/
    /* Begin to send msgs */
    for(cnt>>=1; cnt>0; cnt>>=1){
        int nd = relDist + cnt;
        if (nd >= CmiNumNodes()) continue;
        nd = (nd+startNode)%CmiNumNodes();
        /*CmiPrintf("ND[%d]: send to node %d\n", CmiMyNode(), nd);*/
        CmiAssert(nd>=0 && nd!=CmiMyNode());
#if CMK_BROADCAST_USE_CMIREFERENCE
        CmiReference(msg);
        CmiSendNetworkFunc(CmiNodeFirst(nd), size, msg, BCAST_SYNC);
#else
        char *newmsg = CopyMsg(msg, size);
        CmiSendNetworkFunc(CmiNodeFirst(nd), size, newmsg, BCAST_SYNC);
#endif
    }
    CMI_DEST_RANK(msg) = oldRank;
#endif
}
Example #7
/**
 * Send the other processors on this node a signal so that they can check
 * whether their _initDone can be called. This is needed because the check at
 * the end of _initHandler can fail while a message containing a nodegroup
 * creation is still missing; when that message finally arrives, only one
 * processor on the node receives it, so without this notification the other
 * processors would never proceed.
 */
static void _sendTriggers(void)
{
  int i, num, first;
  CmiImmediateLock(CksvAccess(_nodeGroupTableImmLock));
  if (CksvAccess(_triggersSent) == 0)
  {
    CksvAccess(_triggersSent)++;
    num = CmiMyNodeSize();
    register envelope *env = _allocEnv(RODataMsg); // Notice that the type here is irrelevant
    env->setSrcPe(CkMyPe());
    CmiSetHandler(env, _triggerHandlerIdx);
    first = CmiNodeFirst(CmiMyNode());
    for (i=0; i < num; i++)
      if(first+i != CkMyPe())
	CmiSyncSend(first+i, env->getTotalsize(), (char *)env);
    CmiFree(env);
  }
  CmiImmediateUnlock(CksvAccess(_nodeGroupTableImmLock));
}
Example #8
void pingpong_moduleinit(void)
{
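  /* Note: pvi/pva are assumed here to be shorthand for Converse's
     CpvInitialize/CpvAccess macros, declaring and accessing per-PE globals. */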
  int i,j;
  pvi(int, numRecv);
  pva(numRecv) = 0;
  pvi(int, nextIter);
  pva(nextIter) = -1;
  pvi(int, nextSize);
  pva(nextSize) = -1;
  pvi(int, nextNbr);
  pva(nextNbr) = -1;
  pvi(double, starttime);
  pva(starttime) = 0.0;
  pvi(double, endtime);
  pva(endtime) = 0.0;
  pvi(int, numSizes);
  for(i=0; sizes[i].size != (-1); i++);  /* count the entries in sizes[] */
  pva(numSizes) = i;
  pvi(double **, times);
  pva(times) = (double **) malloc(CmiNumNodes()*sizeof(double *));
  for(i=0;i<CmiNumNodes();i++)
    pva(times)[i] = (double *) malloc(pva(numSizes)*sizeof(double));
  for(i=0;i<CmiNumNodes();i++)
    for(j=0;j<pva(numSizes);j++)
      pva(times)[i][j] = 0.0;
  pvi(int *, nodeList);
  pva(nodeList) = (int *) malloc(CmiNumNodes()*sizeof(int));
  for(i=0;i<CmiNumNodes();i++)
    pva(nodeList)[i] = CmiNodeFirst(i);
  pvi(double *, gavg);
  pva(gavg) = (double *) malloc(sizeof(double)*pva(numSizes));
  pvi(double *, gmax);
  pva(gmax) = (double *) malloc(sizeof(double)*pva(numSizes));
  pvi(double *, gmin);
  pva(gmin) = (double *) malloc(sizeof(double)*pva(numSizes));
  pvi(int *, gmaxSrc);
  pva(gmaxSrc) = (int *) malloc(sizeof(int)*pva(numSizes));
  pvi(int *, gmaxDest);
  pva(gmaxDest) = (int *) malloc(sizeof(int)*pva(numSizes));
  pvi(int *, gminSrc);
  pva(gminSrc) = (int *) malloc(sizeof(int)*pva(numSizes));
  pvi(int *, gminDest);
  pva(gminDest) = (int *) malloc(sizeof(int)*pva(numSizes));
  for(i=0;i<pva(numSizes);i++) {
    pva(gavg)[i] = 0.0;
    pva(gmax)[i] = 0.0;
    pva(gmin)[i] = 1000000000.0;
    pva(gmaxSrc)[i] = 0;
    pva(gmaxDest)[i] = 0;
    pva(gminSrc)[i] = 0;
    pva(gminDest)[i] = 0;
  }
  pvi(int, timeHandler);
  pva(timeHandler) = CmiRegisterHandler((CmiHandler)recvTime);
  pvi(int, nodeHandler);
  pva(nodeHandler) = CmiRegisterHandler((CmiHandler)startNextNode);
  pvi(int, nbrHandler);
  pva(nbrHandler) = CmiRegisterHandler((CmiHandler)startNextNbr);
  pvi(int, sizeHandler);
  pva(sizeHandler) = CmiRegisterHandler((CmiHandler)startNextSize);
  pvi(int, iterHandler);
  pva(iterHandler) = CmiRegisterHandler((CmiHandler)startNextIter);
  pvi(int, bounceHandler);
  pva(bounceHandler) = CmiRegisterHandler((CmiHandler)bounceMessage);
  pvi(int, setupHandler);
  pva(setupHandler) = CmiRegisterHandler((CmiHandler)setupMessage);
  pvi(int, startHandler);
  pva(startHandler) = CmiRegisterHandler((CmiHandler)startMessage);
}
Example #9
void CmiInitCPUAffinity(char **argv)
{
  static skt_ip_t myip;
  int ret, i, exclude;
  hostnameMsg  *msg;
  char *pemap = NULL;
  char *commap = NULL;
  char *pemapfile = NULL;
 
  int show_affinity_flag;
  int affinity_flag = CmiGetArgFlagDesc(argv,"+setcpuaffinity",
						"set cpu affinity");

  while (CmiGetArgIntDesc(argv,"+excludecore", &exclude, "avoid core when setting cpuaffinity"))  {
    if (CmiMyRank() == 0) add_exclude(exclude);
    affinity_flag = 1;
  }

  if (CmiGetArgStringDesc(argv, "+pemapfile", &pemapfile, "define pe to core mapping file")) {
    FILE *fp;
    char buf[128];
    pemap = (char*)malloc(1024);
    fp = fopen(pemapfile, "r");
    if (fp == NULL) CmiAbort("pemapfile does not exist");
    while (!feof(fp)) {
      if (fgets(buf, 128, fp)) {
        if (buf[strlen(buf)-1] == '\n') buf[strlen(buf)-1] = 0;
        strcat(pemap, buf);
      }
    }
    fclose(fp);
    if (CmiMyPe()==0) CmiPrintf("Charm++> read from pemap file '%s': %s\n", pemapfile, pemap);
  }

  CmiGetArgStringDesc(argv, "+pemap", &pemap, "define pe to core mapping");
  if (pemap!=NULL && excludecount>0)
    CmiAbort("Charm++> +pemap can not be used with +excludecore.\n");

  CmiGetArgStringDesc(argv, "+commap", &commap, "define comm threads to core mapping");

  if (pemap!=NULL || commap!=NULL) affinity_flag = 1;

  show_affinity_flag = CmiGetArgFlagDesc(argv,"+showcpuaffinity",
						"print cpu affinity");

  cpuAffinityHandlerIdx =
       CmiRegisterHandler((CmiHandler)cpuAffinityHandler);
  cpuAffinityRecvHandlerIdx =
       CmiRegisterHandler((CmiHandler)cpuAffinityRecvHandler);

  if (CmiMyRank() ==0) {
     affLock = CmiCreateLock();
  }

#if CMK_BLUEGENEP || CMK_BLUEGENEQ
  if(affinity_flag){
      affinity_flag = 0;
      if(CmiMyPe()==0) CmiPrintf("Charm++> cpu affinity setting is not needed on Blue Gene, thus ignored.\n");
  }
  if(show_affinity_flag){
      show_affinity_flag = 0;
      if(CmiMyPe()==0) CmiPrintf("Charm++> printing cpu affinity is not supported on Blue Gene.\n");
  }
#endif

  if (!affinity_flag) {
    if (show_affinity_flag) CmiPrintCPUAffinity();
    return;
  }

  if (CmiMyPe() == 0) {
     CmiPrintf("Charm++> cpu affinity enabled. \n");
     if (excludecount > 0) {
       CmiPrintf("Charm++> cpuaffinity excludes core: %d", excludecore[0]);
       for (i=1; i<excludecount; i++) CmiPrintf(" %d", excludecore[i]);
       CmiPrintf(".\n");
     }
     if (pemap!=NULL)
       CmiPrintf("Charm++> cpuaffinity PE-core map : %s\n", pemap);
  }

  if (CmiMyPe() >= CmiNumPes()) {         /* this is comm thread */
      /* The comm thread can either float among cores or be pinned to the last
         rank; leaving it floating is reportedly slower. */
    CmiNodeAllBarrier();
    if (commap != NULL) {
      int mycore = search_pemap(commap, CmiMyPeGlobal()-CmiNumPesGlobal());
      if(CmiMyPe()-CmiNumPes()==0) printf("Charm++> set comm %d on node %d to core #%d\n", CmiMyPe()-CmiNumPes(), CmiMyNode(), mycore); 
      if (-1 == CmiSetCPUAffinity(mycore))
        CmiAbort("set_cpu_affinity abort!");
      CmiNodeAllBarrier();
      if (show_affinity_flag) CmiPrintCPUAffinity();
      return;    /* comm thread return */
    }
    else {
    /* if (CmiSetCPUAffinity(CmiNumCores()-1) == -1) CmiAbort("set_cpu_affinity abort!"); */
#if !CMK_CRAYXT && !CMK_CRAYXE && !CMK_CRAYXC && !CMK_BLUEGENEQ
      if (pemap == NULL) {
#if CMK_MACHINE_PROGRESS_DEFINED
        while (affinity_doneflag < CmiMyNodeSize())  CmiNetworkProgress();
#else
#if CMK_SMP
       #error "Machine progress call needs to be implemented for cpu affinity!"
#endif
#endif
      }
#endif
#if CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
      /* if both pemap and commap are NULL, one will be computed */
      if (pemap != NULL)      
#endif
      {
      CmiNodeAllBarrier();
      if (show_affinity_flag) CmiPrintCPUAffinity();
      return;    /* comm thread return */
      }
    }
  }

  if (pemap != NULL && CmiMyPe()<CmiNumPes()) {    /* work thread */
    int mycore = search_pemap(pemap, CmiMyPeGlobal());
    if(show_affinity_flag) CmiPrintf("Charm++> set PE %d on node %d to core #%d\n", CmiMyPe(), CmiMyNode(), mycore); 
    if (mycore >= CmiNumCores()) {
      CmiPrintf("Error> Invalid core number %d, only have %d cores (0-%d) on the node. \n", mycore, CmiNumCores(), CmiNumCores()-1);
      CmiAbort("Invalid core number");
    }
    if (CmiSetCPUAffinity(mycore) == -1) CmiAbort("set_cpu_affinity abort!");
    CmiNodeAllBarrier();
    CmiNodeAllBarrier();
    /* if (show_affinity_flag) CmiPrintCPUAffinity(); */
    return;
  }

#if CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
  {
    int numCores = CmiNumCores();

    int myid = getXTNodeID(CmiMyNodeGlobal(), CmiNumNodesGlobal());
    int myrank;
    int pe, mype = CmiMyPeGlobal();
    int node = CmiMyNodeGlobal();
    int nnodes = 0;
#if CMK_SMP
    if (CmiMyPe() >= CmiNumPes()) {         /* this is comm thread */
      int node = CmiMyPe() - CmiNumPes();
      mype = CmiGetPeGlobal(CmiNodeFirst(node) + CmiMyNodeSize() - 1, CmiMyPartition()); /* last pe on SMP node */
      node = CmiGetNodeGlobal(node, CmiMyPartition());
    }
#endif
    pe = mype - 1;
    while (pe >= 0) {
      int n = CmiNodeOf(pe);
      if (n != node) { nnodes++; node = n; }
      if (getXTNodeID(n, CmiNumNodesGlobal()) != myid) break;
      pe --;
    }
    CmiAssert(numCores > 0);
    myrank = (mype - pe - 1 + nnodes)%numCores;
#if CMK_SMP
    if (CmiMyPe() >= CmiNumPes()) 
        myrank = (myrank + 1)%numCores;
#endif

    if (-1 != CmiSetCPUAffinity(myrank)) {
      DEBUGP(("Processor %d is bound to core #%d on node #%d\n", CmiMyPe(), myrank, mynode));
    }
    else{
      CmiPrintf("Processor %d set affinity failed!\n", CmiMyPe());
      CmiAbort("set cpu affinity abort!\n");
    }
  }
  if (CmiMyPe() < CmiNumPes()) 
  CmiNodeAllBarrier();
  CmiNodeAllBarrier();
#else
    /* get my ip address */
  if (CmiMyRank() == 0)
  {
#if CMK_HAS_GETHOSTNAME
    myip = skt_my_ip();        /* not thread safe, so only calls on rank 0 */
#else
    CmiAbort("Can not get unique name for the compute nodes. \n");
#endif
  }
  CmiNodeAllBarrier();
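
    /* Automatic affinity assignment (no explicit pemap): every PE sends its
       node's IP address and core count to PE 0, which groups PEs by physical
       node and broadcasts back a rankMsg assigning each PE a within-node rank;
       the receive handler registered above presumably uses that rank to pick
       the core to bind to. */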

    /* prepare a msg to send */
  msg = (hostnameMsg *)CmiAlloc(sizeof(hostnameMsg));
  CmiSetHandler((char *)msg, cpuAffinityHandlerIdx);
  msg->pe = CmiMyPe();
  msg->ip = myip;
  msg->ncores = CmiNumCores();
  DEBUGP(("PE %d's node has %d number of cores. \n", CmiMyPe(), msg->ncores));
  msg->rank = 0;
  CmiSyncSendAndFree(0, sizeof(hostnameMsg), (void *)msg);

  if (CmiMyPe() == 0) {
    int i;
    hostTable = CmmNew();
    rankmsg = (rankMsg *)CmiAlloc(sizeof(rankMsg)+CmiNumPes()*sizeof(int)*2);
    CmiSetHandler((char *)rankmsg, cpuAffinityRecvHandlerIdx);
    rankmsg->ranks = (int *)((char*)rankmsg + sizeof(rankMsg));
    rankmsg->nodes = (int *)((char*)rankmsg + sizeof(rankMsg) + CmiNumPes()*sizeof(int));
    for (i=0; i<CmiNumPes(); i++) {
      rankmsg->ranks[i] = 0;
      rankmsg->nodes[i] = -1;
    }

    for (i=0; i<CmiNumPes(); i++) CmiDeliverSpecificMsg(cpuAffinityHandlerIdx);
  }

    /* receive broadcast from PE 0 */
  CmiDeliverSpecificMsg(cpuAffinityRecvHandlerIdx);
  CmiLock(affLock);
  affinity_doneflag++;
  CmiUnlock(affLock);
  CmiNodeAllBarrier();
#endif

  if (show_affinity_flag) CmiPrintCPUAffinity();
}
Example #10
void TorusLB::strategy() {
  int index;
  // compute the average load by (compute load + background load) / numPesAvailable
  computeAverage();
  // two heaps of self and pair computes
  makeTwoHeaps();

  const int beginGroup = processors[0].Id;
  const int endGroup = beginGroup + P;
#define INGROUP(PROC) ((PROC) >= beginGroup && (PROC) < endGroup)
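  // INGROUP tests whether a global processor Id lies in this strategy's
  // contiguous block [beginGroup, endGroup); subtracting beginGroup then gives
  // the index into the local processors[] array.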

  computeInfo *c;
  processorInfo *p, *minp;
  Iterator nextP;
  overLoad = 1.2;

  for(int I=0; I<numComputes; I++) {

  c = (computeInfo *) computePairHeap->deleteMax();
  if ( ! c ) c = (computeInfo *) computeSelfHeap->deleteMax(); 

  if(!c) CkAbort("TorusLB: Compute Heap empty!\n");
  if(c->processor != -1) continue; // go to the next compute

  for(int j=0; j<6; j++) {
    bestPe[j] = 0;
    goodPe[j] = 0;
    badPe[j] = 0;
  }

  // Look at pes which have the compute's patches

  // HYBRID check if processor is in local group
#define SELECT_REALPE(X) if INGROUP((X)) { \
  selectPes(&processors[(X) - beginGroup], c); \
  }

  const int realPe1 = patches[c->patch1].processor;
  SELECT_REALPE(realPe1)

  const int realPe2 = patches[c->patch2].processor;
  if ( realPe2 != realPe1 ) {
    SELECT_REALPE(realPe2)
  }

  // Try the processors which have the patches' proxies
  p = (processorInfo *)(patches[c->patch1].proxiesOn.iterator((Iterator *)&nextP));
  while(p) {						// patch 1
    if INGROUP(p->Id) selectPes(p, c);
    p = (processorInfo *)(patches[c->patch1].proxiesOn.next((Iterator *)&nextP));
  } 

  p = (processorInfo *)(patches[c->patch2].proxiesOn.iterator((Iterator *)&nextP));
  while(p) {						// patch 2
    if INGROUP(p->Id) selectPes(p, c);
    p = (processorInfo *)(patches[c->patch2].proxiesOn.next((Iterator *)&nextP));
  }

  // see if we have found a processor to place the compute on
  p = 0;
  if((p = bestPe[5])
#if USE_TOPOMAP
  || (p = goodPe[5])
#endif
  || (p = bestPe[4])
#if USE_TOPOMAP
  || (p = goodPe[4])
#endif
  || (p = bestPe[3])
#if USE_TOPOMAP
  || (p = goodPe[3])
#endif
  || (p = bestPe[1])
#if USE_TOPOMAP
  || (p = goodPe[1])
#endif
  || (p = bestPe[2])
#if USE_TOPOMAP
  || (p = goodPe[2])
#endif
  || (p = bestPe[0])
#if USE_TOPOMAP
  || (p = goodPe[0])
#endif
  ) {
    assign(c, p);
    continue;
  }

    // Try all pes on the nodes of the home patches
    if ( CmiNumNodes() > 1 ) {  // else not useful
      double minLoad = overLoad * averageLoad;
      minp = 0;
      int realNode1 = CmiNodeOf(realPe1);
      int nodeSize = CmiNodeSize(realNode1);
      if ( nodeSize > 1 ) {  // else did it already
        int firstpe = CmiNodeFirst(realNode1);
        for ( int rpe = firstpe; rpe < firstpe+nodeSize; ++rpe ) {
          if INGROUP(rpe) {
            p = &processors[rpe - beginGroup];
            if ( p->available && ( p->load + c->load < minLoad ) ) {
              minLoad = p->load + c->load;
              minp = p;
            }
          }
        }
      }
      if ( realPe2 != realPe1 ) {
        int realNode2 = CmiNodeOf(realPe2);
        if ( realNode2 != realNode1 ) {  // else did it already
          nodeSize = CmiNodeSize(realNode2);
          if ( nodeSize > 1 ) {
            int firstpe = CmiNodeFirst(realNode2);
            for ( int rpe = firstpe; rpe < firstpe+nodeSize; ++rpe ) {
              if INGROUP(rpe) {
                p = &processors[rpe - beginGroup];
                if ( p->available && ( p->load + c->load < minLoad ) ) {
                  minLoad = p->load + c->load;
                  minp = p;
                }
              }
            }
          }