/**
 * Forward a broadcast message to this node's children in the spanning tree
 * rooted at startNode.
 *
 * Nodes are numbered relative to startNode (wrapping modulo CmiNumNodes());
 * the children of relative node r are BROADCAST_SPANNING_FACTOR*r + 1 ..
 * BROADCAST_SPANNING_FACTOR*r + BROADCAST_SPANNING_FACTOR, as far as they
 * exist.  Compiles to a no-op unless CMK_BROADCAST_SPANNING_TREE is set.
 *
 * size          size of msg in bytes
 * msg           message to forward; its destination rank is restored on exit
 * rankToAssign  destination rank stamped on every forwarded message
 * startNode     node at the root of the broadcast tree
 */
static void SendSpanningChildren(int size, char *msg, int rankToAssign, int startNode) {
#if CMK_BROADCAST_SPANNING_TREE
  int child, relSelf, savedRank;
  char *copy;

  /* Stamp the rank once up front instead of once per forwarded message. */
  savedRank = CMI_DEST_RANK(msg);
  CMI_DEST_RANK(msg) = rankToAssign;

  CmiAssert(startNode >= 0 && startNode < CmiNumNodes());

  /* Position of this node relative to the root of the tree. */
  relSelf = CmiMyNode() - startNode;
  if (relSelf < 0) relSelf += CmiNumNodes();

  for (child = 1; child <= BROADCAST_SPANNING_FACTOR; child++) {
    int relChild = BROADCAST_SPANNING_FACTOR * relSelf + child;
    int dest;

    /* Children are numbered consecutively, so the first miss ends the scan. */
    if (relChild >= CmiNumNodes()) break;

    dest = (relChild + startNode) % CmiNumNodes();
    CmiAssert(dest >= 0 && dest != CmiMyNode());
#if CMK_BROADCAST_USE_CMIREFERENCE
    CmiReference(msg);
    CmiSendNetworkFunc(CmiNodeFirst(dest), size, msg, BCAST_SYNC);
#else
    copy = CopyMsg(msg, size);
    CmiSendNetworkFunc(CmiNodeFirst(dest), size, copy, BCAST_SYNC);
#endif
  }

  CMI_DEST_RANK(msg) = savedRank;
#endif
}
void Communicate::sendMessage(int PE, void *msg, int size) { if ( CmiMyPe() ) NAMD_bug("Communicate::sendMessage not from Pe 0"); while ( CkpvAccess(CsmAcks) < nchildren ) { CmiDeliverMsgs(0); } CkpvAccess(CsmAcks) = 0; CmiSetHandler(msg, CsmHandlerIndex); switch(PE) { case ALL: NAMD_bug("Unexpected Communicate::sendMessage(ALL,...)"); //CmiSyncBroadcastAll(size, (char *)msg); break; case ALLBUTME: //CmiSyncBroadcast(size, (char *)msg); if ( CmiNumNodes() > 2 ) { CmiSyncSend(CmiNodeFirst(2),size,(char*)msg); } if ( CmiNumNodes() > 1 ) { CmiSyncSend(CmiNodeFirst(1),size,(char*)msg); } break; default: NAMD_bug("Unexpected Communicate::sendMessage(PEL,...)"); //CmiSyncSend(PE, size, (char *)msg); break; } }
void *Communicate::getMessage(int PE, int tag) { if ( CmiMyRank() ) NAMD_bug("Communicate::getMessage called on non-rank-zero Pe\n"); int itag[2], rtag[2]; void *msg; itag[0] = (PE==(-1)) ? (CmmWildCard) : PE; itag[1] = (tag==(-1)) ? (CmmWildCard) : tag; while((msg=CmmGet(CkpvAccess(CsmMessages),2,itag,rtag))==0) { CmiDeliverMsgs(0); } char *ackmsg = (char *) CmiAlloc(CmiMsgHeaderSizeBytes); CmiSetHandler(ackmsg, CsmAckHandlerIndex); CmiSyncSend(CmiNodeFirst((CmiMyNode()-1)/2), CmiMsgHeaderSizeBytes, ackmsg); while ( CkpvAccess(CsmAcks) < nchildren ) { CmiDeliverMsgs(0); } CkpvAccess(CsmAcks) = 0; int size = SIZEFIELD(msg); for ( int i = 2; i >= 1; --i ) { int node = CmiMyNode() * 2 + i; if ( node < CmiNumNodes() ) { CmiSyncSend(CmiNodeFirst(node),size,(char*)msg); } } return msg; }
void BGQTorusManager::populateLocalNodes() { if(CmiNumPartitions() == 1) return; CmiLock(bgq_lock); if(bgq_isLocalSet) { CmiUnlock(bgq_lock); return; } if(bgq_localNodes == NULL) bgq_localNodes = (int *)malloc(CmiNumNodesGlobal()*sizeof(int)); CmiAssert(bgq_localNodes != NULL); for(int i = 0; i < CmiNumNodesGlobal(); i++) bgq_localNodes[i] = -1; for(int i = 0; i < CmiNumNodes(); i++) { int a, b, c, d, e, t; int global; rankToCoordinates(CmiNodeFirst(i), a, b, c, d, e, t); global = CmiNodeOf(coordinatesToRank(a, b, c, d, e, t)); bgq_localNodes[global] = i; } bgq_isLocalSet = 1; CmiUnlock(bgq_lock); }
static CmiCommHandle MachineSendFuncForLAPI(int destNode, int size, char *msg, int mode) { scompl_hndlr_t *shdlr = NULL; void *sinfo = NULL; if (mode==P2P_SYNC) { shdlr = ReleaseMsg; sinfo = (void *)msg; } else if (mode==P2P_ASYNC) { shdlr = DeliveredMsg; sinfo = malloc(sizeof(int)); *((int *)sinfo) = 1; } CMI_MSG_SIZE(msg) = size; #if ENSURE_MSG_PAIRORDER #if CMK_NODE_QUEUE_AVAILABLE if (CMI_DEST_RANK(msg) == DGRAM_NODEMESSAGE) { lapiSendFn(destNode, size, msg, shdlr, sinfo); return sinfo; } #endif int destPE = CmiNodeFirst(destNode)+CMI_DEST_RANK(msg); CMI_MSG_SRCPE(msg) = CmiMyPe(); /* Note: This could be executed on comm threads, where CmiMyPe() >= CmiNumPes() */ CMI_MSG_SEQNO(msg) = getNextMsgSeqNo(CpvAccess(p2pMsgSeqInfo).nextMsgSeqNo, destPE); setNextMsgSeqNo(CpvAccess(p2pMsgSeqInfo).nextMsgSeqNo, destPE, CMI_MSG_SEQNO(msg)); #endif lapiSendFn(destNode, size, msg, shdlr, sinfo); return sinfo; }
/**
 * Forward a broadcast message along a hypercube rooted at startNode.
 *
 * Nodes are numbered relative to startNode (wrapping modulo CmiNumNodes()).
 * A node at relative position r forwards to r + s for every power of two s
 * strictly below its "span", from the largest s down to 1, skipping targets
 * that fall outside the machine.  The span is 2^CmiNodesDim on the root and
 * the lowest set bit of r everywhere else.
 *
 * Example (9 nodes, root 0, assuming CmiNodesDim is 4):
 *   0 --> 8, 4, 2, 1
 *   4 --> 6, 5
 *   2 --> 3
 *   6 --> 7
 *
 * Compiles to a no-op unless CMK_BROADCAST_HYPERCUBE is set.
 *
 * size          size of msg in bytes
 * msg           message to forward; its destination rank is restored on exit
 * rankToAssign  destination rank stamped on every forwarded message
 * startNode     node at the root of the broadcast
 */
static void SendHyperCube(int size, char *msg, int rankToAssign, int startNode) {
#if CMK_BROADCAST_HYPERCUBE
  const int numDims = CmiNodesDim;
  const int savedRank = CMI_DEST_RANK(msg);
  int relSelf, step;

  /* Stamp the rank once up front instead of once per forwarded message. */
  CMI_DEST_RANK(msg) = rankToAssign;

  relSelf = CmiMyNode() - startNode;
  if (relSelf < 0) relSelf += CmiNumNodes();

  /* Span of this node: full cube on the root, lowest set bit otherwise. */
  step = (relSelf == 0) ? (1 << numDims) : (relSelf & ((~relSelf) + 1));

  step >>= 1;
  while (step > 0) {
    int target = relSelf + step;
    if (target < CmiNumNodes()) {
      target = (target + startNode) % CmiNumNodes();
      CmiAssert(target >= 0 && target != CmiMyNode());
#if CMK_BROADCAST_USE_CMIREFERENCE
      CmiReference(msg);
      CmiSendNetworkFunc(CmiNodeFirst(target), size, msg, BCAST_SYNC);
#else
      char *copy = CopyMsg(msg, size);
      CmiSendNetworkFunc(CmiNodeFirst(target), size, copy, BCAST_SYNC);
#endif
    }
    step >>= 1;
  }

  CMI_DEST_RANK(msg) = savedRank;
#endif
}
/** * This function is used to send other processors on the same node a signal so * they can check if their _initDone can be called: the reason for this is that * the check at the end of _initHandler can fail due to a missing message containing * a Nodegroup creation. When that message arrives only one processor will receive * it, and thus if no notification is sent to the other processors in the node, they * will never proceed. */ static void _sendTriggers(void) { int i, num, first; CmiImmediateLock(CksvAccess(_nodeGroupTableImmLock)); if (CksvAccess(_triggersSent) == 0) { CksvAccess(_triggersSent)++; num = CmiMyNodeSize(); register envelope *env = _allocEnv(RODataMsg); // Notice that the type here is irrelevant env->setSrcPe(CkMyPe()); CmiSetHandler(env, _triggerHandlerIdx); first = CmiNodeFirst(CmiMyNode()); for (i=0; i < num; i++) if(first+i != CkMyPe()) CmiSyncSend(first+i, env->getTotalsize(), (char *)env); CmiFree(env); } CmiImmediateUnlock(CksvAccess(_nodeGroupTableImmLock)); }
void pingpong_moduleinit(void) { int i,j; pvi(int, numRecv); pva(numRecv) = 0; pvi(int, nextIter); pva(nextIter) = -1; pvi(int, nextSize); pva(nextSize) = -1; pvi(int, nextNbr); pva(nextNbr) = -1; pvi(double, starttime); pva(starttime) = 0.0; pvi(double, endtime); pva(endtime) = 0.0; pvi(int, numSizes); for(i=0; sizes[i].size != (-1); i++); pva(numSizes) = i; pvi(double **, times); pva(times) = (double **) malloc(CmiNumNodes()*sizeof(double *)); for(i=0;i<CmiNumNodes();i++) pva(times)[i] = (double *) malloc(pva(numSizes)*sizeof(double)); for(i=0;i<CmiNumNodes();i++) for(j=0;j<pva(numSizes);j++) pva(times)[i][j] = 0.0; pvi(int *, nodeList); pva(nodeList) = (int *) malloc(CmiNumNodes()*sizeof(int)); for(i=0;i<CmiNumNodes();i++) pva(nodeList)[i] = CmiNodeFirst(i); pvi(double *, gavg); pva(gavg) = (double *) malloc(sizeof(double)*pva(numSizes)); pvi(double *, gmax); pva(gmax) = (double *) malloc(sizeof(double)*pva(numSizes)); pvi(double *, gmin); pva(gmin) = (double *) malloc(sizeof(double)*pva(numSizes)); pvi(int *, gmaxSrc); pva(gmaxSrc) = (int *) malloc(sizeof(int)*pva(numSizes)); pvi(int *, gmaxDest); pva(gmaxDest) = (int *) malloc(sizeof(int)*pva(numSizes)); pvi(int *, gminSrc); pva(gminSrc) = (int *) malloc(sizeof(int)*pva(numSizes)); pvi(int *, gminDest); pva(gminDest) = (int *) malloc(sizeof(int)*pva(numSizes)); for(i=0;i<pva(numSizes);i++) { pva(gavg)[i] = 0.0; pva(gmax)[i] = 0.0; pva(gmin)[i] = 1000000000.0; pva(gmaxSrc)[i] = 0; pva(gmaxDest)[i] = 0; pva(gminSrc)[i] = 0; pva(gminDest)[i] = 0; } pvi(int, timeHandler); pva(timeHandler) = CmiRegisterHandler((CmiHandler)recvTime); pvi(int, nodeHandler); pva(nodeHandler) = CmiRegisterHandler((CmiHandler)startNextNode); pvi(int, nbrHandler); pva(nbrHandler) = CmiRegisterHandler((CmiHandler)startNextNbr); pvi(int, sizeHandler); pva(sizeHandler) = CmiRegisterHandler((CmiHandler)startNextSize); pvi(int, iterHandler); pva(iterHandler) = CmiRegisterHandler((CmiHandler)startNextIter); pvi(int, bounceHandler); 
pva(bounceHandler) = CmiRegisterHandler((CmiHandler)bounceMessage); pvi(int, setupHandler); pva(setupHandler) = CmiRegisterHandler((CmiHandler)setupMessage); pvi(int, startHandler); pva(startHandler) = CmiRegisterHandler((CmiHandler)startMessage); }
void CmiInitCPUAffinity(char **argv) { static skt_ip_t myip; int ret, i, exclude; hostnameMsg *msg; char *pemap = NULL; char *commap = NULL; char *pemapfile = NULL; int show_affinity_flag; int affinity_flag = CmiGetArgFlagDesc(argv,"+setcpuaffinity", "set cpu affinity"); while (CmiGetArgIntDesc(argv,"+excludecore", &exclude, "avoid core when setting cpuaffinity")) { if (CmiMyRank() == 0) add_exclude(exclude); affinity_flag = 1; } if (CmiGetArgStringDesc(argv, "+pemapfile", &pemapfile, "define pe to core mapping file")) { FILE *fp; char buf[128]; pemap = (char*)malloc(1024); fp = fopen(pemapfile, "r"); if (fp == NULL) CmiAbort("pemapfile does not exist"); while (!feof(fp)) { if (fgets(buf, 128, fp)) { if (buf[strlen(buf)-1] == '\n') buf[strlen(buf)-1] = 0; strcat(pemap, buf); } } fclose(fp); if (CmiMyPe()==0) CmiPrintf("Charm++> read from pemap file '%s': %s\n", pemapfile, pemap); } CmiGetArgStringDesc(argv, "+pemap", &pemap, "define pe to core mapping"); if (pemap!=NULL && excludecount>0) CmiAbort("Charm++> +pemap can not be used with +excludecore.\n"); CmiGetArgStringDesc(argv, "+commap", &commap, "define comm threads to core mapping"); if (pemap!=NULL || commap!=NULL) affinity_flag = 1; show_affinity_flag = CmiGetArgFlagDesc(argv,"+showcpuaffinity", "print cpu affinity"); cpuAffinityHandlerIdx = CmiRegisterHandler((CmiHandler)cpuAffinityHandler); cpuAffinityRecvHandlerIdx = CmiRegisterHandler((CmiHandler)cpuAffinityRecvHandler); if (CmiMyRank() ==0) { affLock = CmiCreateLock(); } #if CMK_BLUEGENEP || CMK_BLUEGENEQ if(affinity_flag){ affinity_flag = 0; if(CmiMyPe()==0) CmiPrintf("Charm++> cpu affinity setting is not needed on Blue Gene, thus ignored.\n"); } if(show_affinity_flag){ show_affinity_flag = 0; if(CmiMyPe()==0) CmiPrintf("Charm++> printing cpu affinity is not supported on Blue Gene.\n"); } #endif if (!affinity_flag) { if (show_affinity_flag) CmiPrintCPUAffinity(); return; } if (CmiMyPe() == 0) { CmiPrintf("Charm++> cpu affinity enabled. 
\n"); if (excludecount > 0) { CmiPrintf("Charm++> cpuaffinity excludes core: %d", excludecore[0]); for (i=1; i<excludecount; i++) CmiPrintf(" %d", excludecore[i]); CmiPrintf(".\n"); } if (pemap!=NULL) CmiPrintf("Charm++> cpuaffinity PE-core map : %s\n", pemap); } if (CmiMyPe() >= CmiNumPes()) { /* this is comm thread */ /* comm thread either can float around, or pin down to the last rank. however it seems to be reportedly slower if it is floating */ CmiNodeAllBarrier(); if (commap != NULL) { int mycore = search_pemap(commap, CmiMyPeGlobal()-CmiNumPesGlobal()); if(CmiMyPe()-CmiNumPes()==0) printf("Charm++> set comm %d on node %d to core #%d\n", CmiMyPe()-CmiNumPes(), CmiMyNode(), mycore); if (-1 == CmiSetCPUAffinity(mycore)) CmiAbort("set_cpu_affinity abort!"); CmiNodeAllBarrier(); if (show_affinity_flag) CmiPrintCPUAffinity(); return; /* comm thread return */ } else { /* if (CmiSetCPUAffinity(CmiNumCores()-1) == -1) CmiAbort("set_cpu_affinity abort!"); */ #if !CMK_CRAYXT && !CMK_CRAYXE && !CMK_CRAYXC && !CMK_BLUEGENEQ if (pemap == NULL) { #if CMK_MACHINE_PROGRESS_DEFINED while (affinity_doneflag < CmiMyNodeSize()) CmiNetworkProgress(); #else #if CMK_SMP #error "Machine progress call needs to be implemented for cpu affinity!" #endif #endif } #endif #if CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC /* if both pemap and commmap are NULL, will compute one */ if (pemap != NULL) #endif { CmiNodeAllBarrier(); if (show_affinity_flag) CmiPrintCPUAffinity(); return; /* comm thread return */ } } } if (pemap != NULL && CmiMyPe()<CmiNumPes()) { /* work thread */ int mycore = search_pemap(pemap, CmiMyPeGlobal()); if(show_affinity_flag) CmiPrintf("Charm++> set PE %d on node %d to core #%d\n", CmiMyPe(), CmiMyNode(), mycore); if (mycore >= CmiNumCores()) { CmiPrintf("Error> Invalid core number %d, only have %d cores (0-%d) on the node. 
\n", mycore, CmiNumCores(), CmiNumCores()-1); CmiAbort("Invalid core number"); } if (CmiSetCPUAffinity(mycore) == -1) CmiAbort("set_cpu_affinity abort!"); CmiNodeAllBarrier(); CmiNodeAllBarrier(); /* if (show_affinity_flag) CmiPrintCPUAffinity(); */ return; } #if CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC { int numCores = CmiNumCores(); int myid = getXTNodeID(CmiMyNodeGlobal(), CmiNumNodesGlobal()); int myrank; int pe, mype = CmiMyPeGlobal(); int node = CmiMyNodeGlobal(); int nnodes = 0; #if CMK_SMP if (CmiMyPe() >= CmiNumPes()) { /* this is comm thread */ int node = CmiMyPe() - CmiNumPes(); mype = CmiGetPeGlobal(CmiNodeFirst(node) + CmiMyNodeSize() - 1, CmiMyPartition()); /* last pe on SMP node */ node = CmiGetNodeGlobal(node, CmiMyPartition()); } #endif pe = mype - 1; while (pe >= 0) { int n = CmiNodeOf(pe); if (n != node) { nnodes++; node = n; } if (getXTNodeID(n, CmiNumNodesGlobal()) != myid) break; pe --; } CmiAssert(numCores > 0); myrank = (mype - pe - 1 + nnodes)%numCores; #if CMK_SMP if (CmiMyPe() >= CmiNumPes()) myrank = (myrank + 1)%numCores; #endif if (-1 != CmiSetCPUAffinity(myrank)) { DEBUGP(("Processor %d is bound to core #%d on node #%d\n", CmiMyPe(), myrank, mynode)); } else{ CmiPrintf("Processor %d set affinity failed!\n", CmiMyPe()); CmiAbort("set cpu affinity abort!\n"); } } if (CmiMyPe() < CmiNumPes()) CmiNodeAllBarrier(); CmiNodeAllBarrier(); #else /* get my ip address */ if (CmiMyRank() == 0) { #if CMK_HAS_GETHOSTNAME myip = skt_my_ip(); /* not thread safe, so only calls on rank 0 */ #else CmiAbort("Can not get unique name for the compute nodes. \n"); #endif } CmiNodeAllBarrier(); /* prepare a msg to send */ msg = (hostnameMsg *)CmiAlloc(sizeof(hostnameMsg)); CmiSetHandler((char *)msg, cpuAffinityHandlerIdx); msg->pe = CmiMyPe(); msg->ip = myip; msg->ncores = CmiNumCores(); DEBUGP(("PE %d's node has %d number of cores. 
\n", CmiMyPe(), msg->ncores)); msg->rank = 0; CmiSyncSendAndFree(0, sizeof(hostnameMsg), (void *)msg); if (CmiMyPe() == 0) { int i; hostTable = CmmNew(); rankmsg = (rankMsg *)CmiAlloc(sizeof(rankMsg)+CmiNumPes()*sizeof(int)*2); CmiSetHandler((char *)rankmsg, cpuAffinityRecvHandlerIdx); rankmsg->ranks = (int *)((char*)rankmsg + sizeof(rankMsg)); rankmsg->nodes = (int *)((char*)rankmsg + sizeof(rankMsg) + CmiNumPes()*sizeof(int)); for (i=0; i<CmiNumPes(); i++) { rankmsg->ranks[i] = 0; rankmsg->nodes[i] = -1; } for (i=0; i<CmiNumPes(); i++) CmiDeliverSpecificMsg(cpuAffinityHandlerIdx); } /* receive broadcast from PE 0 */ CmiDeliverSpecificMsg(cpuAffinityRecvHandlerIdx); CmiLock(affLock); affinity_doneflag++; CmiUnlock(affLock); CmiNodeAllBarrier(); #endif if (show_affinity_flag) CmiPrintCPUAffinity(); }
void TorusLB::strategy() { int index; // compute the average load by (compute load + background load) / numPesAvailable computeAverage(); // two heaps of self and pair computes makeTwoHeaps(); const int beginGroup = processors[0].Id; const int endGroup = beginGroup + P; #define INGROUP(PROC) ((PROC) >= beginGroup && (PROC) < endGroup) computeInfo *c; processorInfo *p, *minp; Iterator nextP; overLoad = 1.2; for(int I=0; I<numComputes; I++) { c = (computeInfo *) computePairHeap->deleteMax(); if ( ! c ) c = (computeInfo *) computeSelfHeap->deleteMax(); if(c->processor != -1) continue; // go to the next compute if(!c) CkAbort("TorusLB: Compute Heap empty!\n"); for(int j=0; j<6; j++) { bestPe[j] = 0; goodPe[j] = 0; badPe[j] = 0; } // Look at pes which have the compute's patches // HYBRID check if processor is in local group #define SELECT_REALPE(X) if INGROUP((X)) { \ selectPes(&processors[(X) - beginGroup], c); \ } const int realPe1 = patches[c->patch1].processor; SELECT_REALPE(realPe1) const int realPe2 = patches[c->patch2].processor; if ( realPe2 != realPe1 ) { SELECT_REALPE(realPe2) } // Try the processors which have the patches' proxies p = (processorInfo *)(patches[c->patch1].proxiesOn.iterator((Iterator *)&nextP)); while(p) { // patch 1 if INGROUP(p->Id) selectPes(p, c); p = (processorInfo *)(patches[c->patch1].proxiesOn.next((Iterator *)&nextP)); } p = (processorInfo *)(patches[c->patch2].proxiesOn.iterator((Iterator *)&nextP)); while(p) { // patch 2 if INGROUP(p->Id) selectPes(p, c); p = (processorInfo *)(patches[c->patch2].proxiesOn.next((Iterator *)&nextP)); } // see if we have found a processor to place the compute on p = 0; if((p = bestPe[5]) #if USE_TOPOMAP || (p = goodPe[5]) #endif || (p = bestPe[4]) #if USE_TOPOMAP || (p = goodPe[4]) #endif || (p = bestPe[3]) #if USE_TOPOMAP || (p = goodPe[3]) #endif || (p = bestPe[1]) #if USE_TOPOMAP || (p = goodPe[1]) #endif || (p = bestPe[2]) #if USE_TOPOMAP || (p = goodPe[2]) #endif || (p = bestPe[0]) #if USE_TOPOMAP 
|| (p = goodPe[0]) #endif ) { assign(c, p); continue; } // Try all pes on the nodes of the home patches if ( CmiNumNodes() > 1 ) { // else not useful double minLoad = overLoad * averageLoad; minp = 0; int realNode1 = CmiNodeOf(realPe1); int nodeSize = CmiNodeSize(realNode1); if ( nodeSize > 1 ) { // else did it already int firstpe = CmiNodeFirst(realNode1); for ( int rpe = firstpe; rpe < firstpe+nodeSize; ++rpe ) { if INGROUP(rpe) { p = &processors[rpe - beginGroup]; if ( p->available && ( p->load + c->load < minLoad ) ) { minLoad = p->load + c->load; minp = p; } } } } if ( realPe2 != realPe1 ) { int realNode2 = CmiNodeOf(realPe2); if ( realNode2 != realNode1 ) { // else did it already nodeSize = CmiNodeSize(realNode2); if ( nodeSize > 1 ) { int firstpe = CmiNodeFirst(realNode2); for ( int rpe = firstpe; rpe < firstpe+nodeSize; ++rpe ) { if INGROUP(rpe) { p = &processors[rpe - beginGroup]; if ( p->available && ( p->load + c->load < minLoad ) ) { minLoad = p->load + c->load; minp = p; } } } }