/* Asynchronous broadcast: send msg to every PE except this one, without
 * blocking for delivery.
 * size: message length in bytes.
 * msg:  message buffer; its header is mutated in place (broadcast root
 *       cleared, per-destination rank set before each send).
 * Returns an opaque handle: a malloc'ed int counting outstanding sends
 * (presumably decremented by the DeliveredMsg callback — counter semantics
 * inferred from the CmiNumPes()-1 initialization; confirm against callback).
 * Fix: removed unused local `rank`. */
CmiCommHandle CmiAsyncBroadcastFn(int size, char *msg) {
#if ENSURE_MSG_PAIRORDER
  /* Not sure how to add the msg seq no for async broadcast messages --Chao Mei */
  /* so abort here ! */
  CmiAssert(0);
  return 0;
#else
  int i;
  int mype = CmiMyPe();
#if ENABLE_CONVERSE_QD
  CQdCreate(CpvAccess(cQdState), CmiNumPes()-1);
#endif
  MACHSTATE1(3,"[%d] Sending async broadcast message from {",CmiMyNode());
  CMI_BROADCAST_ROOT(msg) = 0;
  /* One shared delivery counter serves all CmiNumPes()-1 sends. */
  void *handle = malloc(sizeof(int));
  *((int *)handle) = CmiNumPes()-1;
  /* Higher-numbered PEs first, then wrap around to the lower ones. */
  for (i=mype+1; i<CmiNumPes(); i++) {
    CMI_DEST_RANK(msg) = CmiRankOf(i);
    lapiSendFn(CmiNodeOf(i), size, msg, DeliveredMsg, handle);
  }
  for (i=0; i<mype; i++) {
    CMI_DEST_RANK(msg) = CmiRankOf(i);
    lapiSendFn(CmiNodeOf(i), size, msg, DeliveredMsg, handle);
  }
  MACHSTATE(3,"} Sending async broadcast message end");
  return handle;
#endif
}
void CldEnqueue(int pe, void *msg, int infofn) { int len, queueing, priobits; unsigned int *prioptr; CldInfoFn ifn; CldPackFn pfn; peinfo *pinf = &(CpvAccess(peinf)); ifn = (CldInfoFn)CmiHandlerToFunction(infofn); ifn(msg, &pfn, &len, &queueing, &priobits, &prioptr); if (pe != CLD_ANYWHERE) { if (pfn && (CmiNodeOf(pe) != CmiMyNode())) { pfn(&msg); ifn(msg, &pfn, &len, &queueing, &priobits, &prioptr); } CmiSetInfo(msg, infofn); CmiSetXHandler(msg, CmiGetHandler(msg)); CmiSetHandler(msg, pinf->EnqueueHandler); if (pe==CLD_BROADCAST) CmiSyncBroadcastAndFree(len, msg); else if (pe==CLD_BROADCAST_ALL) CmiSyncBroadcastAllAndFree(len, msg); else CmiSyncSendAndFree(pe, len, msg); } else { CmiSetInfo(msg, infofn); CmiSetXHandler(msg, CmiGetHandler(msg)); CmiSetHandler(msg, pinf->HopHandler); CsdEnqueueGeneral(msg, queueing, priobits, prioptr); } }
void BGQTorusManager::populateLocalNodes() { if(CmiNumPartitions() == 1) return; CmiLock(bgq_lock); if(bgq_isLocalSet) { CmiUnlock(bgq_lock); return; } if(bgq_localNodes == NULL) bgq_localNodes = (int *)malloc(CmiNumNodesGlobal()*sizeof(int)); CmiAssert(bgq_localNodes != NULL); for(int i = 0; i < CmiNumNodesGlobal(); i++) bgq_localNodes[i] = -1; for(int i = 0; i < CmiNumNodes(); i++) { int a, b, c, d, e, t; int global; rankToCoordinates(CmiNodeFirst(i), a, b, c, d, e, t); global = CmiNodeOf(coordinatesToRank(a, b, c, d, e, t)); bgq_localNodes[global] = i; } bgq_isLocalSet = 1; CmiUnlock(bgq_lock); }
/** \function pidtonid * finds nids for pids 1 to CmiNumPes and stores them in an array * correspondingly also creates an array for nids to pids */ void pidtonid(int numpes) { CmiLock(cray_lock); if (pid2nid != NULL) { CmiUnlock(cray_lock); return; /* did once already */ } getDimension(&maxNID,&maxX,&maxY,&maxZ); int numCores = CmiNumCores(); pid2nid = (int *)malloc(sizeof(int) * numpes); #if XT4_TOPOLOGY || XT5_TOPOLOGY || XE6_TOPOLOGY int i, nid, ret; CmiAssert(rca_coords == NULL); rca_coords = (rca_mesh_coord_t *)malloc(sizeof(rca_mesh_coord_t)*(maxNID+1)); for (i=0; i<maxNID; i++) { rca_coords[i].mesh_x = rca_coords[i].mesh_y = rca_coords[i].mesh_z = -1; } for (i=0; i<numpes; i++) { PMI_Get_nid(CmiGetNodeGlobal(CmiNodeOf(i),CmiMyPartition()), &nid); pid2nid[i] = nid; CmiAssert(nid < maxNID); ret = rca_get_meshcoord(nid, &rca_coords[nid]); CmiAssert(ret != -1); } #endif CmiUnlock(cray_lock); }
/* send msg along the hypercube in broadcast. (Sameer) */ static void SendHyperCubeProc(int size, char *msg) { int startpe = CMI_BROADCAST_ROOT(msg)-1; int startnode = CmiNodeOf(startpe); #if CMK_SMP if (startpe > CmiNumPes()) startnode = startpe - CmiNumPes(); #endif SendHyperCube(size, msg, 0, startnode); #if CMK_SMP /* second send msgs to my peers on this node */ SendToPeers(size, msg); #endif }
void CmiDirect_manytomany_initialize_send ( void * h, unsigned tag, unsigned idx, unsigned displ, unsigned bytes, unsigned rank ) { BGPCmiDirectM2mHandle *handle = (BGPCmiDirectM2mHandle *) h; assert ( tag < MAX_CONN ); handle->m2m_sndlens [tag][idx] = bytes; handle->m2m_sdispls [tag][idx] = displ; handle->m2m_ranks [tag][idx] = CmiGetNodeGlobal(CmiNodeOf(rank),CmiMyPartition()); handle->m2m_permutation[tag][idx] = (idx+1)%handle->m2m_nsndranks[tag]; }
void CldHopHandler(char *msg) { peinfo *pinf = &(CpvAccess(peinf)); int len, queueing, priobits; unsigned int *prioptr; CldInfoFn ifn; CldPackFn pfn; int pe; if (pinf->rebalance) { /* do pe = ((lrand48()&0x7FFFFFFF)%CmiNumPes()); */ do pe = ((CrnRand()&0x7FFFFFFF)%CmiNumPes()); while (pe == pinf->mype); ifn = (CldInfoFn)CmiHandlerToFunction(CmiGetInfo(msg)); ifn(msg, &pfn, &len, &queueing, &priobits, &prioptr); if (pfn && CmiNodeOf(pe) != CmiMyNode()) { pfn(&msg); ifn(msg, &pfn, &len, &queueing, &priobits, &prioptr); } CmiSyncSendAndFree(pe, len, msg); pinf->rebalance--; } else { CmiSetHandler(msg, CmiGetXHandler(msg)); CmiHandleMessage(msg); } }
/* Parse the +setcpuaffinity / +pemap / +pemapfile / +commap / +excludecore
 * command-line options and bind each worker and comm thread to a core.
 * Collective: every rank calls this; it synchronizes via CmiNodeAllBarrier.
 * When no explicit pemap is given, PEs exchange hostname messages so PE 0
 * can assign per-node ranks (the non-Cray #else branch at the bottom). */
void CmiInitCPUAffinity(char **argv) {
  static skt_ip_t myip;
  int ret, i, exclude;
  hostnameMsg *msg;
  char *pemap = NULL;
  char *commap = NULL;
  char *pemapfile = NULL;
  int show_affinity_flag;
  int affinity_flag = CmiGetArgFlagDesc(argv,"+setcpuaffinity", "set cpu affinity");

  /* Each +excludecore records a core to avoid and implies +setcpuaffinity. */
  while (CmiGetArgIntDesc(argv,"+excludecore", &exclude, "avoid core when setting cpuaffinity")) {
    if (CmiMyRank() == 0) add_exclude(exclude);
    affinity_flag = 1;
  }

  /* The PE->core map may come from a file ... */
  if (CmiGetArgStringDesc(argv, "+pemapfile", &pemapfile, "define pe to core mapping file")) {
    FILE *fp;
    char buf[128];
    pemap = (char*)malloc(1024);
    fp = fopen(pemapfile, "r");
    if (fp == NULL) CmiAbort("pemapfile does not exist");
    /* Concatenate all lines of the file (newlines stripped) into one map string. */
    while (!feof(fp)) {
      if (fgets(buf, 128, fp)) {
        if (buf[strlen(buf)-1] == '\n') buf[strlen(buf)-1] = 0;
        strcat(pemap, buf);
      }
    }
    fclose(fp);
    if (CmiMyPe()==0) CmiPrintf("Charm++> read from pemap file '%s': %s\n", pemapfile, pemap);
  }
  /* ... or directly from the command line. */
  CmiGetArgStringDesc(argv, "+pemap", &pemap, "define pe to core mapping");
  if (pemap!=NULL && excludecount>0)
    CmiAbort("Charm++> +pemap can not be used with +excludecore.\n");
  CmiGetArgStringDesc(argv, "+commap", &commap, "define comm threads to core mapping");
  if (pemap!=NULL || commap!=NULL) affinity_flag = 1;
  show_affinity_flag = CmiGetArgFlagDesc(argv,"+showcpuaffinity", "print cpu affinity");

  cpuAffinityHandlerIdx = CmiRegisterHandler((CmiHandler)cpuAffinityHandler);
  cpuAffinityRecvHandlerIdx = CmiRegisterHandler((CmiHandler)cpuAffinityRecvHandler);
  if (CmiMyRank() ==0) {
    affLock = CmiCreateLock();
  }

#if CMK_BLUEGENEP || CMK_BLUEGENEQ
  /* Blue Gene ignores both options; warn once from PE 0. */
  if(affinity_flag){
    affinity_flag = 0;
    if(CmiMyPe()==0) CmiPrintf("Charm++> cpu affinity setting is not needed on Blue Gene, thus ignored.\n");
  }
  if(show_affinity_flag){
    show_affinity_flag = 0;
    if(CmiMyPe()==0) CmiPrintf("Charm++> printing cpu affinity is not supported on Blue Gene.\n");
  }
#endif

  if (!affinity_flag) {
    if (show_affinity_flag) CmiPrintCPUAffinity();
    return;
  }

  /* PE 0 reports the effective settings. */
  if (CmiMyPe() == 0) {
    CmiPrintf("Charm++> cpu affinity enabled. \n");
    if (excludecount > 0) {
      CmiPrintf("Charm++> cpuaffinity excludes core: %d", excludecore[0]);
      for (i=1; i<excludecount; i++) CmiPrintf(" %d", excludecore[i]);
      CmiPrintf(".\n");
    }
    if (pemap!=NULL)
      CmiPrintf("Charm++> cpuaffinity PE-core map : %s\n", pemap);
  }

  if (CmiMyPe() >= CmiNumPes()) { /* this is comm thread */
    /* comm thread either can float around, or pin down to the last rank.
       however it seems to be reportedly slower if it is floating */
    CmiNodeAllBarrier();
    if (commap != NULL) {
      /* Explicit comm map: comm thread k (k = global pe - global #pes)
         looks up its own core. */
      int mycore = search_pemap(commap, CmiMyPeGlobal()-CmiNumPesGlobal());
      if(CmiMyPe()-CmiNumPes()==0) printf("Charm++> set comm %d on node %d to core #%d\n", CmiMyPe()-CmiNumPes(), CmiMyNode(), mycore);
      if (-1 == CmiSetCPUAffinity(mycore))
        CmiAbort("set_cpu_affinity abort!");
      CmiNodeAllBarrier();
      if (show_affinity_flag) CmiPrintCPUAffinity();
      return; /* comm thread return */
    }
    else {
      /* if (CmiSetCPUAffinity(CmiNumCores()-1) == -1) CmiAbort("set_cpu_affinity abort!"); */
#if !CMK_CRAYXT && !CMK_CRAYXE && !CMK_CRAYXC && !CMK_BLUEGENEQ
      if (pemap == NULL) {
        /* No map: spin the network until every worker on this node has
           finished its affinity handshake. */
#if CMK_MACHINE_PROGRESS_DEFINED
        while (affinity_doneflag < CmiMyNodeSize()) CmiNetworkProgress();
#else
#if CMK_SMP
#error "Machine progress call needs to be implemented for cpu affinity!"
#endif
#endif
      }
#endif
#if CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
      /* if both pemap and commmap are NULL, will compute one */
      if (pemap != NULL)
#endif
      {
        CmiNodeAllBarrier();
        if (show_affinity_flag) CmiPrintCPUAffinity();
        return; /* comm thread return */
      }
    }
  }

  if (pemap != NULL && CmiMyPe()<CmiNumPes()) { /* work thread */
    int mycore = search_pemap(pemap, CmiMyPeGlobal());
    if(show_affinity_flag) CmiPrintf("Charm++> set PE %d on node %d to core #%d\n", CmiMyPe(), CmiMyNode(), mycore);
    if (mycore >= CmiNumCores()) {
      CmiPrintf("Error> Invalid core number %d, only have %d cores (0-%d) on the node. \n", mycore, CmiNumCores(), CmiNumCores()-1);
      CmiAbort("Invalid core number");
    }
    if (CmiSetCPUAffinity(mycore) == -1) CmiAbort("set_cpu_affinity abort!");
    CmiNodeAllBarrier();
    CmiNodeAllBarrier();
    /* if (show_affinity_flag) CmiPrintCPUAffinity(); */
    return;
  }

#if CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
  /* Cray without a pemap: compute a per-physical-node rank by walking
     backwards over PEs that share this node's XT node id. */
  {
    int numCores = CmiNumCores();
    int myid = getXTNodeID(CmiMyNodeGlobal(), CmiNumNodesGlobal());
    int myrank;
    int pe, mype = CmiMyPeGlobal();
    int node = CmiMyNodeGlobal();
    int nnodes = 0;
#if CMK_SMP
    if (CmiMyPe() >= CmiNumPes()) { /* this is comm thread */
      int node = CmiMyPe() - CmiNumPes();
      mype = CmiGetPeGlobal(CmiNodeFirst(node) + CmiMyNodeSize() - 1, CmiMyPartition()); /* last pe on SMP node */
      node = CmiGetNodeGlobal(node, CmiMyPartition());
    }
#endif
    /* Count PEs (and SMP-node transitions) before me on the same physical node. */
    pe = mype - 1;
    while (pe >= 0) {
      int n = CmiNodeOf(pe);
      if (n != node) { nnodes++; node = n; }
      if (getXTNodeID(n, CmiNumNodesGlobal()) != myid) break;
      pe --;
    }
    CmiAssert(numCores > 0);
    myrank = (mype - pe - 1 + nnodes)%numCores;
#if CMK_SMP
    if (CmiMyPe() >= CmiNumPes())
      myrank = (myrank + 1)%numCores; /* comm thread takes the next core */
#endif
    if (-1 != CmiSetCPUAffinity(myrank)) {
      /* NOTE(review): 'mynode' is undefined in this scope; this only
         compiles when DEBUGP expands to nothing — confirm. */
      DEBUGP(("Processor %d is bound to core #%d on node #%d\n", CmiMyPe(), myrank, mynode));
    }
    else{
      CmiPrintf("Processor %d set affinity failed!\n", CmiMyPe());
      CmiAbort("set cpu affinity abort!\n");
    }
  }
  if (CmiMyPe() < CmiNumPes())
    CmiNodeAllBarrier();
  CmiNodeAllBarrier();
#else
  /* get my ip address */
  if (CmiMyRank() == 0) {
#if CMK_HAS_GETHOSTNAME
    myip = skt_my_ip(); /* not thread safe, so only calls on rank 0 */
#else
    CmiAbort("Can not get unique name for the compute nodes. \n");
#endif
  }
  CmiNodeAllBarrier();

  /* prepare a msg to send: report this PE's node identity to PE 0. */
  msg = (hostnameMsg *)CmiAlloc(sizeof(hostnameMsg));
  CmiSetHandler((char *)msg, cpuAffinityHandlerIdx);
  msg->pe = CmiMyPe();
  msg->ip = myip;
  msg->ncores = CmiNumCores();
  DEBUGP(("PE %d's node has %d number of cores. \n", CmiMyPe(), msg->ncores));
  msg->rank = 0;
  CmiSyncSendAndFree(0, sizeof(hostnameMsg), (void *)msg);

  if (CmiMyPe() == 0) {
    /* PE 0 collects one hostname message per PE, then broadcasts the
       computed rank/node assignment (rankmsg holds two int arrays laid
       out after the header). */
    int i;
    hostTable = CmmNew();
    rankmsg = (rankMsg *)CmiAlloc(sizeof(rankMsg)+CmiNumPes()*sizeof(int)*2);
    CmiSetHandler((char *)rankmsg, cpuAffinityRecvHandlerIdx);
    rankmsg->ranks = (int *)((char*)rankmsg + sizeof(rankMsg));
    rankmsg->nodes = (int *)((char*)rankmsg + sizeof(rankMsg) + CmiNumPes()*sizeof(int));
    for (i=0; i<CmiNumPes(); i++) {
      rankmsg->ranks[i] = 0;
      rankmsg->nodes[i] = -1;
    }
    for (i=0; i<CmiNumPes(); i++) CmiDeliverSpecificMsg(cpuAffinityHandlerIdx);
  }

  /* receive broadcast from PE 0 */
  CmiDeliverSpecificMsg(cpuAffinityRecvHandlerIdx);
  CmiLock(affLock);
  affinity_doneflag++;
  CmiUnlock(affLock);
  CmiNodeAllBarrier();
#endif

  if (show_affinity_flag) CmiPrintCPUAffinity();
}
void TorusLB::strategy() { int index; // compute the average load by (compute load + background load) / numPesAvailable computeAverage(); // two heaps of self and pair computes makeTwoHeaps(); const int beginGroup = processors[0].Id; const int endGroup = beginGroup + P; #define INGROUP(PROC) ((PROC) >= beginGroup && (PROC) < endGroup) computeInfo *c; processorInfo *p, *minp; Iterator nextP; overLoad = 1.2; for(int I=0; I<numComputes; I++) { c = (computeInfo *) computePairHeap->deleteMax(); if ( ! c ) c = (computeInfo *) computeSelfHeap->deleteMax(); if(c->processor != -1) continue; // go to the next compute if(!c) CkAbort("TorusLB: Compute Heap empty!\n"); for(int j=0; j<6; j++) { bestPe[j] = 0; goodPe[j] = 0; badPe[j] = 0; } // Look at pes which have the compute's patches // HYBRID check if processor is in local group #define SELECT_REALPE(X) if INGROUP((X)) { \ selectPes(&processors[(X) - beginGroup], c); \ } const int realPe1 = patches[c->patch1].processor; SELECT_REALPE(realPe1) const int realPe2 = patches[c->patch2].processor; if ( realPe2 != realPe1 ) { SELECT_REALPE(realPe2) } // Try the processors which have the patches' proxies p = (processorInfo *)(patches[c->patch1].proxiesOn.iterator((Iterator *)&nextP)); while(p) { // patch 1 if INGROUP(p->Id) selectPes(p, c); p = (processorInfo *)(patches[c->patch1].proxiesOn.next((Iterator *)&nextP)); } p = (processorInfo *)(patches[c->patch2].proxiesOn.iterator((Iterator *)&nextP)); while(p) { // patch 2 if INGROUP(p->Id) selectPes(p, c); p = (processorInfo *)(patches[c->patch2].proxiesOn.next((Iterator *)&nextP)); } // see if we have found a processor to place the compute on p = 0; if((p = bestPe[5]) #if USE_TOPOMAP || (p = goodPe[5]) #endif || (p = bestPe[4]) #if USE_TOPOMAP || (p = goodPe[4]) #endif || (p = bestPe[3]) #if USE_TOPOMAP || (p = goodPe[3]) #endif || (p = bestPe[1]) #if USE_TOPOMAP || (p = goodPe[1]) #endif || (p = bestPe[2]) #if USE_TOPOMAP || (p = goodPe[2]) #endif || (p = bestPe[0]) #if USE_TOPOMAP 
|| (p = goodPe[0]) #endif ) { assign(c, p); continue; } // Try all pes on the nodes of the home patches if ( CmiNumNodes() > 1 ) { // else not useful double minLoad = overLoad * averageLoad; minp = 0; int realNode1 = CmiNodeOf(realPe1); int nodeSize = CmiNodeSize(realNode1); if ( nodeSize > 1 ) { // else did it already int firstpe = CmiNodeFirst(realNode1); for ( int rpe = firstpe; rpe < firstpe+nodeSize; ++rpe ) { if INGROUP(rpe) { p = &processors[rpe - beginGroup]; if ( p->available && ( p->load + c->load < minLoad ) ) { minLoad = p->load + c->load; minp = p; } } } } if ( realPe2 != realPe1 ) { int realNode2 = CmiNodeOf(realPe2); if ( realNode2 != realNode1 ) { // else did it already nodeSize = CmiNodeSize(realNode2); if ( nodeSize > 1 ) { int firstpe = CmiNodeFirst(realNode2); for ( int rpe = firstpe; rpe < firstpe+nodeSize; ++rpe ) { if INGROUP(rpe) { p = &processors[rpe - beginGroup]; if ( p->available && ( p->load + c->load < minLoad ) ) { minLoad = p->load + c->load; minp = p; } } } }