/******************
 * Initialization routine for the SysV shared-memory layer.
 *
 * Allocates the global sysvshmContext, aborts (via CmiAbort) when
 * Cmi_charmrun_pid is not positive, and asks calculateNodeSizeAndRank()
 * to process argv.  When the resulting nodesize is 1 the function returns
 * immediately; otherwise it calls setupSharedBuffers() and initAllSendQs().
 *
 * The context is allocated zero-filled so that every field, including the
 * SYSVSHM_STATS counters, has a defined value even on the early-return path
 * (this matches how CmiInitPxshm allocates its context).
 *
 * argv: passed through unchanged to calculateNodeSizeAndRank().
 * ****************/
void CmiInitSysvshm(char **argv){
	MACHSTATE(3,"CminitSysvshm start");

	/* calloc instead of malloc: no field is ever read uninitialized. */
	sysvshmContext = (SysvshmContext *)calloc(1,sizeof(SysvshmContext));
	if(sysvshmContext == NULL){
		CmiAbort("sysvshm: out of memory allocating SysvshmContext");
	}

	if(Cmi_charmrun_pid <= 0){
		CmiAbort("sysvshm must be run with charmrun");
	}

	calculateNodeSizeAndRank(argv);
	if(sysvshmContext->nodesize == 1){
		/* Nothing more is set up when nodesize is 1. */
		return;
	}
	MACHSTATE1(3,"CminitSysvshm %d calculateNodeSizeAndRank",sysvshmContext->nodesize);

	setupSharedBuffers();
	MACHSTATE2(3,"CminitSysvshm %d %d setupSharedBuffers",Cmi_charmrun_pid,sysvshmContext->nodesize);

	initAllSendQs();
	MACHSTATE2(3,"CminitSysvshm %d %d initAllSendQs",Cmi_charmrun_pid,sysvshmContext->nodesize);

	MACHSTATE2(3,"CminitSysvshm %d %d done",Cmi_charmrun_pid,sysvshmContext->nodesize);

#if SYSVSHM_STATS
	/* Explicit reset of the statistics counters. */
	sysvshmContext->sendCount=0;
	sysvshmContext->sendTime=0.0;
	sysvshmContext->validCheckCount=0;
	sysvshmContext->validCheckTime=0.0;
	sysvshmContext->commServerTime = 0;
	sysvshmContext->lockRecvCount = 0;
#endif
}
/**
 * Issue a LAPI active-message transfer of `msg` to `destNode`.
 *
 * Fills in the Am member of a lapi_xfer_t and hands it to LAPI_Xfer through
 * the check_lapi wrapper.
 *
 *   destNode  target of the transfer (stored in Am.tgt)
 *   size      number of bytes of user data (stored in Am.udata_len)
 *   msg       user data pointer (stored in Am.udata)
 *   shdlr     send-completion handler passed on to LAPI (Am.shdlr)
 *   sinfo     opaque argument for shdlr (Am.sinfo)
 *
 * NOTE(review): `deliverable` below is not declared in this function; it has
 * to resolve at file scope whenever MACHSTATE tracing is compiled in --
 * confirm.
 */
static INLINE_KEYWORD void lapiSendFn(int destNode, int size, char *msg, scompl_hndlr_t *shdlr, void *sinfo) {
    lapi_xfer_t xfer_cmd;

    MACHSTATE3(2,"lapiSendFn to destNode=%d with msg %p (isImm=%d) begin {",destNode,msg, CmiIsImmediate(msg));
    MACHSTATE3(2, "inside lapiSendFn 1: size=%d, sinfo=%p, deliverable=%d", size, sinfo, deliverable);
    MACHSTATE2(2, "Ready to call LAPI_Xfer with destNode=%d, destRank=%d",destNode,CMI_DEST_RANK(msg));

    /* Transfer kind and destination. */
    xfer_cmd.Am.Xfer_type = LAPI_AM_XFER;
    xfer_cmd.Am.flags     = 0;
    xfer_cmd.Am.tgt       = destNode;
    xfer_cmd.Am.hdr_hdl   = lapiHeaderHandler;

    /* No user header is sent; the whole message travels as user data. */
    xfer_cmd.Am.uhdr      = NULL;
    xfer_cmd.Am.uhdr_len  = 0;
    xfer_cmd.Am.udata     = msg;
    xfer_cmd.Am.udata_len = size;

    /* Completion callback supplied by the caller. */
    xfer_cmd.Am.shdlr     = shdlr;
    xfer_cmd.Am.sinfo     = sinfo;

    /* No counters are used. */
    xfer_cmd.Am.tgt_cntr  = NULL;
    xfer_cmd.Am.org_cntr  = NULL;
    xfer_cmd.Am.cmpl_cntr = NULL;

    check_lapi(LAPI_Xfer,(lapiContext, &xfer_cmd));

    MACHSTATE(2,"} lapiSendFn end");
}
/* initnode node table reply format:
 +-------------------------------------------------------
 | 4 bytes  |   Number of nodes n                       ^
 |          |   (big-endian binary integer)       4+12*n bytes
 +-------------------------------------------------     |
 ^  |        (one entry for each node)            ^     |
 |  | 4 bytes  |   Number of PEs for this node    |     |
 n  | 4 bytes  |   IP address of this node   12*n bytes |
 |  | 4 bytes  |   Data (UDP) port of this node   |     |
 v  |          |   (big-endian binary integers)   v     v
 ---+----------------------------------------------------

 NOTE(review): the code below also reads a mach_id field from every
 ChNodeinfo entry, which the diagram does not list -- confirm the real
 per-entry size against the ChNodeinfo definition.
*/
/**
 * Decode the node table carried in `msg` and build the global node tables.
 *
 * msg->data is interpreted as one ChMessageInt_t (the node count) followed
 * by that many ChNodeinfo entries.  The function
 *   - sets _Cmi_numnodes and verifies that msg->len equals
 *     sizeof(ChMessageInt_t) + _Cmi_numnodes*sizeof(ChNodeinfo), printing a
 *     message and calling machine_exit(1) otherwise;
 *   - allocates and fills `nodes` (nodestart, nodesize, mach_id, IP,
 *     dataport, addr, and sock when CMK_USE_TCP);
 *   - records Cmi_nodestart, _Cmi_mynodesize and Cmi_self_IP for the entry
 *     whose index equals _Cmi_mynode;
 *   - sets _Cmi_numpes to the sum of all node sizes;
 *   - allocates `nodes_by_pe` and points every PE slot at its node, plus
 *     (when CMK_CPV_IS_SMP is defined) one extra slot per node for the
 *     communication threads.
 *
 * Both allocations are verified with _MEMCHECK before being used.
 */
static void node_addresses_store(ChMessage *msg)
{
  ChMessageInt_t *n32=(ChMessageInt_t *)msg->data;
  ChNodeinfo *d=(ChNodeinfo *)(n32+1);
  int nodestart;
  int i,j,n;
  MACHSTATE(1,"node_addresses_store {");
  _Cmi_numnodes=ChMessageInt(n32[0]);

  if ((sizeof(ChMessageInt_t)+sizeof(ChNodeinfo)*_Cmi_numnodes)
         !=(unsigned int)msg->len)
    {printf("Node table has inconsistent length!");machine_exit(1);}

  nodes = (OtherNode)malloc(_Cmi_numnodes * sizeof(struct OtherNodeStruct));
  /* Check the allocation before the loop below writes through it. */
  _MEMCHECK(nodes);

  nodestart=0;
  for (i=0; i<_Cmi_numnodes; i++) {
    nodes[i].nodestart = nodestart;
    nodes[i].nodesize  = ChMessageInt(d[i].nPE);
    MACHSTATE2(3,"node %d nodesize %d",i,nodes[i].nodesize);
    nodes[i].mach_id = ChMessageInt(d[i].mach_id);

    nodes[i].IP=d[i].IP;
    if (i==_Cmi_mynode) {
      /* Remember this process's own entry. */
      Cmi_nodestart=nodes[i].nodestart;
      _Cmi_mynodesize=nodes[i].nodesize;
      Cmi_self_IP=nodes[i].IP;
    }

    nodes[i].dataport = ChMessageInt(d[i].dataport);
    nodes[i].addr = skt_build_addr(nodes[i].IP,nodes[i].dataport);
#if CMK_USE_TCP
    nodes[i].sock = INVALID_SOCKET;
#endif
    /* The next node's first PE follows this node's last PE. */
    nodestart+=nodes[i].nodesize;
  }
  _Cmi_numpes=nodestart;

  n = _Cmi_numpes;
#ifdef CMK_CPV_IS_SMP
  /* One additional slot per node (filled in at the end of this function). */
  n += _Cmi_numnodes;
#endif
  nodes_by_pe = (OtherNode*)malloc(n * sizeof(OtherNode));
  _MEMCHECK(nodes_by_pe);

  for (i=0; i<_Cmi_numnodes; i++) {
    OtherNode node = nodes + i;
    OtherNode_init(node);
    for (j=0; j<node->nodesize; j++) {
      nodes_by_pe[j + node->nodestart] = node;
    }
  }
#ifdef CMK_CPV_IS_SMP
  /* index for communication threads */
  for (i=_Cmi_numpes; i<_Cmi_numpes+_Cmi_numnodes; i++) {
    OtherNode node = nodes + i-_Cmi_numpes;
    nodes_by_pe[i] = node;
  }
#endif
  MACHSTATE(1,"} node_addresses_store");
}
/**
 * Release an outgoing message once nothing references it any more.
 *
 * Does nothing while ogm->refcount is non-zero.  At refcount zero:
 *   - freemode 'A' is switched to 'X' and the message is kept;
 *   - any other freemode frees the OutgoingMsg record, and additionally
 *     frees ogm->data with CmiFree unless freemode is 'G'.
 */
void GarbageCollectMsg(OutgoingMsg ogm)
{
  MACHSTATE2(3,"GarbageCollectMsg called on ogm %p refcount %d",ogm,ogm->refcount);

  /* Still referenced: nothing to do yet. */
  if (ogm->refcount != 0)
    return;

  if (ogm->freemode == 'A') {
    /* Only the mode changes here; the record itself is not freed. */
    ogm->freemode = 'X';
    return;
  }

  /* The payload is released for every remaining mode except 'G'. */
  if (ogm->freemode != 'G')
    CmiFree(ogm->data);
  FreeOutgoingMsg(ogm);
}
/**
 * Drain the queued broadcast messages.
 *
 * Compiled to an empty body unless CMK_OFFLOAD_BCAST_PROCESS is set.
 * Otherwise pops messages from procBcastQ until the queue yields NULL,
 * handing each one to processProcBcastMsg(); when
 * CMK_NODE_QUEUE_AVAILABLE is also set, nodeBcastQ is then drained the same
 * way through processNodeBcastMsg().
 */
static void processBcastQs() {
#if CMK_OFFLOAD_BCAST_PROCESS
    char *msg;

    /* Processor-level broadcasts. */
    while ((msg = CMIQueuePop(CsvAccess(procBcastQ))) != NULL) {
        MACHSTATE2(4, "[%d]: process a proc-level bcast msg %p begin{", CmiMyNode(), msg);
        processProcBcastMsg(CMI_MSG_SIZE(msg), msg);
        MACHSTATE2(4, "[%d]: process a proc-level bcast msg %p end}", CmiMyNode(), msg);
    }

#if CMK_NODE_QUEUE_AVAILABLE
    /* Node-level broadcasts. */
    while ((msg = CMIQueuePop(CsvAccess(nodeBcastQ))) != NULL) {
        MACHSTATE2(4, "[%d]: process a node-level bcast msg %p begin{", CmiMyNode(), msg);
        processNodeBcastMsg(CMI_MSG_SIZE(msg), msg);
        MACHSTATE2(4, "[%d]: process a node-level bcast msg %p end}", CmiMyNode(), msg);
    }
#endif
#endif
}
/*************** * calculate the name of the shared objects and semaphores * * name scheme * shared memory: charm_pxshm_<recvernoderank>_<sendernoderank> * semaphore : charm_pxshm_<recvernoderank>_<sendernoderank>.sem for semaphore for that shared object * the semaphore name used by us is the same as the shared memory object name * the posix library adds the semaphore tag // in linux at least . other machines might need more portable code * * open these shared objects and semaphores * *********/ void setupSharedBuffers(){ int i=0; allocBufNameStrings(&(pxshmContext->recvBufNames)); MACHSTATE(3,"allocBufNameStrings for recvBufNames done"); MEMDEBUG(CmiMemoryCheck()); allocBufNameStrings((&pxshmContext->sendBufNames)); MACHSTATE(3,"allocBufNameStrings for sendBufNames done"); for(i=0;i<pxshmContext->nodesize;i++){ if(i != pxshmContext->noderank){ snprintf(pxshmContext->recvBufNames[i],NAMESTRLEN-1,"%s_%d_%d",pxshmContext->prefixStr,pxshmContext->noderank+pxshmContext->nodestart,i+pxshmContext->nodestart); MACHSTATE2(3,"recvBufName %s with rank %d",pxshmContext->recvBufNames[i],i) snprintf(pxshmContext->sendBufNames[i],NAMESTRLEN-1,"%s_%d_%d",pxshmContext->prefixStr,i+pxshmContext->nodestart,pxshmContext->noderank+pxshmContext->nodestart); MACHSTATE2(3,"sendBufName %s with rank %d",pxshmContext->sendBufNames[i],i); } } createShmObjectsAndSems(&(pxshmContext->recvBufs),pxshmContext->recvBufNames); createShmObjectsAndSems(&(pxshmContext->sendBufs),pxshmContext->sendBufNames); for(i=0;i<pxshmContext->nodesize;i++){ if(i != pxshmContext->noderank){ //CmiAssert(pxshmContext->sendBufs[i].header->count == 0); pxshmContext->sendBufs[i].header->count = 0; pxshmContext->sendBufs[i].header->bytes = 0; } } #if CMK_SMP && ( CMK_CRAYXE || CMK_CRAYXC ) if (PMI_Barrier() != GNI_RC_SUCCESS) return; #else if (CmiBarrier() != 0) return; #endif freeSharedBuffers(); pxshm_freed = 1; }
/**
 * Allocate a table of name strings, one per rank on this node.
 *
 * A single pool of (nodesize-1) * NAMESTRLEN bytes is allocated and carved
 * into consecutive NAMESTRLEN-byte slots.  *bufName receives a freshly
 * allocated array of nodesize pointers: the entry for our own noderank is
 * NULL, every other entry points at the next unused slot of the pool, in
 * increasing rank order.
 *
 * NOTE(review): neither allocation is checked for failure here.
 */
void allocBufNameStrings(char ***bufName){
	int rank;
	int poolBytes = sizeof(char)*NAMESTRLEN*(pxshmContext->nodesize-1);
	char *pool = malloc(poolBytes);
	char *nextSlot = pool;
	char **table;

	MACHSTATE2(3,"allocBufNameStrings tmp %p totalAlloc %d",pool,poolBytes);

	table = (char **)malloc(sizeof(char *)*pxshmContext->nodesize);
	*bufName = table;

	for(rank=0;rank<pxshmContext->nodesize;rank++){
		if(rank == pxshmContext->noderank){
			/* No buffer to ourselves. */
			table[rank] = NULL;
			continue;
		}
		table[rank] = nextSlot;
		nextSlot += NAMESTRLEN;
	}
}
/****************** * Initialization routine * currently just testing start up * ****************/ void CmiInitPxshm(char **argv){ char *env; MACHSTATE(3,"CminitPxshm start"); pxshmContext = (PxshmContext *)calloc(1,sizeof(PxshmContext)); calculateNodeSizeAndRank(argv); if(pxshmContext->nodesize == 1) return; MACHSTATE1(3,"CminitPxshm %d calculateNodeSizeAndRank",pxshmContext->nodesize); env = getenv("CHARM_PXSHM_POOL_SIZE"); if (env) { SHMBUFLEN = CmiReadSize(env); } env = getenv("CHARM_PXSHM_MESSAGE_MAX_SIZE"); if (env) { SHMMAXSIZE = CmiReadSize(env); } if (SHMMAXSIZE > SHMBUFLEN) CmiAbort("Error> Pxshm pool size is set too small in env variable CHARM_PXSHM_POOL_SIZE"); SENDQSTARTSIZE = 32 * pxshmContext->nodesize; if (_Cmi_mynode == 0) printf("Charm++> pxshm enabled: %d cores per node, buffer size: %.1fMB\n", pxshmContext->nodesize, SHMBUFLEN/1024.0/1024.0); #if CMK_CRAYXE || CMK_CRAYXC srand(getpid()); int Cmi_charmrun_pid = rand(); PMI_Bcast(&Cmi_charmrun_pid, sizeof(int)); snprintf(&(pxshmContext->prefixStr[0]),PREFIXSTRLEN-1,"charm_pxshm_%d",Cmi_charmrun_pid); #endif MACHSTATE2(3,"CminitPxshm %s %d pre setupSharedBuffers",pxshmContext->prefixStr,pxshmContext->nodesize); setupSharedBuffers(); MACHSTATE2(3,"CminitPxshm %s %d setupSharedBuffers",pxshmContext->prefixStr,pxshmContext->nodesize); initAllSendQs(); MACHSTATE2(3,"CminitPxshm %s %d initAllSendQs",pxshmContext->prefixStr,pxshmContext->nodesize); MACHSTATE2(3,"CminitPxshm %s %d done",pxshmContext->prefixStr,pxshmContext->nodesize); #if PXSHM_STATS pxshmContext->sendCount=0; pxshmContext->sendTime=0.0; pxshmContext->validCheckCount=0; pxshmContext->validCheckTime=0.0; pxshmContext->commServerTime = 0; pxshmContext->lockRecvCount = 0; #endif signal(SIGSEGV, cleanupOnAllSigs); signal(SIGFPE, cleanupOnAllSigs); signal(SIGILL, cleanupOnAllSigs); signal(SIGTERM, cleanupOnAllSigs); signal(SIGABRT, cleanupOnAllSigs); signal(SIGQUIT, cleanupOnAllSigs); signal(SIGBUS, cleanupOnAllSigs); signal(SIGINT, 
cleanupOnAllSigs); signal(SIGTRAP, cleanupOnAllSigs); #if 0 char name[64]; gethostname(name,64); printf("[%d] name: %s\n", myrank, name); #endif };
/**
 * Enforce per-source message ordering for an incoming message.
 *
 * Returns 1 when the caller must NOT process `msg` itself: either the
 * message arrived out of order and has been parked in the per-source
 * window, or it was the awaited message and has already been dispatched
 * here together with the buffered messages that directly follow it.
 * Returns 0 when `msg` is the awaited message and nothing was buffered, so
 * the caller processes it as usual.
 *
 * cmplHdlrThdLock is held while the sequencing state in `info` is read and
 * updated; it is released before any message is dispatched.  Messages to be
 * dispatched are first copied into a private array for that reason.
 *
 * Window layout: slot k of oooMsgBuffer[srcpe] holds the message whose
 * sequence number is (k+1) ahead of the next awaited one, and
 * oooMaxOffset[srcpe] is the highest such offset currently occupied.
 *                                          -- after the notes of Chao Mei
 */
static int checkMsgInOrder(char *msg, MsgOrderInfo *info) {
    int srcpe, destrank;
    int incomingSeqNo, expectedSeqNo;
    int curOffset, maxOffset;
    int idx, curWinSize;
    void **window = NULL;
    void **ready;        /* private copy of the messages to dispatch */
    int numReady = 0;

    srcpe = CMI_MSG_SRCPE(msg);
    destrank = CMI_DEST_RANK(msg);   /* read as in the original; not used further */
    incomingSeqNo = CMI_MSG_SEQNO(msg);

    CmiLock(cmplHdlrThdLock);

    expectedSeqNo = getNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe);

    if (expectedSeqNo == incomingSeqNo) {
        maxOffset = info->oooMaxOffset[srcpe];

        if (maxOffset <= 0) {
            /* Awaited message, nothing parked behind it. */
            MACHSTATE1(4, "Receiving an expected msg with seqno=%d\n", incomingSeqNo);
            setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe, expectedSeqNo);
            CmiUnlock(cmplHdlrThdLock);
            return 0;
        }

        /* Awaited message with parked successors. */
        MACHSTATE1(4, "Processing all buffered ooo msgs (maxOffset=%d) including the just recved begin {", maxOffset);
        curWinSize = info->CUR_WINDOW_SIZE[srcpe];
        ready = malloc((curWinSize+1)*sizeof(void *));

        /* The message that just arrived goes first ... */
        ready[numReady++] = msg;

        /* ... followed by parked messages up to the first gap. */
        window = info->oooMsgBuffer[srcpe];
        curOffset = 0;
        while (curOffset < maxOffset) {
            char *parked = window[curOffset];
            if (parked == NULL) {
                /* The slot at maxOffset-1 is expected to be occupied. */
                CmiAssert(curOffset!=(maxOffset-1));
                break;
            }
            ready[numReady++] = parked;
            window[curOffset] = NULL;
            curOffset++;
        }

        if (curOffset < maxOffset) {
            /*
             * Stopped at a gap.  The last dispatched sequence number is
             * expectedSeqNo+curOffset, so the next awaited one is
             * expectedSeqNo+curOffset+1 and the window has to slide so that
             * slot 0 is again "one ahead of the next awaited message":
             * [curOffset+1, maxOffset) moves to [0, maxOffset-curOffset-1).
             */
            const int remaining = maxOffset - curOffset - 1;
            int k;

            for (k = 0; k < remaining; k++) {
                window[k] = window[curOffset + k + 1];
            }
            for (k = remaining; k < maxOffset; k++) {
                window[k] = NULL;
            }
            info->oooMaxOffset[srcpe] = remaining;
            setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe, expectedSeqNo+curOffset);
        } else {
            /* Every parked message was consumed. */
            info->oooMaxOffset[srcpe] = 0;
            setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe, expectedSeqNo+maxOffset);
        }

        CmiUnlock(cmplHdlrThdLock);

        /* Dispatch outside the lock. */
        for (idx = 0; idx < numReady; idx++) {
            char *curMsg = ready[idx];
            if (CMI_BROADCAST_ROOT(curMsg)>0) {
#if CMK_OFFLOAD_BCAST_PROCESS
                PCQueuePush(CsvAccess(procBcastQ), curMsg);
#else
                processProcBcastMsg(CMI_MSG_SIZE(curMsg), curMsg);
#endif
            } else {
                CmiPushPE(CMI_DEST_RANK(curMsg), curMsg);
            }
        }

        free(ready);

        MACHSTATE1(4, "Processing all buffered ooo msgs (actually processed %d) end }", curOffset);
        /* msg itself was dispatched above, so the caller must skip it. */
        return 1;
    }

    /* ---- Out-of-order arrival: park the message in the window. ---- */
    MACHSTATE2(4, "Receiving an out-of-order msg with seqno=%d, but expect seqno=%d", incomingSeqNo, expectedSeqNo);

    curWinSize = info->CUR_WINDOW_SIZE[srcpe];
    if (info->oooMsgBuffer[srcpe] == NULL) {
        /* First out-of-order message from this source: create the window. */
        info->oooMsgBuffer[srcpe] = malloc(curWinSize*sizeof(void *));
        memset(info->oooMsgBuffer[srcpe], 0, curWinSize*sizeof(void *));
    }
    window = info->oooMsgBuffer[srcpe];

    curOffset = incomingSeqNo - expectedSeqNo;
    maxOffset = info->oooMaxOffset[srcpe];
    if (curOffset < 0) {
        /* It's possible that the seqNo starts with another round (exceeding MAX_MSG_SEQNO) with 1 */
        curOffset += MAX_MSG_SEQNO;
    }

    if (curOffset > curWinSize) {
        /* Grow the window to the next multiple of its current size that
         * can hold this offset. */
        int newWinSize;
        void **grown;

        if (curOffset > MAX_WINDOW_SIZE) {
            CmiAbort("Exceeding the MAX_WINDOW_SIZE!\n");
        }
        newWinSize = ((curOffset/curWinSize)+1)*curWinSize;
        /*CmiPrintf("[%d]: WARNING: INCREASING WINDOW SIZE FROM %d TO %d\n", CmiMyPe(), curWinSize, newWinSize);*/
        grown = malloc(newWinSize*sizeof(void *));
        info->oooMsgBuffer[srcpe] = grown;
        memset(grown, 0, newWinSize*sizeof(void *));
        memcpy(grown, window, curWinSize*sizeof(void *));
        info->CUR_WINDOW_SIZE[srcpe] = newWinSize;
        free(window);
        window = grown;
    }

    /* Offset k lives in slot k-1; the slot must still be empty. */
    CmiAssert(window[curOffset-1] == NULL);
    window[curOffset-1] = msg;
    if (curOffset > maxOffset) {
        info->oooMaxOffset[srcpe] = curOffset;
    }

    CmiUnlock(cmplHdlrThdLock);
    return 1;
}
/* The following two are callbacks for sync and async send respectively */

/**
 * Send-completion callback that frees the message that was sent.
 *
 *   myLapiContext  LAPI handle supplied by the caller; not used here
 *   msg            the message buffer, released with CmiFree
 *   info           completion record; info->reason is passed to
 *                  check_lapi_err before the buffer is released
 */
static void ReleaseMsg(lapi_handle_t *myLapiContext, void *msg, lapi_sh_info_t *info)
{
    MACHSTATE2(2,"[%d] ReleaseMsg begin %p {",CmiMyNode(),msg);

    /* Check the completion status first, then drop the buffer. */
    check_lapi_err(info->reason, "ReleaseMsg", __LINE__);
    CmiFree(msg);

    MACHSTATE(2,"} ReleaseMsg end");
}