/**
 * Drive one round of machine-layer progress.
 *
 * When LAPI is not running in interrupt mode the network is polled explicitly
 * through LAPI_Probe. Afterwards, depending on the build configuration,
 * pending immediate messages are handled and the offloaded broadcast queues
 * are processed.
 */
void CmiMachineProgressImpl()
{
    /* Polling mode only: nobody else drives the network for us. */
    if (!CsvAccess(lapiInterruptMode)) {
        check_lapi(LAPI_Probe,(lapiContext));
    }

#if CMK_IMMEDIATE_MSG
    MACHSTATE1(2, "[%d] Handling Immediate Message begin {",CmiMyNode());
    CmiHandleImmediate();
    MACHSTATE1(2, "[%d] Handling Immediate Message end }",CmiMyNode());
#endif

#if CMK_SMP && !CMK_SMP_NO_COMMTHD && CMK_OFFLOAD_BCAST_PROCESS
    /* FIXME (carried over from the original): the rank test below is
     * presumably meant to select the communication thread -- confirm. */
    if (CmiMyRank()==CmiMyNodeSize()) {
        processBcastQs();
    }
#endif
}
/******************
 * Initialization routine for the SysV shared-memory layer.
 *
 * Allocates the global sysvshmContext, works out this process's node size
 * and rank from the command line, and -- unless the node holds a single
 * process -- sets up the shared buffers and the per-destination send queues.
 *
 * The context is zero-initialised (calloc) so that every field, including the
 * optional statistics counters, has a defined value even on the
 * single-process early-return path.
 *
 * argv: command-line arguments, forwarded to calculateNodeSizeAndRank().
 ****************/
void CmiInitSysvshm(char **argv){
	MACHSTATE(3,"CminitSysvshm start");

	sysvshmContext = (SysvshmContext *)calloc(1, sizeof(SysvshmContext));
	if(sysvshmContext == NULL){
		CmiAbort("sysvshm: out of memory allocating SysvshmContext");
	}

	if(Cmi_charmrun_pid <= 0){
		CmiAbort("sysvshm must be run with charmrun");
	}

#if SYSVSHM_STATS
	/* Already zero thanks to calloc; kept explicit to document the counters. */
	sysvshmContext->sendCount=0;
	sysvshmContext->sendTime=0.0;
	sysvshmContext->validCheckCount=0;
	sysvshmContext->validCheckTime=0.0;
	sysvshmContext->commServerTime = 0;
	sysvshmContext->lockRecvCount = 0;
#endif

	calculateNodeSizeAndRank(argv);
	if(sysvshmContext->nodesize == 1){
		/* Nothing to share with: skip buffer and queue setup. */
		return;
	}
	MACHSTATE1(3,"CminitSysvshm %d calculateNodeSizeAndRank",sysvshmContext->nodesize);

	setupSharedBuffers();
	MACHSTATE2(3,"CminitSysvshm %d %d setupSharedBuffers",Cmi_charmrun_pid,sysvshmContext->nodesize);

	initAllSendQs();
	MACHSTATE2(3,"CminitSysvshm %d %d initAllSendQs",Cmi_charmrun_pid,sysvshmContext->nodesize);

	MACHSTATE2(3,"CminitSysvshm %d %d done",Cmi_charmrun_pid,sysvshmContext->nodesize);
}
CmiCommHandle CmiAsyncBroadcastFn(int size, char *msg) { #if ENSURE_MSG_PAIRORDER /* Not sure how to add the msg seq no for async broadcast messages --Chao Mei */ /* so abort here ! */ CmiAssert(0); return 0; #else int i, rank; int mype = CmiMyPe(); #if ENABLE_CONVERSE_QD CQdCreate(CpvAccess(cQdState), CmiNumPes()-1); #endif MACHSTATE1(3,"[%d] Sending async broadcast message from {",CmiMyNode()); CMI_BROADCAST_ROOT(msg) = 0; void *handle = malloc(sizeof(int)); *((int *)handle) = CmiNumPes()-1; for (i=mype+1; i<CmiNumPes(); i++) { CMI_DEST_RANK(msg) = CmiRankOf(i); lapiSendFn(CmiNodeOf(i), size, msg, DeliveredMsg, handle); } for (i=0; i<mype; i++) { CMI_DEST_RANK(msg) = CmiRankOf(i); lapiSendFn(CmiNodeOf(i), size, msg, DeliveredMsg, handle); } MACHSTATE(3,"} Sending async broadcast message end"); return handle; #endif }
inline void emptyAllRecvBufs(){ struct sembuf sb; int i; int j,ret; union semun { int val; struct semid_ds *buf; ushort array[1]; } arg; for(i=0;i<sysvshmContext->nodesize;i++){ if(i != sysvshmContext->noderank){ sharedBufData *recvBuf = &(sysvshmContext->recvBufs[i]); if(recvBuf->header->count > 0){ #if SYSVSHM_STATS sysvshmContext->lockRecvCount++; #endif ACQUIRE(i); if(semop(recvBuf->semid, &sb, 1)>=0) { MACHSTATE1(3,"emptyRecvBuf to be called for rank %d",i); emptyRecvBuf(recvBuf); RELEASE(i); CmiAssert((semop(recvBuf->semid, &sb, 1))>=0); } } } } };
static void CmiNotifyStillIdle(CmiIdleState *s) { #if CMK_SHARED_VARS_UNAVAILABLE /*No comm. thread-- listen on sockets for incoming messages*/ MACHSTATE(1,"idle commserver {") CommunicationServer(Cmi_idlepoll?0:10, COMM_SERVER_FROM_SMP); MACHSTATE(1,"} idle commserver") #else #if CMK_SHARED_VARS_POSIX_THREADS_SMP if(_Cmi_sleepOnIdle ){ #endif int nSpins=20; /*Number of times to spin before sleeping*/ s->nIdles++; if (s->nIdles>nSpins) { /*Start giving some time back to the OS*/ s->sleepMs+=2; if (s->sleepMs>10) s->sleepMs=10; } /*Comm. thread will listen on sockets-- just sleep*/ if (s->sleepMs>0) { MACHSTATE1(3,"idle lock(%d) {",CmiMyPe()) CmiIdleLock_sleep(&s->cs->idle,s->sleepMs); CsdResetPeriodic(); /* check ccd callbacks when I am awakened */ MACHSTATE1(3,"} idle lock(%d)",CmiMyPe()) } #if CMK_SHARED_VARS_POSIX_THREADS_SMP }
/**
 * Send an outgoing message to another process on this node through the
 * SysV shared-memory buffers.
 *
 * ogm   : the outgoing message record (data, size, src, dst, refcount).
 * node  : not referenced by this function's body.
 * rank  : destination rank written into the datagram header.
 * broot : broadcast root written into the datagram header.
 *
 * The destination buffer's semaphore is tried via ACQUIRENW + semop():
 *  - on failure the message is appended to the destination's send queue and
 *    its refcount is bumped so the caller does not free it;
 *  - on success the message is written directly when the queue is empty,
 *    otherwise it is queued behind the pending ones and the queue is flushed.
 *
 * The semaphore release is executed unconditionally rather than inside
 * CmiAssert(): an assertion macro may compile to nothing, which would leave
 * the buffer locked after the first successful send.
 */
void CmiSendMessageSysvshm(OutgoingMsg ogm,OtherNode node,int rank,unsigned int broot){
	struct sembuf sb;

#if SYSVSHM_STATS
	double _startSendTime = CmiWallTimer();
#endif

	int dstRank = SysvshmRank(ogm->dst);
	MEMDEBUG(CmiMemoryCheck());

	DgramHeaderMake(ogm->data,rank,ogm->src,Cmi_charmrun_pid,1, broot);

	MACHSTATE4(3,"Send Msg Sysvshm ogm %p size %d dst %d dstRank %d",ogm,ogm->size,ogm->dst,dstRank);

	CmiAssert(dstRank >=0 && dstRank != sysvshmContext->noderank);

	sharedBufData *dstBuf = &(sysvshmContext->sendBufs[dstRank]);

	ACQUIRENW(sysvshmContext->noderank);
	if(semop(dstBuf->semid, &sb, 1)<0) {
		/**failed to get the lock insert into q and retain the message*/
		pushSendQ(sysvshmContext->sendQs[dstRank],ogm);
		ogm->refcount++;
		MEMDEBUG(CmiMemoryCheck());
		return;
	}

	/***
	 * We got the lock for this buffer
	 * first write all the messages in the sendQ and then write this guy
	 * */
	if(sysvshmContext->sendQs[dstRank]->numEntries == 0){
		/* send message user event */
		sendMessage(ogm,dstBuf,sysvshmContext->sendQs[dstRank]);
		MACHSTATE(3,"Sysvshm Send succeeded immediately");
	}else{
		ogm->refcount+=2;/*this message should not get deleted when the queue is flushed*/
		pushSendQ(sysvshmContext->sendQs[dstRank],ogm);
		MACHSTATE3(3,"Sysvshm ogm %p pushed to sendQ length %d refcount %d",ogm,sysvshmContext->sendQs[dstRank]->numEntries,ogm->refcount);
		int sent = flushSendQ(dstRank);
		ogm->refcount--; /*if it has been sent, can be deleted by caller, if not will be deleted when queue is flushed*/
		MACHSTATE1(3,"Sysvshm flushSendQ sent %d messages",sent);
	}

	/* unlock the recvbuffer*/
	RELEASE(sysvshmContext->noderank);
	if(semop(dstBuf->semid, &sb, 1) < 0){
		CmiAbort("sysvshm: failed to release the send buffer semaphore");
	}

#if SYSVSHM_STATS
	sysvshmContext->sendCount ++;
	sysvshmContext->sendTime += (CmiWallTimer()-_startSendTime);
#endif
	MEMDEBUG(CmiMemoryCheck());
}
/**
 * LAPI completion handler on the receive side.
 *
 * Installed by PumpMsgsBegin(); `am_info` is the buffer that now holds the
 * fully received message. The message is handed to handleOneRecvedMsg()
 * unless in-order checking (ENSURE_MSG_PAIRORDER) consumes it first.
 *
 * Notes carried over from the original author (Chao Mei), unverified here:
 *  - the handler may be executed by LAPI-internal threads, possibly on any
 *    core of the node, and LAPI may create more such threads adaptively, so
 *    queues touched from here need protection even in the non-SMP case;
 *  - the configurations to keep in mind are: non-SMP with interrupt, non-SMP
 *    with polling, SMP without comm thread (polling or interrupt), and SMP
 *    with comm thread (polling or interrupt);
 *  - cross-network messages are assumed to be delivered to the first proc
 *    (rank 0) of the node.
 */
static void PumpMsgsComplete(lapi_handle_t *myLapiContext, void *am_info) {
    char *msg = am_info;
#if ENSURE_MSG_PAIRORDER
    int bcastRoot, destRank;
    int needsOrderCheck;
#endif

    MACHSTATE3(2,"[%d] PumpMsgsComplete with msg %p (isImm=%d) begin {",CmiMyNode(), msg, CmiIsImmediate(msg));
#if ENSURE_MSG_PAIRORDER
    MACHSTATE3(2,"msg %p info: srcpe=%d, seqno=%d", msg, CMI_MSG_SRCPE(msg), CMI_MSG_SEQNO(msg));
#endif

#if ENSURE_MSG_PAIRORDER
    bcastRoot = CMI_BROADCAST_ROOT(msg);
    destRank = CMI_DEST_RANK(msg);

    /* Only proc-level messages take part in the ordering check. */
    needsOrderCheck = (bcastRoot >= 0);
#if CMK_NODE_QUEUE_AVAILABLE
    needsOrderCheck = needsOrderCheck && (destRank != DGRAM_NODEMESSAGE);
#endif

    if (needsOrderCheck) {
        MsgOrderInfo *orderInfo = &CpvAccessOther(p2pMsgSeqInfo, destRank);
        MACHSTATE1(2, "Check msg in-order for p2p msg %p", msg);
        if (checkMsgInOrder(msg,orderInfo)) {
            /* Buffered as out-of-order, or already delivered together
             * with the previously buffered messages. */
            MACHSTATE(2,"} PumpMsgsComplete end ");
            return;
        }
    }
#endif

    handleOneRecvedMsg(CMI_MSG_SIZE(msg), msg);

    MACHSTATE(2,"} PumpMsgsComplete end ");
}
/**
 * Derive this process's position inside its physical node.
 *
 * Reads the optional "+nodesize" argument (default 1) and fills in the
 * nodesize, noderank, nodestart and nodeend fields of sysvshmContext from
 * _Cmi_mynode. The last node is truncated when _Cmi_numnodes is not a
 * multiple of the node size.
 *
 * A non-positive "+nodesize" is rejected: it would otherwise be used as the
 * divisor of a modulo operation.
 *
 * argv: command-line arguments scanned by CmiGetArgIntDesc().
 */
void calculateNodeSizeAndRank(char **argv){
	sysvshmContext->nodesize=1;
	MACHSTATE(3,"calculateNodeSizeAndRank start");
	CmiGetArgIntDesc(argv, "+nodesize", &(sysvshmContext->nodesize),"Number of cores in this node");
	MACHSTATE1(3,"calculateNodeSizeAndRank argintdesc %d",sysvshmContext->nodesize);

	if(sysvshmContext->nodesize < 1){
		CmiAbort("+nodesize must be a positive integer");
	}

	sysvshmContext->noderank = _Cmi_mynode % (sysvshmContext->nodesize);
	MACHSTATE1(3,"calculateNodeSizeAndRank noderank %d",sysvshmContext->noderank);

	sysvshmContext->nodestart = _Cmi_mynode -sysvshmContext->noderank;
	MACHSTATE(3,"calculateNodeSizeAndRank nodestart ");

	sysvshmContext->nodeend = sysvshmContext->nodestart + sysvshmContext->nodesize -1;
	if(sysvshmContext->nodeend >= _Cmi_numnodes){
		/* Partial last node: clamp the end and shrink the size to match. */
		sysvshmContext->nodeend = _Cmi_numnodes-1;
		sysvshmContext->nodesize = (sysvshmContext->nodeend - sysvshmContext->nodestart) +1;
	}

	MACHSTATE3(3,"calculateNodeSizeAndRank nodestart %d nodesize %d noderank %d",sysvshmContext->nodestart,sysvshmContext->nodesize,sysvshmContext->noderank);
}
/**
 * Initialise the per-node state: the immediate-message queues and locks
 * (when CMK_IMMEDIATE_MSG is set) and the node-level receive queue and its
 * lock (when CMK_NODE_QUEUE_AVAILABLE is set).
 *
 * nodeState: the structure to fill in.
 */
void CmiNodeStateInit(CmiNodeState *nodeState)
{
    MACHSTATE1(4,"NodeStateInit %p", nodeState)

#if CMK_IMMEDIATE_MSG
    /* Queues first, then the locks that guard sending and receiving. */
    nodeState->immQ = CMIQueueCreate();
    nodeState->delayedImmQ = CMIQueueCreate();
    nodeState->immSendLock = CmiCreateLock();
    nodeState->immRecvLock = CmiCreateLock();
#endif

#if CMK_NODE_QUEUE_AVAILABLE
    nodeState->NodeRecv = CMIQueueCreate();
    nodeState->CmiNodeRecvLock = CmiCreateLock();
#endif

    MACHSTATE(4,"NodeStateInit done")
}
void calculateNodeSizeAndRank(char **argv){ pxshmContext->nodesize=1; MACHSTATE(3,"calculateNodeSizeAndRank start"); //CmiGetArgIntDesc(argv, "+nodesize", &(pxshmContext->nodesize),"Number of cores in this node (for non-smp case).Used by the shared memory communication layer"); CmiGetArgIntDesc(argv, "+nodesize", &(pxshmContext->nodesize),"Number of cores in this node"); MACHSTATE1(3,"calculateNodeSizeAndRank argintdesc %d",pxshmContext->nodesize); pxshmContext->noderank = _Cmi_mynode % (pxshmContext->nodesize); MACHSTATE1(3,"calculateNodeSizeAndRank noderank %d",pxshmContext->noderank); pxshmContext->nodestart = _Cmi_mynode -pxshmContext->noderank; MACHSTATE(3,"calculateNodeSizeAndRank nodestart "); pxshmContext->nodeend = pxshmContext->nodestart + pxshmContext->nodesize -1; if(pxshmContext->nodeend >= _Cmi_numnodes){ pxshmContext->nodeend = _Cmi_numnodes-1; pxshmContext->nodesize = (pxshmContext->nodeend - pxshmContext->nodestart) +1; } MACHSTATE3(3,"calculateNodeSizeAndRank nodestart %d nodesize %d noderank %d",pxshmContext->nodestart,pxshmContext->nodesize,pxshmContext->noderank); }
/**
 * LAPI header handler, run on the receive side.
 *
 * Allocates a buffer of msg_info->msg_len bytes with CmiAlloc() for the
 * incoming data, registers PumpMsgsComplete() as the completion handler with
 * that buffer as its argument, and returns the buffer to LAPI.
 *
 * Per the original note (Chao Mei), this runs when the first packet of a
 * message arrives, and the LAPI_SEND_REPLY flag asks for the completion
 * handler to be executed inline.
 */
static void* PumpMsgsBegin(lapi_handle_t *myLapiContext,
                           void *hdr, uint *uhdr_len,
                           lapi_return_info_t *msg_info,
                           compl_hndlr_t **comp_h, void **comp_am_info) {
    void *recvBuffer;

    MACHSTATE1(2,"[%d] PumpMsgsBegin begin {",CmiMyNode());

    recvBuffer = (void *)CmiAlloc(msg_info->msg_len);

    /* Completion handler and its argument. */
    *comp_am_info = recvBuffer;
    *comp_h = PumpMsgsComplete;
    msg_info->ret_flags = LAPI_SEND_REPLY;

    MACHSTATE(2,"} PumpMsgsBegin end");
    return recvBuffer;
}
/**
 * Try to flush every non-empty send queue towards the other processes on
 * this node.
 *
 * For each peer with queued messages, the destination buffer's semaphore is
 * taken, flushSendQ() is run for that peer, and the semaphore is released.
 * `sb` is handed to semop() right after ACQUIRE/RELEASE; presumably those
 * macros fill it in -- their definitions are not visible here.
 *
 * The release is performed unconditionally rather than inside CmiAssert():
 * an assertion macro may compile to nothing, which would leave the semaphore
 * held forever.
 */
inline void flushAllSendQs(){
	struct sembuf sb;
	int i;

	for(i=0;i<sysvshmContext->nodesize;i++){
		if(i != sysvshmContext->noderank && sysvshmContext->sendQs[i]->numEntries > 0){
			ACQUIRE(sysvshmContext->noderank);
			if(semop(sysvshmContext->sendBufs[i].semid, &sb, 1)>=0) {
				MACHSTATE1(3,"flushSendQ %d",i);
				flushSendQ(i);
				RELEASE(sysvshmContext->noderank);
				if(semop(sysvshmContext->sendBufs[i].semid, &sb, 1) < 0){
					CmiAbort("sysvshm: failed to release the send buffer semaphore");
				}
			}
		}
	}
}
CmiCommHandle CmiAsyncNodeBroadcastFn(int size, char *msg) { int i; #if ENABLE_CONVERSE_QD CQdCreate(CpvAccess(cQdState), CmiNumNodes()-1); #endif MACHSTATE1(3,"[%d] Sending async node broadcast message from {",CmiMyNode()); CMI_BROADCAST_ROOT(msg) = 0; CMI_DEST_RANK(msg) =DGRAM_NODEMESSAGE; void *handle = malloc(sizeof(int)); *((int *)handle) = CmiNumNodes()-1; for (i=CmiMyNode()+1; i<CmiNumNodes(); i++) { lapiSendFn(i, size, msg, DeliveredMsg, handle); } for (i=0; i<CmiMyNode(); i++) { lapiSendFn(i, size, msg, DeliveredMsg, handle); } MACHSTATE(3,"} Sending async broadcast message end"); return handle; }
/****************** * Initialization routine * currently just testing start up * ****************/ void CmiInitPxshm(char **argv){ char *env; MACHSTATE(3,"CminitPxshm start"); pxshmContext = (PxshmContext *)calloc(1,sizeof(PxshmContext)); calculateNodeSizeAndRank(argv); if(pxshmContext->nodesize == 1) return; MACHSTATE1(3,"CminitPxshm %d calculateNodeSizeAndRank",pxshmContext->nodesize); env = getenv("CHARM_PXSHM_POOL_SIZE"); if (env) { SHMBUFLEN = CmiReadSize(env); } env = getenv("CHARM_PXSHM_MESSAGE_MAX_SIZE"); if (env) { SHMMAXSIZE = CmiReadSize(env); } if (SHMMAXSIZE > SHMBUFLEN) CmiAbort("Error> Pxshm pool size is set too small in env variable CHARM_PXSHM_POOL_SIZE"); SENDQSTARTSIZE = 32 * pxshmContext->nodesize; if (_Cmi_mynode == 0) printf("Charm++> pxshm enabled: %d cores per node, buffer size: %.1fMB\n", pxshmContext->nodesize, SHMBUFLEN/1024.0/1024.0); #if CMK_CRAYXE || CMK_CRAYXC srand(getpid()); int Cmi_charmrun_pid = rand(); PMI_Bcast(&Cmi_charmrun_pid, sizeof(int)); snprintf(&(pxshmContext->prefixStr[0]),PREFIXSTRLEN-1,"charm_pxshm_%d",Cmi_charmrun_pid); #endif MACHSTATE2(3,"CminitPxshm %s %d pre setupSharedBuffers",pxshmContext->prefixStr,pxshmContext->nodesize); setupSharedBuffers(); MACHSTATE2(3,"CminitPxshm %s %d setupSharedBuffers",pxshmContext->prefixStr,pxshmContext->nodesize); initAllSendQs(); MACHSTATE2(3,"CminitPxshm %s %d initAllSendQs",pxshmContext->prefixStr,pxshmContext->nodesize); MACHSTATE2(3,"CminitPxshm %s %d done",pxshmContext->prefixStr,pxshmContext->nodesize); #if PXSHM_STATS pxshmContext->sendCount=0; pxshmContext->sendTime=0.0; pxshmContext->validCheckCount=0; pxshmContext->validCheckTime=0.0; pxshmContext->commServerTime = 0; pxshmContext->lockRecvCount = 0; #endif signal(SIGSEGV, cleanupOnAllSigs); signal(SIGFPE, cleanupOnAllSigs); signal(SIGILL, cleanupOnAllSigs); signal(SIGTERM, cleanupOnAllSigs); signal(SIGABRT, cleanupOnAllSigs); signal(SIGQUIT, cleanupOnAllSigs); signal(SIGBUS, cleanupOnAllSigs); signal(SIGINT, 
cleanupOnAllSigs); signal(SIGTRAP, cleanupOnAllSigs); #if 0 char name[64]; gethostname(name,64); printf("[%d] name: %s\n", myrank, name); #endif };
/**
 * Send-completion callback used by the asynchronous broadcast routines.
 *
 * `msg` is treated as a pointer to an int counter, which is decremented by
 * one after info->reason has been checked for a LAPI error.
 */
static void DeliveredMsg(lapi_handle_t *myLapiContext, void *msg, lapi_sh_info_t *info) {
    int *remaining = (int *)msg;

    MACHSTATE1(2,"[%d] DeliveredMsg begin {",CmiMyNode());
    check_lapi_err(info->reason, "DeliveredMsg", __LINE__);

    *remaining -= 1;

    MACHSTATE(2,"} DeliveredMsg end");
}
/**
 * Per-source in-order delivery check for a just-received message.
 *
 * Returns 1 when the caller must NOT process `msg` any further, because
 * either
 *   - `msg` is out of order and has been stored in the source's reorder
 *     window, or
 *   - `msg` is the expected one and there were buffered out-of-order (ooo)
 *     messages: `msg` and the contiguous run of buffered messages following
 *     it have already been delivered by this function.
 * Returns 0 when `msg` is the expected one and nothing was buffered; the
 * caller then delivers it itself.
 *
 * Window layout: slot k of oooMsgBuffer[srcpe] holds the message whose seqno
 * is k+1 ahead of the next expected seqno. oooMaxOffset[srcpe] is the highest
 * offset currently occupied (0 when the window is empty).
 *
 * All bookkeeping is done under cmplHdlrThdLock; delivery of the collected
 * messages happens after the lock is dropped.
 *
 * NOTE(review): the malloc() results below are used unchecked.
 * NOTE(review): seqno wrap-around on "expectedSeqNo + offset" is presumably
 * handled inside setNextExpectedMsgSeqNo() -- confirm.
 * --Chao Mei (original author)
 */
static int checkMsgInOrder(char *msg, MsgOrderInfo *info) {
    int srcpe, destrank;
    int incomingSeqNo, expectedSeqNo;
    int curOffset, maxOffset;
    int i, curWinSize;
    void **destMsgBuffer = NULL;

    /* numMsg is the number of msgs to be processed in this buffer*/
    /* Reason to have this extra copy of msgs to be processed: Reduce the atomic granularity */
    void **toProcessMsgBuffer;
    int numMsgs = 0;

    srcpe = CMI_MSG_SRCPE(msg);
    destrank = CMI_DEST_RANK(msg);
    incomingSeqNo = CMI_MSG_SEQNO(msg);

    CmiLock(cmplHdlrThdLock);

    expectedSeqNo = getNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe);
    if (expectedSeqNo == incomingSeqNo) {
        /* Two cases: has ooo msg buffered or not */
        maxOffset = (info->oooMaxOffset)[srcpe];
        if (maxOffset>0) {
            MACHSTATE1(4, "Processing all buffered ooo msgs (maxOffset=%d) including the just recved begin {", maxOffset);
            curWinSize = info->CUR_WINDOW_SIZE[srcpe];
            /* Room for the whole window plus the message just received. */
            toProcessMsgBuffer = malloc((curWinSize+1)*sizeof(void *));
            /* process the msg just recved */
            toProcessMsgBuffer[numMsgs++] = msg;
            /* process the buffered ooo msg until the first empty slot in the window */
            destMsgBuffer = (info->oooMsgBuffer)[srcpe];
            for (curOffset=0; curOffset<maxOffset; curOffset++) {
                char *curMsg = destMsgBuffer[curOffset];
                if (curMsg == NULL) {
                    /* The slot at maxOffset-1 must be occupied by definition. */
                    CmiAssert(curOffset!=(maxOffset-1));
                    break;
                }
                toProcessMsgBuffer[numMsgs++] = curMsg;
                destMsgBuffer[curOffset] = NULL;
            }
            /* Update expected seqno, maxOffset and slide the window */
            if (curOffset < maxOffset) {
                int i;
                /**
                 * now, the seqno of the next to-be-recved msg should be
                 * "expectedSeqNo+curOffset+1" as the seqno of the just
                 * processed msg is "expectedSeqNo+curOffset". We need to slide
                 * the msg buffer window from "curOffset+1" because the first
                 * element of the buffer window should always point to the ooo
                 * msg that's 1 in terms of seqno ahead of the next to-be-recved
                 * msg. --Chao Mei
                 */

                /* moving [curOffset+1, maxOffset) to [0, maxOffset-curOffset-1) in the window */
                /* The following two loops could be combined --Chao Mei */
                for (i=0; i<maxOffset-curOffset-1; i++) {
                    destMsgBuffer[i] = destMsgBuffer[curOffset+i+1];
                }
                /* Clear the slots vacated by the slide. */
                for (i=maxOffset-curOffset-1; i<maxOffset; i++) {
                    destMsgBuffer[i] = NULL;
                }
                (info->oooMaxOffset)[srcpe] = maxOffset-curOffset-1;
                setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe, expectedSeqNo+curOffset);
            } else {
                /* there's no remaining buffered ooo msgs */
                (info->oooMaxOffset)[srcpe] = 0;
                setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe, expectedSeqNo+maxOffset);
            }

            CmiUnlock(cmplHdlrThdLock);

            /* Process the msgs (outside the lock) */
            for (i=0; i<numMsgs; i++) {
                char *curMsg = toProcessMsgBuffer[i];
                if (CMI_BROADCAST_ROOT(curMsg)>0) {
                    /* Broadcast message: offload it or relay it right here. */
#if CMK_OFFLOAD_BCAST_PROCESS
                    PCQueuePush(CsvAccess(procBcastQ), curMsg);
#else
                    processProcBcastMsg(CMI_MSG_SIZE(curMsg), curMsg);
#endif
                } else {
                    CmiPushPE(CMI_DEST_RANK(curMsg), curMsg);
                }
            }

            free(toProcessMsgBuffer);

            MACHSTATE1(4, "Processing all buffered ooo msgs (actually processed %d) end }", curOffset);
            /**
             * Since we have processed all buffered ooo msgs including
             * this just recved one, 1 should be returned so that this
             * msg no longer needs processing
             */
            return 1;
        } else {
            /* An expected msg recved without any ooo msg buffered */
            MACHSTATE1(4, "Receiving an expected msg with seqno=%d\n", incomingSeqNo);
            setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe, expectedSeqNo);

            CmiUnlock(cmplHdlrThdLock);
            return 0;
        }
    }

    /* Out-of-order message: store it in the reorder window (lock still held). */
    MACHSTATE2(4, "Receiving an out-of-order msg with seqno=%d, but expect seqno=%d", incomingSeqNo, expectedSeqNo);
    curWinSize = info->CUR_WINDOW_SIZE[srcpe];
    /* The window for a source is allocated on first use. */
    if ((info->oooMsgBuffer)[srcpe]==NULL) {
        (info->oooMsgBuffer)[srcpe] = malloc(curWinSize*sizeof(void *));
        memset((info->oooMsgBuffer)[srcpe], 0, curWinSize*sizeof(void *));
    }
    destMsgBuffer = (info->oooMsgBuffer)[srcpe];
    curOffset = incomingSeqNo - expectedSeqNo;
    maxOffset = (info->oooMaxOffset)[srcpe];
    if (curOffset<0) {
        /* It's possible that the seqNo starts with another round (exceeding MAX_MSG_SEQNO) with 1 */
        curOffset += MAX_MSG_SEQNO;
    }
    if (curOffset > curWinSize) {
        /* Grow the window to the next multiple of its size that fits curOffset. */
        int newWinSize;
        if (curOffset > MAX_WINDOW_SIZE) {
            CmiAbort("Exceeding the MAX_WINDOW_SIZE!\n");
        }
        newWinSize = ((curOffset/curWinSize)+1)*curWinSize;
        /*CmiPrintf("[%d]: WARNING: INCREASING WINDOW SIZE FROM %d TO %d\n", CmiMyPe(), curWinSize, newWinSize);*/
        (info->oooMsgBuffer)[srcpe] = malloc(newWinSize*sizeof(void *));
        memset((info->oooMsgBuffer)[srcpe], 0, newWinSize*sizeof(void *));
        memcpy((info->oooMsgBuffer)[srcpe], destMsgBuffer, curWinSize*sizeof(void *));
        info->CUR_WINDOW_SIZE[srcpe] = newWinSize;
        free(destMsgBuffer);
        destMsgBuffer = (info->oooMsgBuffer)[srcpe];
    }
    /* Offsets are 1-based, slots 0-based; the slot must still be free. */
    CmiAssert(destMsgBuffer[curOffset-1] == NULL);
    destMsgBuffer[curOffset-1] = msg;
    if (curOffset > maxOffset) (info->oooMaxOffset)[srcpe] = curOffset;

    CmiUnlock(cmplHdlrThdLock);
    return 1;
}
/**
 * Send `msg` (`size` bytes) to another process on this node through the
 * pxshm shared buffers.
 *
 * msg      : the message; LrtsPrepareEnvelope() is applied to it first.
 * size     : message size in bytes.
 * dstnode  : destination node, mapped to a node-local rank by PxshmRank().
 * refcount : pointer to the message's reference count, adjusted here so the
 *            message survives while it sits in a send queue.
 *
 * The destination buffer is locked with whichever mechanism the build
 * selects (PXSHM_OSSPINLOCK, PXSHM_LOCK or PXSHM_FENCE). The `if` that tests
 * the acquisition is chosen by the preprocessor, so the "failed" branch body
 * below is shared by all three variants.
 *  - Lock not acquired: queue the message, bump the refcount, return.
 *  - Lock acquired: write the message directly when the queue is empty,
 *    otherwise queue it behind the pending ones and flush the queue.
 * With SENDQ_LIST, a queue that becomes non-empty is linked at the head of
 * the list rooted at sendQ_head_index (next == -2 marks "not linked").
 */
void CmiSendMessagePxshm(char *msg, int size, int dstnode, int *refcount)
{
#if PXSHM_STATS
	double _startSendTime = CmiWallTimer();
#endif

	LrtsPrepareEnvelope(msg, size);

	int dstRank = PxshmRank(dstnode);
	MEMDEBUG(CmiMemoryCheck());

	/* A send to ourselves must never reach this layer. */
	CmiAssert(dstRank >=0 && dstRank != pxshmContext->noderank);

	sharedBufData *dstBuf = &(pxshmContext->sendBufs[dstRank]);
	PxshmSendQ *sendQ = pxshmContext->sendQs[dstRank];

#if PXSHM_OSSPINLOCK
	if(! OSSpinLockTry(&dstBuf->header->lock)){
#elif PXSHM_LOCK
	if(sem_trywait(dstBuf->mutex) < 0){
#elif PXSHM_FENCE
	/* Announce intent, yield the turn, then look at the receiver's flag. */
	dstBuf->header->flagSender = 1;
	dstBuf->header->turn = RECEIVER;
	CmiMemoryReadFence(0,0);
	CmiMemoryWriteFence(0,0);
	/* An earlier variant also tested "turn == RECEIVER" here. */
	if(dstBuf->header->flagReceiver){
	        dstBuf->header->flagSender = 0;
#endif
		/**failed to get the lock insert into q and retain the message*/
#if SENDQ_LIST
		if (sendQ->numEntries == 0 && sendQ->next == -2) {
			sendQ->next = sendQ_head_index;
			sendQ_head_index = dstRank;
		}
#endif
		pushSendQ(pxshmContext->sendQs[dstRank], msg, size, refcount);
		(*refcount)++;
		MEMDEBUG(CmiMemoryCheck());
		return;
	}else{
		/***
		 * We got the lock for this buffer
		 * first write all the messages in the sendQ and then write this guy
		 * */
		if(pxshmContext->sendQs[dstRank]->numEntries == 0){
			/* send message user event; the result is not inspected here */
			int ret = sendMessage(msg,size,refcount,dstBuf,pxshmContext->sendQs[dstRank]);
#if SENDQ_LIST
			/* The queue has entries after the call: make sure it is linked. */
			if (sendQ->numEntries > 0 && sendQ->next == -2)
			{
				sendQ->next = sendQ_head_index;
				sendQ_head_index = dstRank;
			}
#endif
			MACHSTATE(3,"Pxshm Send succeeded immediately");
		}else{
			(*refcount)+=2;/*this message should not get deleted when the queue is flushed*/
			pushSendQ(pxshmContext->sendQs[dstRank],msg,size,refcount);
			int sent = flushSendQ(sendQ);
			(*refcount)--; /*if it has been sent, can be deleted by caller, if not will be deleted when queue is flushed*/
			MACHSTATE1(3,"Pxshm flushSendQ sent %d messages",sent);
		}
		/* unlock the recvbuffer*/

#if PXSHM_OSSPINLOCK
		OSSpinLockUnlock(&dstBuf->header->lock);
#elif PXSHM_LOCK
		sem_post(dstBuf->mutex);
#elif PXSHM_FENCE
		CmiMemoryReadFence(0,0);
		CmiMemoryWriteFence(0,0);
		dstBuf->header->flagSender = 0;
#endif
	}
#if PXSHM_STATS
	pxshmContext->sendCount ++;
	pxshmContext->sendTime += (CmiWallTimer()-_startSendTime);
#endif
	MEMDEBUG(CmiMemoryCheck());

};

/* Forward declarations; both are defined later in this file. */
inline void emptyAllRecvBufs();
inline void flushAllSendQs();

/**********
 * Communication server for the pxshm layer:
 *   - extract all the messages from the recvBuffers you can
 *   - flush all sendQs
 * With PXSHM_STATS the elapsed wall time is added to commServerTime.
 * ***/
inline void CommunicationServerPxshm(){

#if PXSHM_STATS
	double _startCommServerTime =CmiWallTimer();
#endif

	MEMDEBUG(CmiMemoryCheck());
	emptyAllRecvBufs();
	flushAllSendQs();

#if PXSHM_STATS
	pxshmContext->commServerTime += (CmiWallTimer()-_startCommServerTime);
#endif

	MEMDEBUG(CmiMemoryCheck());
};

/* Idle hook: run one round of the pxshm communication server. `s` is unused. */
static void CmiNotifyStillIdlePxshm(CmiIdleState *s){
	CommunicationServerPxshm();
}

/* Begin-idle hook: delegates to the generic CmiNotifyStillIdle().
 * NOTE(review): this calls CmiNotifyStillIdle, not CmiNotifyStillIdlePxshm --
 * confirm that is intended. */
static void CmiNotifyBeginIdlePxshm(CmiIdleState *s)
{
	CmiNotifyStillIdle(s);
}
inline void emptyAllRecvBufs(){ int i; for(i=0;i<pxshmContext->nodesize;i++){ if(i != pxshmContext->noderank){ sharedBufData *recvBuf = &(pxshmContext->recvBufs[i]); if(recvBuf->header->count > 0){ #if PXSHM_STATS pxshmContext->lockRecvCount++; #endif #if PXSHM_OSSPINLOCK if(! OSSpinLockTry(&recvBuf->header->lock)){ #elif PXSHM_LOCK if(sem_trywait(recvBuf->mutex) < 0){ #elif PXSHM_FENCE recvBuf->header->flagReceiver = 1; recvBuf->header->turn = SENDER; CmiMemoryReadFence(0,0); CmiMemoryWriteFence(0,0); //if((recvBuf->header->flagSender && recvBuf->header->turn == SENDER)){ if((recvBuf->header->flagSender)){ recvBuf->header->flagReceiver = 0; #endif }else{ MACHSTATE1(3,"emptyRecvBuf to be called for rank %d",i); emptyRecvBuf(recvBuf); #if PXSHM_OSSPINLOCK OSSpinLockUnlock(&recvBuf->header->lock); #elif PXSHM_LOCK sem_post(recvBuf->mutex); #elif PXSHM_FENCE CmiMemoryReadFence(0,0); CmiMemoryWriteFence(0,0); recvBuf->header->flagReceiver = 0; #endif } } } } }; inline void flushAllSendQs(){ int i; #if SENDQ_LIST int index_prev = -1; i = sendQ_head_index; while (i!= -1) { PxshmSendQ *sendQ = pxshmContext->sendQs[i]; CmiAssert(i != pxshmContext->noderank); if(sendQ->numEntries > 0){ #else for(i=0;i<pxshmContext->nodesize;i++) { if (i == pxshmContext->noderank) continue; PxshmSendQ *sendQ = pxshmContext->sendQs[i]; if(sendQ->numEntries > 0) { #endif #if PXSHM_OSSPINLOCK if(OSSpinLockTry(&pxshmContext->sendBufs[i].header->lock)){ #elif PXSHM_LOCK if(sem_trywait(pxshmContext->sendBufs[i].mutex) >= 0){ #elif PXSHM_FENCE pxshmContext->sendBufs[i].header->flagSender = 1; pxshmContext->sendBufs[i].header->turn = RECEIVER; CmiMemoryReadFence(0,0); CmiMemoryWriteFence(0,0); if(!(pxshmContext->sendBufs[i].header->flagReceiver && pxshmContext->sendBufs[i].header->turn == RECEIVER)){ #endif MACHSTATE1(3,"flushSendQ %d",i); flushSendQ(sendQ); #if PXSHM_OSSPINLOCK OSSpinLockUnlock(&pxshmContext->sendBufs[i].header->lock); #elif PXSHM_LOCK sem_post(pxshmContext->sendBufs[i].mutex); 
#elif PXSHM_FENCE CmiMemoryReadFence(0,0); CmiMemoryWriteFence(0,0); pxshmContext->sendBufs[i].header->flagSender = 0; #endif }else{ #if PXSHM_FENCE pxshmContext->sendBufs[i].header->flagSender = 0; #endif } } #if SENDQ_LIST if (sendQ->numEntries == 0) { if (index_prev != -1) pxshmContext->sendQs[index_prev]->next = sendQ->next; else sendQ_head_index = sendQ->next; i = sendQ->next; sendQ->next = -2; } else { index_prev = i; i = sendQ->next; } #endif } }; void emptyRecvBuf(sharedBufData *recvBuf){ int numMessages = recvBuf->header->count; int i=0; char *ptr=recvBuf->data; for(i=0;i<numMessages;i++){ int size; int rank, srcpe, seqno, magic, i; unsigned int broot; char *msg = ptr; char *newMsg; size = CMI_MSG_SIZE(msg); newMsg = (char *)CmiAlloc(size); memcpy(newMsg,msg,size); handleOneRecvedMsg(size, newMsg); ptr += size; MACHSTATE3(3,"message of size %d recvd ends at ptr-data %d total bytes %d bytes %d",size,ptr-recvBuf->data,recvBuf->header->bytes); } #if 1 if(ptr - recvBuf->data != recvBuf->header->bytes){ CmiPrintf("[%d] ptr - recvBuf->data %d recvBuf->header->bytes %d numMessages %d \n",_Cmi_mynode, ptr - recvBuf->data, recvBuf->header->bytes,numMessages); } #endif CmiAssert(ptr - recvBuf->data == recvBuf->header->bytes); recvBuf->header->count=0; recvBuf->header->bytes=0; } /************************** *sendQ helper functions * ****************/ void initSendQ(PxshmSendQ *q,int size, int rank){ q->data = (OutgoingMsgRec *)calloc(size, sizeof(OutgoingMsgRec)); q->size = size; q->numEntries = 0; q->begin = 0; q->end = 0; q->rank = rank; #if SENDQ_LIST q->next = -2; #endif } void pushSendQ(PxshmSendQ *q, char *msg, int size, int *refcount){ if(q->numEntries == q->size){ //need to resize OutgoingMsgRec *oldData = q->data; int newSize = q->size<<1; q->data = (OutgoingMsgRec *)calloc(newSize, sizeof(OutgoingMsgRec)); //copy head to the beginning of the new array CmiAssert(q->begin == q->end); CmiAssert(q->begin < q->size); 
memcpy(&(q->data[0]),&(oldData[q->begin]),sizeof(OutgoingMsgRec)*(q->size - q->begin)); if(q->end!=0){ memcpy(&(q->data[(q->size - q->begin)]),&(oldData[0]),sizeof(OutgoingMsgRec)*(q->end)); } free(oldData); q->begin = 0; q->end = q->size; q->size = newSize; } OutgoingMsgRec *omg = &q->data[q->end]; omg->size = size; omg->data = msg; omg->refcount = refcount; (q->end)++; if(q->end >= q->size){ q->end -= q->size; } q->numEntries++; } OutgoingMsgRec * popSendQ(PxshmSendQ *q){ OutgoingMsgRec * ret; if(0 == q->numEntries){ return NULL; } ret = &q->data[q->begin]; (q->begin)++; if(q->begin >= q->size){ q->begin -= q->size; } q->numEntries--; return ret; }