static CmiCommHandle MachineSendFuncForLAPI(int destNode, int size, char *msg, int mode) { scompl_hndlr_t *shdlr = NULL; void *sinfo = NULL; if (mode==P2P_SYNC) { shdlr = ReleaseMsg; sinfo = (void *)msg; } else if (mode==P2P_ASYNC) { shdlr = DeliveredMsg; sinfo = malloc(sizeof(int)); *((int *)sinfo) = 1; } CMI_MSG_SIZE(msg) = size; #if ENSURE_MSG_PAIRORDER #if CMK_NODE_QUEUE_AVAILABLE if (CMI_DEST_RANK(msg) == DGRAM_NODEMESSAGE) { lapiSendFn(destNode, size, msg, shdlr, sinfo); return sinfo; } #endif int destPE = CmiNodeFirst(destNode)+CMI_DEST_RANK(msg); CMI_MSG_SRCPE(msg) = CmiMyPe(); /* Note: This could be executed on comm threads, where CmiMyPe() >= CmiNumPes() */ CMI_MSG_SEQNO(msg) = getNextMsgSeqNo(CpvAccess(p2pMsgSeqInfo).nextMsgSeqNo, destPE); setNextMsgSeqNo(CpvAccess(p2pMsgSeqInfo).nextMsgSeqNo, destPE, CMI_MSG_SEQNO(msg)); #endif lapiSendFn(destNode, size, msg, shdlr, sinfo); return sinfo; }
/**
 * Drains the queues of broadcast messages whose processing was offloaded.
 *
 * With CMK_OFFLOAD_BCAST_PROCESS enabled, every message popped from
 * procBcastQ is passed to processProcBcastMsg until the queue yields NULL;
 * with CMK_NODE_QUEUE_AVAILABLE also enabled, nodeBcastQ is then drained the
 * same way through processNodeBcastMsg. Without CMK_OFFLOAD_BCAST_PROCESS
 * the function does nothing.
 */
static void processBcastQs() {
#if CMK_OFFLOAD_BCAST_PROCESS
    char *pending;

    /* processor-level broadcasts */
    while ((pending = CMIQueuePop(CsvAccess(procBcastQ))) != NULL) {
        MACHSTATE2(4, "[%d]: process a proc-level bcast msg %p begin{", CmiMyNode(), pending);
        processProcBcastMsg(CMI_MSG_SIZE(pending), pending);
        MACHSTATE2(4, "[%d]: process a proc-level bcast msg %p end}", CmiMyNode(), pending);
    }

#if CMK_NODE_QUEUE_AVAILABLE
    /* node-level broadcasts */
    while ((pending = CMIQueuePop(CsvAccess(nodeBcastQ))) != NULL) {
        MACHSTATE2(4, "[%d]: process a node-level bcast msg %p begin{", CmiMyNode(), pending);
        processNodeBcastMsg(CMI_MSG_SIZE(pending), pending);
        MACHSTATE2(4, "[%d]: process a node-level bcast msg %p end}", CmiMyNode(), pending);
    }
#endif
#endif
}
/**
 * LAPI completion handler on the receive side. It is responsible for pushing
 * messages to the destination proc or relaying broadcast messages. --Chao Mei
 *
 * Note: the completion handler could be executed on any core within a node
 * (???), so in SMP mode with a comm thread it has to be dealt with carefully.
 *
 * Given that LAPI also provides an internal thread for network progress which
 * will call this function (???), the following situations need care:
 *   1) non-SMP mode, with interrupt (LAPI internal completion thread)
 *   2) non-SMP mode, with polling (machine layer is responsible for network
 *      progress)
 *   3) SMP mode, no comm thread, with polling
 *   4) SMP mode, no comm thread, with interrupt
 *   5) SMP mode, with comm thread, with polling (not yet implemented, comm
 *      server is empty right now)
 *   6) SMP mode, with comm thread, with interrupt??
 *
 * Currently, SMP mode without comm thread is undergoing implementation.
 *
 * This function is executed by LAPI internal threads. It seems that the
 * number of internal completion handler threads can vary during the program:
 * LAPI adaptively creates more threads if there are more outstanding
 * messages. This means pcqueue needs protection even in the non-SMP case!
 *
 * --Chao Mei
 *
 * myLapiContext: LAPI handle supplied by the caller; not used here.
 * am_info:       the received message.
 */
static void PumpMsgsComplete(lapi_handle_t *myLapiContext, void *am_info) {
    char *msg = am_info;
    /* Non-zero once checkMsgInOrder has taken charge of the message. */
    int takenByOrdering = 0;

    MACHSTATE3(2,"[%d] PumpMsgsComplete with msg %p (isImm=%d) begin {",CmiMyNode(), msg, CmiIsImmediate(msg));
#if ENSURE_MSG_PAIRORDER
    MACHSTATE3(2,"msg %p info: srcpe=%d, seqno=%d", msg, CMI_MSG_SRCPE(msg), CMI_MSG_SEQNO(msg));
#endif

    /**
     * First, we check if the msg is a broadcast msg via spanning tree. If it
     * is, it needs to call SendSpanningChildren to relay the broadcast, and
     * then send the msg to every core on this node.
     *
     * After the first check, we deal with normal messages.
     * --Chao Mei
     */
    /* It's the right place to relay the broadcast message */
    /**
     * 1. For in-order delivery, because this is the handler for receiving a
     * message, and we assume the cross-network msgs are always delivered to
     * the first proc (rank 0) of this node, we select the srcpe of the bcast
     * msgs and the next msg seq no correspondingly.
     *
     * --Chao Mei
     */
#if ENSURE_MSG_PAIRORDER
    {
        int bcastRoot = CMI_BROADCAST_ROOT(msg);
        int destRank = CMI_DEST_RANK(msg);
        /* Only check proc-level msgs */
        int isProcLevel = (bcastRoot >= 0);
#if CMK_NODE_QUEUE_AVAILABLE
        if (destRank == DGRAM_NODEMESSAGE) {
            isProcLevel = 0;
        }
#endif
        if (isProcLevel) {
            MsgOrderInfo *orderInfo = &CpvAccessOther(p2pMsgSeqInfo, destRank);
            MACHSTATE1(2, "Check msg in-order for p2p msg %p", msg);
            if (checkMsgInOrder(msg, orderInfo)) {
                takenByOrdering = 1;
            }
        }
    }
#endif

    if (!takenByOrdering) {
        handleOneRecvedMsg(CMI_MSG_SIZE(msg), msg);
    }

    MACHSTATE(2,"} PumpMsgsComplete end ");
}
/**
 * Handler for a message that carries a persistent-send trailer.
 *
 * The last (sizeof(int) + 2*sizeof(void *)) bytes of the message, as counted
 * by CMI_MSG_SIZE(msg), hold in order:
 *   - int    : the handler index to restore on the message,
 *   - void * : the address the payload is copied to,
 *   - void * : the address of an int that receives the payload size.
 *
 * The handler restores the handler index on "msg", stores the payload size
 * (message size minus trailer) through the size address, and copies the
 * payload to the destination address.
 *
 * The trailer starts at an arbitrary byte offset (the payload size), so its
 * fields are extracted with memcpy; dereferencing int or void* pointers cast
 * from that offset would be a misaligned access.
 */
static void sendPerMsgHandler(char *msg) {
    int msgSize;
    int ep;
    void *destAddr;
    void *destSizeAddr;
    const char *trailer;

    msgSize = CMI_MSG_SIZE(msg);
    msgSize -= (2*sizeof(void *)+sizeof(int));

    /* Unpack the trailer byte-wise; it is not guaranteed to be aligned. */
    trailer = msg + msgSize;
    memcpy(&ep, trailer, sizeof(int));
    memcpy(&destAddr, trailer + sizeof(int), sizeof(void *));
    memcpy(&destSizeAddr, trailer + sizeof(int) + sizeof(void *), sizeof(void *));
    /*CmiPrintf("msgSize:%d destAddr:%p, destSizeAddr:%p\n", msgSize, destAddr, destSizeAddr);*/

    CmiSetHandler(msg, ep);
    *((int *)destSizeAddr) = msgSize;
    memcpy(destAddr, msg, msgSize);
}
/**
 * Sends message "m" ("size" bytes) to "destPE" over the persistent channel
 * identified by handle "h".
 *
 * The handle must be non-NULL, its slot must be in use and bound to
 * "destPE"; a "size" larger than the slot's sizeMax aborts the program.
 *
 * If the slot already knows a destination address (destAddress[0] is set),
 * the payload is copied into a new message followed by a trailer made of
 *   - int    : the original handler index of "m",
 *   - void * : slot->destAddress[0],
 *   - void * : slot->destSizeAddress[0],
 * "m" is freed, and the new message is sent with handler
 * persistentSendMsgHandlerIdx via CmiSyncSendAndFree (phs/phsSize are reset
 * first). Otherwise "m" is parked in the slot (messageBuf/messageSize) until
 * the destination is known; finding a message already parked there aborts.
 */
void CmiSendPersistentMsg(PersistentHandle h, int destPE, int size, void *m) {
    CmiAssert(h!=NULL);
    PersistentSendsTable *slot = (PersistentSendsTable *)h;
    CmiAssert(slot->used == 1);
    CmiAssert(slot->destPE == destPE);
    if (size > slot->sizeMax) {
        CmiPrintf("size: %d sizeMax: %d\n", size, slot->sizeMax);
        CmiAbort("Abort: Invalid size\n");
    }

    /*CmiPrintf("[%d] CmiSendPersistentMsg h=%p hdl=%d destpe=%d destAddress=%p size=%d\n", CmiMyPe(), *phs, CmiGetHandler(m), slot->destPE, slot->destAddress, size);*/

    if (slot->destAddress[0]) {
        int oldep = CmiGetHandler(m);
        int newsize = size + sizeof(void *)*2 + sizeof(int);
        char *newmsg = (char*)CmiAlloc(newsize);

        /* payload followed by the trailer: handler index, dest address, dest size address */
        memcpy(newmsg, m, size);
        memcpy(newmsg+size, &oldep, sizeof(int));
        memcpy(newmsg+size+sizeof(int), &slot->destAddress[0], sizeof(void *));
        memcpy(newmsg+size+sizeof(int)+sizeof(void*), &slot->destSizeAddress[0], sizeof(void *));
        CmiFree(m);

        /*
         * Record the full length (payload + trailer) in the header of the
         * message that is actually sent: sendPerMsgHandler derives the
         * payload size as CMI_MSG_SIZE(msg) minus the trailer size.
         * NOTE(review): presumably persistentSendMsgHandlerIdx is bound to
         * sendPerMsgHandler -- confirm at the registration site.
         */
        CMI_MSG_SIZE(newmsg) = newsize;
        CmiSetHandler(newmsg, persistentSendMsgHandlerIdx);
        phs = NULL;
        phsSize = 0;
        CmiSyncSendAndFree(slot->destPE, newsize, newmsg);
    } else {
#if 1
        /* buffer until ready */
        if (slot->messageBuf != NULL) {
            CmiPrintf("Unexpected message in buffer on %d\n", CmiMyPe());
            CmiAbort("");
        }
        slot->messageBuf = m;
        slot->messageSize = size;
#else
        /* normal send */
        PersistentHandle *phs_tmp = phs;
        int phsSize_tmp = phsSize;
        phs = NULL;
        phsSize = 0;
        CmiPrintf("[%d]Slot sending message directly\n", CmiMyPe());
        CmiSyncSendAndFree(slot->destPE, size, m);
        phs = phs_tmp;
        phsSize = phsSize_tmp;
#endif
    }
}
inline void emptyAllRecvBufs(){ int i; for(i=0;i<pxshmContext->nodesize;i++){ if(i != pxshmContext->noderank){ sharedBufData *recvBuf = &(pxshmContext->recvBufs[i]); if(recvBuf->header->count > 0){ #if PXSHM_STATS pxshmContext->lockRecvCount++; #endif #if PXSHM_OSSPINLOCK if(! OSSpinLockTry(&recvBuf->header->lock)){ #elif PXSHM_LOCK if(sem_trywait(recvBuf->mutex) < 0){ #elif PXSHM_FENCE recvBuf->header->flagReceiver = 1; recvBuf->header->turn = SENDER; CmiMemoryReadFence(0,0); CmiMemoryWriteFence(0,0); //if((recvBuf->header->flagSender && recvBuf->header->turn == SENDER)){ if((recvBuf->header->flagSender)){ recvBuf->header->flagReceiver = 0; #endif }else{ MACHSTATE1(3,"emptyRecvBuf to be called for rank %d",i); emptyRecvBuf(recvBuf); #if PXSHM_OSSPINLOCK OSSpinLockUnlock(&recvBuf->header->lock); #elif PXSHM_LOCK sem_post(recvBuf->mutex); #elif PXSHM_FENCE CmiMemoryReadFence(0,0); CmiMemoryWriteFence(0,0); recvBuf->header->flagReceiver = 0; #endif } } } } }; inline void flushAllSendQs(){ int i; #if SENDQ_LIST int index_prev = -1; i = sendQ_head_index; while (i!= -1) { PxshmSendQ *sendQ = pxshmContext->sendQs[i]; CmiAssert(i != pxshmContext->noderank); if(sendQ->numEntries > 0){ #else for(i=0;i<pxshmContext->nodesize;i++) { if (i == pxshmContext->noderank) continue; PxshmSendQ *sendQ = pxshmContext->sendQs[i]; if(sendQ->numEntries > 0) { #endif #if PXSHM_OSSPINLOCK if(OSSpinLockTry(&pxshmContext->sendBufs[i].header->lock)){ #elif PXSHM_LOCK if(sem_trywait(pxshmContext->sendBufs[i].mutex) >= 0){ #elif PXSHM_FENCE pxshmContext->sendBufs[i].header->flagSender = 1; pxshmContext->sendBufs[i].header->turn = RECEIVER; CmiMemoryReadFence(0,0); CmiMemoryWriteFence(0,0); if(!(pxshmContext->sendBufs[i].header->flagReceiver && pxshmContext->sendBufs[i].header->turn == RECEIVER)){ #endif MACHSTATE1(3,"flushSendQ %d",i); flushSendQ(sendQ); #if PXSHM_OSSPINLOCK OSSpinLockUnlock(&pxshmContext->sendBufs[i].header->lock); #elif PXSHM_LOCK sem_post(pxshmContext->sendBufs[i].mutex); 
#elif PXSHM_FENCE CmiMemoryReadFence(0,0); CmiMemoryWriteFence(0,0); pxshmContext->sendBufs[i].header->flagSender = 0; #endif }else{ #if PXSHM_FENCE pxshmContext->sendBufs[i].header->flagSender = 0; #endif } } #if SENDQ_LIST if (sendQ->numEntries == 0) { if (index_prev != -1) pxshmContext->sendQs[index_prev]->next = sendQ->next; else sendQ_head_index = sendQ->next; i = sendQ->next; sendQ->next = -2; } else { index_prev = i; i = sendQ->next; } #endif } }; void emptyRecvBuf(sharedBufData *recvBuf){ int numMessages = recvBuf->header->count; int i=0; char *ptr=recvBuf->data; for(i=0;i<numMessages;i++){ int size; int rank, srcpe, seqno, magic, i; unsigned int broot; char *msg = ptr; char *newMsg; size = CMI_MSG_SIZE(msg); newMsg = (char *)CmiAlloc(size); memcpy(newMsg,msg,size); handleOneRecvedMsg(size, newMsg); ptr += size; MACHSTATE3(3,"message of size %d recvd ends at ptr-data %d total bytes %d bytes %d",size,ptr-recvBuf->data,recvBuf->header->bytes); } #if 1 if(ptr - recvBuf->data != recvBuf->header->bytes){ CmiPrintf("[%d] ptr - recvBuf->data %d recvBuf->header->bytes %d numMessages %d \n",_Cmi_mynode, ptr - recvBuf->data, recvBuf->header->bytes,numMessages); } #endif CmiAssert(ptr - recvBuf->data == recvBuf->header->bytes); recvBuf->header->count=0; recvBuf->header->bytes=0; } /************************** *sendQ helper functions * ****************/ void initSendQ(PxshmSendQ *q,int size, int rank){ q->data = (OutgoingMsgRec *)calloc(size, sizeof(OutgoingMsgRec)); q->size = size; q->numEntries = 0; q->begin = 0; q->end = 0; q->rank = rank; #if SENDQ_LIST q->next = -2; #endif } void pushSendQ(PxshmSendQ *q, char *msg, int size, int *refcount){ if(q->numEntries == q->size){ //need to resize OutgoingMsgRec *oldData = q->data; int newSize = q->size<<1; q->data = (OutgoingMsgRec *)calloc(newSize, sizeof(OutgoingMsgRec)); //copy head to the beginning of the new array CmiAssert(q->begin == q->end); CmiAssert(q->begin < q->size); 
memcpy(&(q->data[0]),&(oldData[q->begin]),sizeof(OutgoingMsgRec)*(q->size - q->begin)); if(q->end!=0){ memcpy(&(q->data[(q->size - q->begin)]),&(oldData[0]),sizeof(OutgoingMsgRec)*(q->end)); } free(oldData); q->begin = 0; q->end = q->size; q->size = newSize; } OutgoingMsgRec *omg = &q->data[q->end]; omg->size = size; omg->data = msg; omg->refcount = refcount; (q->end)++; if(q->end >= q->size){ q->end -= q->size; } q->numEntries++; } OutgoingMsgRec * popSendQ(PxshmSendQ *q){ OutgoingMsgRec * ret; if(0 == q->numEntries){ return NULL; } ret = &q->data[q->begin]; (q->begin)++; if(q->begin >= q->size){ q->begin -= q->size; } q->numEntries--; return ret; }
/**
 * Pair-order bookkeeping for one received processor-level message.
 *
 * Returns 1 if this "msg" is an out-of-order message (it is then stored in
 * the per-source window), or if it is a late message which triggers the
 * processing of the buffered out-of-order msgs (it is then delivered here
 * together with them). Returns 0 if "msg" is the expected message and nothing
 * is buffered for its source, in which case the caller still has to process
 * it.
 *
 * All reads and updates of "info" happen while cmplHdlrThdLock is held; the
 * delivery of drained messages happens after the lock has been released.
 * --Chao Mei
 */
static int checkMsgInOrder(char *msg, MsgOrderInfo *info) {
    int senderPe = CMI_MSG_SRCPE(msg);
    int arrivedSeqNo = CMI_MSG_SEQNO(msg);
    int wantedSeqNo;
    int winSize, highWater;
    void **window = NULL;

    CmiLock(cmplHdlrThdLock);

    wantedSeqNo = getNextExpectedMsgSeqNo(info->expectedMsgSeqNo, senderPe);

    /* ---- Case 1: the message is ahead of its turn; park it in the window ---- */
    if (wantedSeqNo != arrivedSeqNo) {
        int distance;

        MACHSTATE2(4, "Receiving an out-of-order msg with seqno=%d, but expect seqno=%d", arrivedSeqNo, wantedSeqNo);
        winSize = info->CUR_WINDOW_SIZE[senderPe];
        if ((info->oooMsgBuffer)[senderPe] == NULL) {
            (info->oooMsgBuffer)[senderPe] = calloc(winSize, sizeof(void *));
        }
        window = (info->oooMsgBuffer)[senderPe];

        distance = arrivedSeqNo - wantedSeqNo;
        highWater = (info->oooMaxOffset)[senderPe];
        if (distance < 0) {
            /* It's possible that the seqNo starts with another round (exceeding MAX_MSG_SEQNO) with 1 */
            distance += MAX_MSG_SEQNO;
        }

        if (distance > winSize) {
            /* Grow the window to the next multiple of its size that holds slot distance-1. */
            int grownSize;
            void **grown;

            if (distance > MAX_WINDOW_SIZE) {
                CmiAbort("Exceeding the MAX_WINDOW_SIZE!\n");
            }
            grownSize = ((distance/winSize)+1)*winSize;
            /*CmiPrintf("[%d]: WARNING: INCREASING WINDOW SIZE FROM %d TO %d\n", CmiMyPe(), curWinSize, newWinSize);*/
            grown = calloc(grownSize, sizeof(void *));
            (info->oooMsgBuffer)[senderPe] = grown;
            memcpy(grown, window, winSize*sizeof(void *));
            info->CUR_WINDOW_SIZE[senderPe] = grownSize;
            free(window);
            window = (info->oooMsgBuffer)[senderPe];
        }

        /* Slot k of the window holds the msg whose seqno is k+1 ahead of the expected one. */
        CmiAssert(window[distance-1] == NULL);
        window[distance-1] = msg;
        if (distance > highWater) {
            (info->oooMaxOffset)[senderPe] = distance;
        }

        CmiUnlock(cmplHdlrThdLock);
        return 1;
    }

    /* ---- Case 2: the expected message, and nothing is buffered ---- */
    highWater = (info->oooMaxOffset)[senderPe];
    if (highWater <= 0) {
        MACHSTATE1(4, "Receiving an expected msg with seqno=%d\n", arrivedSeqNo);
        setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, senderPe, wantedSeqNo);
        CmiUnlock(cmplHdlrThdLock);
        return 0;
    }

    /* ---- Case 3: the expected message unblocks buffered ones ---- */
    {
        /*
         * The msgs to deliver are first collected in a private array so that
         * the delivery itself can run outside the lock (reduces the atomic
         * granularity).
         */
        void **ready;
        int numReady = 0;
        int drained = 0;
        int k;

        MACHSTATE1(4, "Processing all buffered ooo msgs (maxOffset=%d) including the just recved begin {", highWater);
        winSize = info->CUR_WINDOW_SIZE[senderPe];
        ready = malloc((winSize+1)*sizeof(void *));

        /* the msg just received goes first */
        ready[numReady++] = msg;

        /* then the buffered msgs up to the first empty slot in the window */
        window = (info->oooMsgBuffer)[senderPe];
        while (drained < highWater && window[drained] != NULL) {
            ready[numReady++] = window[drained];
            window[drained] = NULL;
            drained++;
        }

        if (drained < highWater) {
            /* Stopped at a gap; the last used slot of the window can never be empty. */
            int remaining = highWater - drained - 1;

            CmiAssert(drained != (highWater-1));

            /**
             * The seqno of the next to-be-received msg is now
             * "expectedSeqNo+drained+1", as the seqno of the last collected
             * msg is "expectedSeqNo+drained". The window slides from
             * "drained+1" because its first element must always point to the
             * ooo msg that is 1 ahead (in seqno) of the next to-be-received
             * msg. --Chao Mei
             */
            /* move [drained+1, highWater) to [0, remaining) and clear the rest */
            for (k = 0; k < highWater; k++) {
                window[k] = (k < remaining) ? window[drained+k+1] : NULL;
            }
            (info->oooMaxOffset)[senderPe] = remaining;
            setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, senderPe, wantedSeqNo+drained);
        } else {
            /* there's no remaining buffered ooo msgs */
            (info->oooMaxOffset)[senderPe] = 0;
            setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, senderPe, wantedSeqNo+highWater);
        }

        CmiUnlock(cmplHdlrThdLock);

        /* Deliver the collected msgs in order */
        for (k = 0; k < numReady; k++) {
            char *readyMsg = ready[k];
            if (CMI_BROADCAST_ROOT(readyMsg) <= 0) {
                CmiPushPE(CMI_DEST_RANK(readyMsg), readyMsg);
                continue;
            }
#if CMK_OFFLOAD_BCAST_PROCESS
            PCQueuePush(CsvAccess(procBcastQ), readyMsg);
#else
            processProcBcastMsg(CMI_MSG_SIZE(readyMsg), readyMsg);
#endif
        }

        free(ready);

        MACHSTATE1(4, "Processing all buffered ooo msgs (actually processed %d) end }", drained);
        /**
         * Since the buffered ooo msgs and the just received one have been
         * processed here, 1 is returned so that the caller does not process
         * this msg again.
         */
        return 1;
    }
}