CmiCommHandle CmiAsyncBroadcastFn(int size, char *msg) { #if ENSURE_MSG_PAIRORDER /* Not sure how to add the msg seq no for async broadcast messages --Chao Mei */ /* so abort here ! */ CmiAssert(0); return 0; #else int i, rank; int mype = CmiMyPe(); #if ENABLE_CONVERSE_QD CQdCreate(CpvAccess(cQdState), CmiNumPes()-1); #endif MACHSTATE1(3,"[%d] Sending async broadcast message from {",CmiMyNode()); CMI_BROADCAST_ROOT(msg) = 0; void *handle = malloc(sizeof(int)); *((int *)handle) = CmiNumPes()-1; for (i=mype+1; i<CmiNumPes(); i++) { CMI_DEST_RANK(msg) = CmiRankOf(i); lapiSendFn(CmiNodeOf(i), size, msg, DeliveredMsg, handle); } for (i=0; i<mype; i++) { CMI_DEST_RANK(msg) = CmiRankOf(i); lapiSendFn(CmiNodeOf(i), size, msg, DeliveredMsg, handle); } MACHSTATE(3,"} Sending async broadcast message end"); return handle; #endif }
static void SendSpanningChildrenProc(int size, char *msg) { int startnode = CMI_BROADCAST_ROOT(msg)-1; SendSpanningChildren(size, msg, 0, startnode); #if CMK_SMP /* second send msgs to my peers on this node */ SendToPeers(size, msg); #endif }
static void handleOneBcastMsg(int size, char *msg) { CmiAssert(CMI_BROADCAST_ROOT(msg)!=0); #if CMK_OFFLOAD_BCAST_PROCESS if (CMI_BROADCAST_ROOT(msg)>0) { CMIQueuePush(CsvAccess(procBcastQ), msg); } else { #if CMK_NODE_QUEUE_AVAILABLE CMIQueuePush(CsvAccess(nodeBcastQ), msg); #endif } #else if (CMI_BROADCAST_ROOT(msg)>0) { processProcBcastMsg(size, msg); } else { #if CMK_NODE_QUEUE_AVAILABLE processNodeBcastMsg(size, msg); #endif } #endif }
/* send msg along the hypercube in broadcast. (Sameer) */ static void SendHyperCubeProc(int size, char *msg) { int startpe = CMI_BROADCAST_ROOT(msg)-1; int startnode = CmiNodeOf(startpe); #if CMK_SMP if (startpe > CmiNumPes()) startnode = startpe - CmiNumPes(); #endif SendHyperCube(size, msg, 0, startnode); #if CMK_SMP /* second send msgs to my peers on this node */ SendToPeers(size, msg); #endif }
/** * lapi completion handler on the recv side. It's responsible to push messages * to the destination proc or relay broadcast messages. --Chao Mei * * Note: The completion handler could be executed on any cores within a node ??? * So in SMP mode when there's a comm thread, the completion handler should be carefully * dealt with. * * Given lapi also provides an internal lapi thread to deal with network progress which * will call this function (???), we should be careful with the following situations: * 1) non SMP mode, with interrupt (lapi internal completion thread) * 2) non SMP mode, with polling (machine layer is responsible for network progress) * 3) SMP mode, no comm thread, with polling * 4) SMP mode, no comm thread, with interrupt * 5) SMP mode, with comm thread, with polling (not yet implemented, comm server is empty right now) * 6) SMP mode, with comm thread, with interrupt?? * * Currently, SMP mode without comm thread is undergoing implementation. * * This function is executed by LAPI internal threads. It seems that the number of internal * completion handler threads could vary during the program. LAPI adaptively creates more * threads if there are more outstanding messages!!!! This means pcqueue needs protection * even in the nonsmp case!!!! * * --Chao Mei */ static void PumpMsgsComplete(lapi_handle_t *myLapiContext, void *am_info) { int i; char *msg = am_info; int broot, destrank; MACHSTATE3(2,"[%d] PumpMsgsComplete with msg %p (isImm=%d) begin {",CmiMyNode(), msg, CmiIsImmediate(msg)); #if ENSURE_MSG_PAIRORDER MACHSTATE3(2,"msg %p info: srcpe=%d, seqno=%d", msg, CMI_MSG_SRCPE(msg), CMI_MSG_SEQNO(msg)); #endif /** * First, we check if the msg is a broadcast msg via spanning * tree. If it is, it needs to call SendSpanningChildren to * relay the broadcast, and then send the msg to every cores on * this node. * * After the first check, we deal with normal messages. * --Chao Mei */ /* It's the right place to relay the broadcast message */ /** * 1. For in-order delivery, because this is the handler for * receiving a message, and we assume the cross-network msgs are * always delivered to the first proc (rank 0) of this node, we * select the srcpe of the bcast msgs and the next msg seq no * correspondingly. * * --Chao Mei */ #if ENSURE_MSG_PAIRORDER broot = CMI_BROADCAST_ROOT(msg); destrank = CMI_DEST_RANK(msg); /* Only check proc-level msgs */ if (broot>=0 #if CMK_NODE_QUEUE_AVAILABLE && destrank != DGRAM_NODEMESSAGE #endif ) { MsgOrderInfo *info; info = &CpvAccessOther(p2pMsgSeqInfo, destrank); MACHSTATE1(2, "Check msg in-order for p2p msg %p", msg); if (checkMsgInOrder(msg,info)) { MACHSTATE(2,"} PumpMsgsComplete end "); return; } } #endif handleOneRecvedMsg(CMI_MSG_SIZE(msg), msg); MACHSTATE(2,"} PumpMsgsComplete end "); return; }
CmiCommHandle CmiAsyncNodeBroadcastFn(int size, char *msg) { int i; #if ENABLE_CONVERSE_QD CQdCreate(CpvAccess(cQdState), CmiNumNodes()-1); #endif MACHSTATE1(3,"[%d] Sending async node broadcast message from {",CmiMyNode()); CMI_BROADCAST_ROOT(msg) = 0; CMI_DEST_RANK(msg) =DGRAM_NODEMESSAGE; void *handle = malloc(sizeof(int)); *((int *)handle) = CmiNumNodes()-1; for (i=CmiMyNode()+1; i<CmiNumNodes(); i++) { lapiSendFn(i, size, msg, DeliveredMsg, handle); } for (i=0; i<CmiMyNode(); i++) { lapiSendFn(i, size, msg, DeliveredMsg, handle); } MACHSTATE(3,"} Sending async broadcast message end"); return handle; }
/* called in PumpMsgs */ int PumpPersistent() { PersistentReceivesTable *slot = persistentReceivesTableHead; int status = 0; while (slot) { unsigned int size = *(slot->recvSizePtr[0]); if (size > 0) { char *msg = slot->messagePtr[0]; /*CmiPrintf("[%d] size: %d rank:%d msg:%p %p\n", CmiMyPe(), size, CMI_DEST_RANK(msg), msg, slot->messagePtr);*/ #if 0 void *dupmsg; dupmsg = CmiAlloc(size); _MEMCHECK(dupmsg); memcpy(dupmsg, msg, size); msg = dupmsg; #else /* return messagePtr directly and user MUST make sure not to delete it. */ /*CmiPrintf("[%d] %p size:%d rank:%d root:%d\n", CmiMyPe(), msg, size, CMI_DEST_RANK(msg), CMI_BROADCAST_ROOT(msg));*/ CmiReference(msg); #endif CmiPushPE(CMI_DEST_RANK_NET(msg), msg); #if CMK_BROADCAST_SPANNING_TREE if (CMI_BROADCAST_ROOT(msg)) SendSpanningChildrenNet(size, msg); #endif *(slot->recvSizePtr[0]) = 0; status = 1; } slot = slot->next; } return status; }
static void SendHyperCubeNode(int size, char *msg) { int startnode = -CMI_BROADCAST_ROOT(msg)-1; SendHyperCube(size, msg, DGRAM_NODEMESSAGE, startnode); }
static void SendSpanningChildrenNode(int size, char *msg) { int startnode = -CMI_BROADCAST_ROOT(msg)-1; SendSpanningChildren(size, msg, DGRAM_NODEMESSAGE, startnode); }
/** * Returns 1 if this "msg" is an out-of-order message, or * this "msg" is a late message which triggers the process * of all buffered ooo msgs. * --Chao Mei */ static int checkMsgInOrder(char *msg, MsgOrderInfo *info) { int srcpe, destrank; int incomingSeqNo, expectedSeqNo; int curOffset, maxOffset; int i, curWinSize; void **destMsgBuffer = NULL; /* numMsg is the number of msgs to be processed in this buffer*/ /* Reason to have this extra copy of msgs to be processed: Reduce the atomic granularity */ void **toProcessMsgBuffer; int numMsgs = 0; srcpe = CMI_MSG_SRCPE(msg); destrank = CMI_DEST_RANK(msg); incomingSeqNo = CMI_MSG_SEQNO(msg); CmiLock(cmplHdlrThdLock); expectedSeqNo = getNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe); if (expectedSeqNo == incomingSeqNo) { /* Two cases: has ooo msg buffered or not */ maxOffset = (info->oooMaxOffset)[srcpe]; if (maxOffset>0) { MACHSTATE1(4, "Processing all buffered ooo msgs (maxOffset=%d) including the just recved begin {", maxOffset); curWinSize = info->CUR_WINDOW_SIZE[srcpe]; toProcessMsgBuffer = malloc((curWinSize+1)*sizeof(void *)); /* process the msg just recved */ toProcessMsgBuffer[numMsgs++] = msg; /* process the buffered ooo msg until the first empty slot in the window */ destMsgBuffer = (info->oooMsgBuffer)[srcpe]; for (curOffset=0; curOffset<maxOffset; curOffset++) { char *curMsg = destMsgBuffer[curOffset]; if (curMsg == NULL) { CmiAssert(curOffset!=(maxOffset-1)); break; } toProcessMsgBuffer[numMsgs++] = curMsg; destMsgBuffer[curOffset] = NULL; } /* Update expected seqno, maxOffset and slide the window */ if (curOffset < maxOffset) { int i; /** * now, the seqno of the next to-be-recved msg should be * "expectedSeqNo+curOffset+1" as the seqno of the just * processed msg is "expectedSeqNo+curOffset. We need to slide * the msg buffer window from "curOffset+1" because the first * element of the buffer window should always points to the ooo * msg that's 1 in terms of seqno ahead of the next to-be-recved * msg. --Chao Mei */ /* moving [curOffset+1, maxOffset) to [0, maxOffset-curOffset-1) in the window */ /* The following two loops could be combined --Chao Mei */ for (i=0; i<maxOffset-curOffset-1; i++) { destMsgBuffer[i] = destMsgBuffer[curOffset+i+1]; } for (i=maxOffset-curOffset-1; i<maxOffset; i++) { destMsgBuffer[i] = NULL; } (info->oooMaxOffset)[srcpe] = maxOffset-curOffset-1; setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe, expectedSeqNo+curOffset); } else { /* there's no remaining buffered ooo msgs */ (info->oooMaxOffset)[srcpe] = 0; setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe, expectedSeqNo+maxOffset); } CmiUnlock(cmplHdlrThdLock); /* Process the msgs */ for (i=0; i<numMsgs; i++) { char *curMsg = toProcessMsgBuffer[i]; if (CMI_BROADCAST_ROOT(curMsg)>0) { #if CMK_OFFLOAD_BCAST_PROCESS PCQueuePush(CsvAccess(procBcastQ), curMsg); #else processProcBcastMsg(CMI_MSG_SIZE(curMsg), curMsg); #endif } else { CmiPushPE(CMI_DEST_RANK(curMsg), curMsg); } } free(toProcessMsgBuffer); MACHSTATE1(4, "Processing all buffered ooo msgs (actually processed %d) end }", curOffset); /** * Since we have processed all buffered ooo msgs including * this just recved one, 1 should be returned so that this * msg no longer needs processing */ return 1; } else { /* An expected msg recved without any ooo msg buffered */ MACHSTATE1(4, "Receiving an expected msg with seqno=%d\n", incomingSeqNo); setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe, expectedSeqNo); CmiUnlock(cmplHdlrThdLock); return 0; } } MACHSTATE2(4, "Receiving an out-of-order msg with seqno=%d, but expect seqno=%d", incomingSeqNo, expectedSeqNo); curWinSize = info->CUR_WINDOW_SIZE[srcpe]; if ((info->oooMsgBuffer)[srcpe]==NULL) { (info->oooMsgBuffer)[srcpe] = malloc(curWinSize*sizeof(void *)); memset((info->oooMsgBuffer)[srcpe], 0, curWinSize*sizeof(void *)); } destMsgBuffer = (info->oooMsgBuffer)[srcpe]; curOffset = incomingSeqNo - expectedSeqNo; maxOffset = (info->oooMaxOffset)[srcpe]; if (curOffset<0) { /* It's possible that the seqNo starts with another round (exceeding MAX_MSG_SEQNO) with 1 */ curOffset += MAX_MSG_SEQNO; } if (curOffset > curWinSize) { int newWinSize; if (curOffset > MAX_WINDOW_SIZE) { CmiAbort("Exceeding the MAX_WINDOW_SIZE!\n"); } newWinSize = ((curOffset/curWinSize)+1)*curWinSize; /*CmiPrintf("[%d]: WARNING: INCREASING WINDOW SIZE FROM %d TO %d\n", CmiMyPe(), curWinSize, newWinSize);*/ (info->oooMsgBuffer)[srcpe] = malloc(newWinSize*sizeof(void *)); memset((info->oooMsgBuffer)[srcpe], 0, newWinSize*sizeof(void *)); memcpy((info->oooMsgBuffer)[srcpe], destMsgBuffer, curWinSize*sizeof(void *)); info->CUR_WINDOW_SIZE[srcpe] = newWinSize; free(destMsgBuffer); destMsgBuffer = (info->oooMsgBuffer)[srcpe]; } CmiAssert(destMsgBuffer[curOffset-1] == NULL); destMsgBuffer[curOffset-1] = msg; if (curOffset > maxOffset) (info->oooMaxOffset)[srcpe] = curOffset; CmiUnlock(cmplHdlrThdLock); return 1; }