/**
 * Forward a broadcast message to this node's children in the k-ary spanning
 * tree (k = BROADCAST_SPANNING_FACTOR) whose root is startNode.
 *
 * Node ids are taken relative to startNode (modulo CmiNumNodes()); the
 * children of relative position p are k*p+1 .. k*p+k, clipped to the number
 * of nodes. Compiles to a no-op unless CMK_BROADCAST_SPANNING_TREE is set.
 *
 * @param size         size of msg in bytes
 * @param msg          message to relay; its destination rank is temporarily
 *                     overwritten and restored before returning
 * @param rankToAssign destination rank stamped on every forwarded copy
 * @param startNode    node id of the broadcast root, in [0, CmiNumNodes())
 */
static void SendSpanningChildren(int size, char *msg, int rankToAssign, int startNode) {
#if CMK_BROADCAST_SPANNING_TREE
    int child, relPos;
    const int savedRank = CMI_DEST_RANK(msg);

    /* Stamp the rank once up front instead of once per forwarded message. */
    CMI_DEST_RANK(msg) = rankToAssign;

    CmiAssert(startNode >=0 && startNode<CmiNumNodes());

    /* Position of this node relative to the broadcast root. */
    relPos = CmiMyNode() - startNode;
    if (relPos < 0) {
        relPos += CmiNumNodes();
    }

    for (child = 1; child <= BROADCAST_SPANNING_FACTOR; child++) {
        const int relChild = BROADCAST_SPANNING_FACTOR * relPos + child;
        int nd;

        /* Children are numbered consecutively, so the first one that falls
         * off the end of the tree means the remaining ones do too. */
        if (relChild >= CmiNumNodes()) {
            break;
        }

        /* Translate back from root-relative to absolute node id. */
        nd = (relChild + startNode) % CmiNumNodes();
        CmiAssert(nd>=0 && nd!=CmiMyNode());

#if CMK_BROADCAST_USE_CMIREFERENCE
        /* Share the same buffer: take an extra reference per send. */
        CmiReference(msg);
        CmiSendNetworkFunc(CmiNodeFirst(nd), size, msg, BCAST_SYNC);
#else
        {
            /* Each child gets its own private copy of the message. */
            char *dup = CopyMsg(msg, size);
            CmiSendNetworkFunc(CmiNodeFirst(nd), size, dup, BCAST_SYNC);
        }
#endif
    }

    CMI_DEST_RANK(msg) = savedRank;
#endif
}
CmiCommHandle CmiAsyncBroadcastFn(int size, char *msg) { #if ENSURE_MSG_PAIRORDER /* Not sure how to add the msg seq no for async broadcast messages --Chao Mei */ /* so abort here ! */ CmiAssert(0); return 0; #else int i, rank; int mype = CmiMyPe(); #if ENABLE_CONVERSE_QD CQdCreate(CpvAccess(cQdState), CmiNumPes()-1); #endif MACHSTATE1(3,"[%d] Sending async broadcast message from {",CmiMyNode()); CMI_BROADCAST_ROOT(msg) = 0; void *handle = malloc(sizeof(int)); *((int *)handle) = CmiNumPes()-1; for (i=mype+1; i<CmiNumPes(); i++) { CMI_DEST_RANK(msg) = CmiRankOf(i); lapiSendFn(CmiNodeOf(i), size, msg, DeliveredMsg, handle); } for (i=0; i<mype; i++) { CMI_DEST_RANK(msg) = CmiRankOf(i); lapiSendFn(CmiNodeOf(i), size, msg, DeliveredMsg, handle); } MACHSTATE(3,"} Sending async broadcast message end"); return handle; #endif }
static CmiCommHandle MachineSendFuncForLAPI(int destNode, int size, char *msg, int mode) { scompl_hndlr_t *shdlr = NULL; void *sinfo = NULL; if (mode==P2P_SYNC) { shdlr = ReleaseMsg; sinfo = (void *)msg; } else if (mode==P2P_ASYNC) { shdlr = DeliveredMsg; sinfo = malloc(sizeof(int)); *((int *)sinfo) = 1; } CMI_MSG_SIZE(msg) = size; #if ENSURE_MSG_PAIRORDER #if CMK_NODE_QUEUE_AVAILABLE if (CMI_DEST_RANK(msg) == DGRAM_NODEMESSAGE) { lapiSendFn(destNode, size, msg, shdlr, sinfo); return sinfo; } #endif int destPE = CmiNodeFirst(destNode)+CMI_DEST_RANK(msg); CMI_MSG_SRCPE(msg) = CmiMyPe(); /* Note: This could be executed on comm threads, where CmiMyPe() >= CmiNumPes() */ CMI_MSG_SEQNO(msg) = getNextMsgSeqNo(CpvAccess(p2pMsgSeqInfo).nextMsgSeqNo, destPE); setNextMsgSeqNo(CpvAccess(p2pMsgSeqInfo).nextMsgSeqNo, destPE, CMI_MSG_SEQNO(msg)); #endif lapiSendFn(destNode, size, msg, shdlr, sinfo); return sinfo; }
/* Functions regarding broadcat op that sends to every one else except me */ void CmiSyncBroadcastFn1(int size, char *msg) { int i, mype; CQdCreate(CpvAccess(cQdState), CmiNumPes()-1); /*record the rank to avoid re-sending the msg in spanning tree or hypercube*/ CMI_DEST_RANK(msg) = CmiMyRank(); #if CMK_BROADCAST_SPANNING_TREE CMI_SET_BROADCAST_ROOT(msg, CmiMyNode()+1); SendSpanningChildrenProc(size, msg); #elif CMK_BROADCAST_HYPERCUBE CMI_SET_BROADCAST_ROOT(msg, CmiMyNode()+1); SendHyperCubeProc(size, msg); #else mype = CmiMyPe(); #if CMK_SMP /* In SMP, this function may be called from comm thread with a larger pe */ if(mype >= _Cmi_numpes){ for(i=0; i<_Cmi_numpes; i++) CmiSyncSendFn(i, size, msg); return; } #endif for ( i=mype+1; i<_Cmi_numpes; i++ ) CmiSyncSendFn(i, size, msg) ; for ( i=0; i<mype; i++ ) CmiSyncSendFn(i, size, msg) ; #endif /*CmiPrintf("In SyncBroadcast broadcast\n");*/ }
/**
 * Issue one LAPI active-message transfer carrying msg to destNode.
 *
 * The whole message travels as user data (no user header); the receiving
 * side is dispatched through lapiHeaderHandler. No LAPI counters are used;
 * completion on the sending side is reported through shdlr/sinfo.
 *
 * @param destNode target node id
 * @param size     number of bytes of msg to transfer
 * @param msg      message buffer
 * @param shdlr    send-completion handler, may be NULL
 * @param sinfo    argument handed to shdlr, may be NULL
 */
static INLINE_KEYWORD void lapiSendFn(int destNode, int size, char *msg, scompl_hndlr_t *shdlr, void *sinfo) {
    lapi_xfer_t xfer_cmd;

    MACHSTATE3(2,"lapiSendFn to destNode=%d with msg %p (isImm=%d) begin {",destNode,msg, CmiIsImmediate(msg));
    MACHSTATE3(2, "inside lapiSendFn 1: size=%d, sinfo=%p, deliverable=%d", size, sinfo, deliverable);
    MACHSTATE2(2, "Ready to call LAPI_Xfer with destNode=%d, destRank=%d",destNode,CMI_DEST_RANK(msg));

    /* Transfer kind and routing. */
    xfer_cmd.Am.Xfer_type = LAPI_AM_XFER;
    xfer_cmd.Am.flags     = 0;
    xfer_cmd.Am.tgt       = destNode;
    xfer_cmd.Am.hdr_hdl   = lapiHeaderHandler;

    /* Payload: everything goes as user data, the user header stays empty. */
    xfer_cmd.Am.udata     = msg;
    xfer_cmd.Am.udata_len = size;
    xfer_cmd.Am.uhdr      = NULL;
    xfer_cmd.Am.uhdr_len  = 0;

    /* Sender-side completion callback. */
    xfer_cmd.Am.shdlr     = shdlr;
    xfer_cmd.Am.sinfo     = sinfo;

    /* No target, origin or completion counters. */
    xfer_cmd.Am.tgt_cntr  = NULL;
    xfer_cmd.Am.org_cntr  = NULL;
    xfer_cmd.Am.cmpl_cntr = NULL;

    check_lapi(LAPI_Xfer,(lapiContext, &xfer_cmd));

    MACHSTATE(2,"} lapiSendFn end");
}
/**
 * Forward a broadcast message along the hypercube rooted at startNode.
 *
 * Node ids are taken relative to startNode. A node at relative distance N
 * whose lowest set bit is 2^B forwards to N+2^(B-1), N+2^(B-2), ..., N+1;
 * the root (N == 0) uses B = CmiNodesDim. Targets at or beyond
 * CmiNumNodes() are skipped.
 *
 * Example with 9 nodes and root 0:
 *   0 --> 8, 4, 2, 1
 *   4 --> 6, 5
 *   2 --> 3
 *   6 --> 7
 *
 * Compiles to a no-op unless CMK_BROADCAST_HYPERCUBE is set.
 *
 * @param size         size of msg in bytes
 * @param msg          message to relay; its destination rank is temporarily
 *                     overwritten and restored before returning
 * @param rankToAssign destination rank stamped on every forwarded copy
 * @param startNode    node id of the broadcast root
 */
static void SendHyperCube(int size, char *msg, int rankToAssign, int startNode) {
#if CMK_BROADCAST_HYPERCUBE
    const int numDims = CmiNodesDim;
    const int savedRank = CMI_DEST_RANK(msg);
    int relDist, lowBit, step;

    /* Stamp the rank once up front instead of once per forwarded message. */
    CMI_DEST_RANK(msg) = rankToAssign;

    /* Distance of this node from the broadcast root, modulo the node count. */
    relDist = CmiMyNode() - startNode;
    if (relDist < 0) {
        relDist += CmiNumNodes();
    }

    /* 2^B: the lowest set bit of relDist, or 2^dims on the root itself. */
    lowBit = (relDist == 0) ? (1 << numDims) : (relDist & ((~relDist) + 1));

    /* Walk the lower dimensions from the farthest neighbour to the nearest. */
    for (step = lowBit >> 1; step > 0; step >>= 1) {
        const int relTarget = relDist + step;

        /* The cube may be incomplete: only send to nodes that exist. */
        if (relTarget < CmiNumNodes()) {
            const int nd = (relTarget + startNode) % CmiNumNodes();
            CmiAssert(nd>=0 && nd!=CmiMyNode());

#if CMK_BROADCAST_USE_CMIREFERENCE
            /* Share the same buffer: take an extra reference per send. */
            CmiReference(msg);
            CmiSendNetworkFunc(CmiNodeFirst(nd), size, msg, BCAST_SYNC);
#else
            {
                /* Each neighbour gets its own private copy of the message. */
                char *dup = CopyMsg(msg, size);
                CmiSendNetworkFunc(CmiNodeFirst(nd), size, dup, BCAST_SYNC);
            }
#endif
        }
    }

    CMI_DEST_RANK(msg) = savedRank;
#endif
}
/**
 * LAPI completion handler on the receiving side. It hands a fully received
 * message over to handleOneRecvedMsg, after (optionally) enforcing pairwise
 * in-order delivery. --Chao Mei
 *
 * Threading notes (from the original author):
 * The completion handler could be executed on any core within a node (???),
 * so in SMP mode with a comm thread it must be dealt with carefully. LAPI
 * also provides an internal thread for network progress which will call
 * this function (???), so the following situations need care:
 *   1) non SMP mode, with interrupt (lapi internal completion thread)
 *   2) non SMP mode, with polling (machine layer is responsible for
 *      network progress)
 *   3) SMP mode, no comm thread, with polling
 *   4) SMP mode, no comm thread, with interrupt
 *   5) SMP mode, with comm thread, with polling (not yet implemented, comm
 *      server is empty right now)
 *   6) SMP mode, with comm thread, with interrupt??
 * Currently, SMP mode without comm thread is undergoing implementation.
 *
 * This function is executed by LAPI internal threads. The number of
 * internal completion-handler threads appears to vary at run time: LAPI
 * adaptively creates more threads when more messages are outstanding. This
 * means pcqueue needs protection even in the non-SMP case.
 *
 * @param myLapiContext LAPI handle (unused here)
 * @param am_info       the received message
 */
static void PumpMsgsComplete(lapi_handle_t *myLapiContext, void *am_info) {
    char *msg = am_info;

    MACHSTATE3(2,"[%d] PumpMsgsComplete with msg %p (isImm=%d) begin {",CmiMyNode(), msg, CmiIsImmediate(msg));

#if ENSURE_MSG_PAIRORDER
    MACHSTATE3(2,"msg %p info: srcpe=%d, seqno=%d", msg, CMI_MSG_SRCPE(msg), CMI_MSG_SEQNO(msg));

    /*
     * In-order delivery: this is the receive-side handler, and cross-network
     * messages are assumed to always be delivered to the first proc (rank 0)
     * of this node, so the source PE of broadcast messages and the next
     * sequence number are selected accordingly. --Chao Mei
     */
    {
        const int broot = CMI_BROADCAST_ROOT(msg);
        const int destrank = CMI_DEST_RANK(msg);
        int isProcLevel = (broot >= 0);

#if CMK_NODE_QUEUE_AVAILABLE
        /* Node-level messages are exempt from the ordering check. */
        if (destrank == DGRAM_NODEMESSAGE) {
            isProcLevel = 0;
        }
#endif

        if (isProcLevel) {
            MsgOrderInfo *info = &CpvAccessOther(p2pMsgSeqInfo, destrank);
            MACHSTATE1(2, "Check msg in-order for p2p msg %p", msg);
            /* Non-zero means the message was buffered or already delivered
             * by the ordering logic; nothing more to do here. */
            if (checkMsgInOrder(msg,info)) {
                MACHSTATE(2,"} PumpMsgsComplete end ");
                return;
            }
        }
    }
#endif

    /* NOTE(review): broadcast relaying and the push to the destination proc
     * presumably happen inside handleOneRecvedMsg -- confirm. */
    handleOneRecvedMsg(CMI_MSG_SIZE(msg), msg);

    MACHSTATE(2,"} PumpMsgsComplete end ");
}
CmiCommHandle CmiAsyncNodeBroadcastFn(int size, char *msg) { int i; #if ENABLE_CONVERSE_QD CQdCreate(CpvAccess(cQdState), CmiNumNodes()-1); #endif MACHSTATE1(3,"[%d] Sending async node broadcast message from {",CmiMyNode()); CMI_BROADCAST_ROOT(msg) = 0; CMI_DEST_RANK(msg) =DGRAM_NODEMESSAGE; void *handle = malloc(sizeof(int)); *((int *)handle) = CmiNumNodes()-1; for (i=CmiMyNode()+1; i<CmiNumNodes(); i++) { lapiSendFn(i, size, msg, DeliveredMsg, handle); } for (i=0; i<CmiMyNode(); i++) { lapiSendFn(i, size, msg, DeliveredMsg, handle); } MACHSTATE(3,"} Sending async broadcast message end"); return handle; }
/**
 * Handle a processor-level broadcast message that arrived on an
 * intermediate node: relay it onward (spanning tree or hypercube, depending
 * on the build) and then deliver it locally to rank 0.
 *
 * @param size size of msg in bytes
 * @param msg  the received broadcast message; its destination rank must be 0
 */
static INLINE_KEYWORD void processProcBcastMsg(int size, char *msg) {
    /* This function only runs on intermediate nodes, where the message is
     * always addressed to rank 0. */
    CmiAssert(CMI_DEST_RANK(msg)==0);

    /* Step 1: relay to the nodes below us in the broadcast topology. */
#if CMK_BROADCAST_SPANNING_TREE
    SendSpanningChildrenProc(size, msg);
#elif CMK_BROADCAST_HYPERCUBE
    SendHyperCubeProc(size, msg);
#endif

#if CMK_BROADCAST_SPANNING_TREE && CMK_BROADCAST_USE_CMIREFERENCE
    /* Step 2: if the buffer is still referenced by sends in flight, deliver
     * a private copy locally and drop our reference to the shared one. */
    if (CmiNumNodes()>1 && CmiGetReference(msg)>1) {
        char *dup = CopyMsg(msg, size);
        CmiFree(msg);
        msg = dup;
    }
#endif

    /* Step 3: local delivery to rank 0. */
    CmiPushPE(0, msg);
}
/**
 * Enforce in-order delivery of messages coming from one source PE.
 *
 * Three outcomes are possible:
 *  - msg carries the expected sequence number and nothing is buffered:
 *    the expected sequence number is advanced and 0 is returned; the caller
 *    still has to deliver msg.
 *  - msg carries the expected sequence number and out-of-order (ooo)
 *    messages are buffered: msg plus the contiguous run of buffered
 *    messages that follows it are delivered here, the window is slid, and
 *    1 is returned.
 *  - msg is out of order: it is stored in the per-source window (which is
 *    grown on demand, up to MAX_WINDOW_SIZE) and 1 is returned.
 *
 * Window layout: slot k of the buffer holds the message whose sequence
 * number is (expected + k + 1); oooMaxOffset is one past the highest
 * occupied slot.
 *
 * All bookkeeping is done under cmplHdlrThdLock; the actual delivery of
 * messages happens after the lock is released, on a private copy of the
 * pointers, to keep the critical section short.
 *
 * NOTE(review): setNextExpectedMsgSeqNo is called with the sequence number
 * of the last message processed, so it presumably stores "value + 1"
 * (with wrap-around at MAX_MSG_SEQNO) -- confirm against its definition.
 *
 * Aborts via CmiAbort if the window would exceed MAX_WINDOW_SIZE or if a
 * buffer cannot be allocated.
 *
 * --Chao Mei
 *
 * @param msg  the message just received
 * @param info ordering state of the destination rank
 * @return 1 if msg needs no further processing by the caller, 0 otherwise
 */
static int checkMsgInOrder(char *msg, MsgOrderInfo *info) {
    int srcpe;
    int incomingSeqNo, expectedSeqNo;
    int curOffset, maxOffset;
    int i, curWinSize;
    void **destMsgBuffer = NULL;
    /* Private list of messages to deliver once the lock is released;
     * numMsgs is the number of valid entries in it. */
    void **toProcessMsgBuffer;
    int numMsgs = 0;

    srcpe = CMI_MSG_SRCPE(msg);
    incomingSeqNo = CMI_MSG_SEQNO(msg);

    CmiLock(cmplHdlrThdLock);

    expectedSeqNo = getNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe);
    if (expectedSeqNo == incomingSeqNo) {
        /* Two cases: has ooo msg buffered or not */
        maxOffset = (info->oooMaxOffset)[srcpe];
        if (maxOffset>0) {
            MACHSTATE1(4, "Processing all buffered ooo msgs (maxOffset=%d) including the just recved begin {", maxOffset);
            curWinSize = info->CUR_WINDOW_SIZE[srcpe];
            /* +1 for the message that has just arrived. */
            toProcessMsgBuffer = malloc((curWinSize+1)*sizeof(void *));
            if (toProcessMsgBuffer == NULL) {
                CmiAbort("checkMsgInOrder: out of memory allocating the to-process msg buffer!\n");
            }

            /* process the msg just recved */
            toProcessMsgBuffer[numMsgs++] = msg;

            /* process the buffered ooo msg until the first empty slot in the window */
            destMsgBuffer = (info->oooMsgBuffer)[srcpe];
            for (curOffset=0; curOffset<maxOffset; curOffset++) {
                char *curMsg = destMsgBuffer[curOffset];
                if (curMsg == NULL) {
                    /* The highest slot below maxOffset must be occupied. */
                    CmiAssert(curOffset!=(maxOffset-1));
                    break;
                }
                toProcessMsgBuffer[numMsgs++] = curMsg;
                destMsgBuffer[curOffset] = NULL;
            }

            /* Update expected seqno, maxOffset and slide the window */
            if (curOffset < maxOffset) {
                /*
                 * A hole was found at slot curOffset. The last message
                 * processed has seqno expectedSeqNo+curOffset, so the next
                 * one to be received is expectedSeqNo+curOffset+1 (the
                 * hole). The window therefore slides by curOffset+1 slots
                 * so that slot 0 again refers to the message one seqno
                 * ahead of the next expected one. --Chao Mei
                 */
                const int remaining = maxOffset-curOffset-1;

                /* moving [curOffset+1, maxOffset) to [0, remaining) */
                for (i=0; i<remaining; i++) {
                    destMsgBuffer[i] = destMsgBuffer[curOffset+i+1];
                }
                /* clear the vacated tail */
                for (i=remaining; i<maxOffset; i++) {
                    destMsgBuffer[i] = NULL;
                }
                (info->oooMaxOffset)[srcpe] = remaining;
                setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe, expectedSeqNo+curOffset);
            } else {
                /* there's no remaining buffered ooo msgs */
                (info->oooMaxOffset)[srcpe] = 0;
                setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe, expectedSeqNo+maxOffset);
            }

            CmiUnlock(cmplHdlrThdLock);

            /* Process the msgs (outside the lock) */
            for (i=0; i<numMsgs; i++) {
                char *curMsg = toProcessMsgBuffer[i];
                if (CMI_BROADCAST_ROOT(curMsg)>0) {
#if CMK_OFFLOAD_BCAST_PROCESS
                    PCQueuePush(CsvAccess(procBcastQ), curMsg);
#else
                    processProcBcastMsg(CMI_MSG_SIZE(curMsg), curMsg);
#endif
                } else {
                    CmiPushPE(CMI_DEST_RANK(curMsg), curMsg);
                }
            }

            free(toProcessMsgBuffer);

            MACHSTATE1(4, "Processing all buffered ooo msgs (actually processed %d) end }", curOffset);
            /*
             * The just-received message was delivered together with the
             * buffered ones, so the caller must not process it again.
             */
            return 1;
        } else {
            /* An expected msg recved without any ooo msg buffered */
            MACHSTATE1(4, "Receiving an expected msg with seqno=%d\n", incomingSeqNo);
            setNextExpectedMsgSeqNo(info->expectedMsgSeqNo, srcpe, expectedSeqNo);

            CmiUnlock(cmplHdlrThdLock);
            return 0;
        }
    }

    MACHSTATE2(4, "Receiving an out-of-order msg with seqno=%d, but expect seqno=%d", incomingSeqNo, expectedSeqNo);
    curWinSize = info->CUR_WINDOW_SIZE[srcpe];

    /* Lazily create the window for this source PE. */
    if ((info->oooMsgBuffer)[srcpe]==NULL) {
        void **freshWindow = malloc(curWinSize*sizeof(void *));
        if (freshWindow == NULL) {
            CmiAbort("checkMsgInOrder: out of memory allocating the ooo msg window!\n");
        }
        memset(freshWindow, 0, curWinSize*sizeof(void *));
        (info->oooMsgBuffer)[srcpe] = freshWindow;
    }
    destMsgBuffer = (info->oooMsgBuffer)[srcpe];

    curOffset = incomingSeqNo - expectedSeqNo;
    maxOffset = (info->oooMaxOffset)[srcpe];
    if (curOffset<0) {
        /* It's possible that the seqNo starts with another round (exceeding MAX_MSG_SEQNO) with 1 */
        curOffset += MAX_MSG_SEQNO;
    }

    if (curOffset > curWinSize) {
        int newWinSize;
        void **grownWindow;

        if (curOffset > MAX_WINDOW_SIZE) {
            CmiAbort("Exceeding the MAX_WINDOW_SIZE!\n");
        }
        /* Grow to the next multiple of the current size that fits curOffset. */
        newWinSize = ((curOffset/curWinSize)+1)*curWinSize;

        grownWindow = malloc(newWinSize*sizeof(void *));
        if (grownWindow == NULL) {
            CmiAbort("checkMsgInOrder: out of memory growing the ooo msg window!\n");
        }
        memset(grownWindow, 0, newWinSize*sizeof(void *));
        memcpy(grownWindow, destMsgBuffer, curWinSize*sizeof(void *));
        free(destMsgBuffer);

        /* Publish the new window only once it is fully populated. */
        (info->oooMsgBuffer)[srcpe] = grownWindow;
        info->CUR_WINDOW_SIZE[srcpe] = newWinSize;
        destMsgBuffer = grownWindow;
    }

    /* Slot k holds seqno expected+k+1, hence the -1. */
    CmiAssert(destMsgBuffer[curOffset-1] == NULL);
    destMsgBuffer[curOffset-1] = msg;
    if (curOffset > maxOffset) (info->oooMaxOffset)[srcpe] = curOffset;

    CmiUnlock(cmplHdlrThdLock);
    return 1;
}