/**
 * Issue a LAPI active-message transfer of the whole message `msg`
 * (`size` bytes) to `destNode`.
 *
 * `shdlr`/`sinfo` form the sender-side completion callback: LAPI invokes
 * shdlr(sinfo) once the origin buffer may be reused.  The receive side is
 * dispatched through lapiHeaderHandler.  No user header is sent; the
 * entire message travels as udata.
 */
static INLINE_KEYWORD void lapiSendFn(int destNode, int size, char *msg,
                                      scompl_hndlr_t *shdlr, void *sinfo) {
    lapi_xfer_t xfer_cmd;

    MACHSTATE3(2,"lapiSendFn to destNode=%d with msg %p (isImm=%d) begin {",
               destNode, msg, CmiIsImmediate(msg));
    /* Fix: the original debug statement also printed an identifier
     * `deliverable` that is not declared in this scope, breaking builds
     * with the MACHSTATE debug macros enabled. Print only in-scope values. */
    MACHSTATE2(2, "inside lapiSendFn 1: size=%d, sinfo=%p", size, sinfo);
    MACHSTATE2(2, "Ready to call LAPI_Xfer with destNode=%d, destRank=%d",
               destNode, CMI_DEST_RANK(msg));

    /* Build the active-message transfer command. */
    xfer_cmd.Am.Xfer_type = LAPI_AM_XFER;
    xfer_cmd.Am.flags     = 0;
    xfer_cmd.Am.tgt       = destNode;
    xfer_cmd.Am.hdr_hdl   = lapiHeaderHandler;
    xfer_cmd.Am.uhdr_len  = 0;     /* no user header */
    xfer_cmd.Am.uhdr      = NULL;
    xfer_cmd.Am.udata     = msg;
    xfer_cmd.Am.udata_len = size;
    xfer_cmd.Am.shdlr     = shdlr;
    xfer_cmd.Am.sinfo     = sinfo;
    /* No LAPI counters: completion is tracked via the shdlr callback only. */
    xfer_cmd.Am.tgt_cntr  = NULL;
    xfer_cmd.Am.org_cntr  = NULL;
    xfer_cmd.Am.cmpl_cntr = NULL;

    check_lapi(LAPI_Xfer, (lapiContext, &xfer_cmd));

    MACHSTATE(2,"} lapiSendFn end");
}
/**
 * lapi completion handler on the recv side. It's responsible to push messages
 * to the destination proc or relay broadcast messages. --Chao Mei
 *
 * Note: The completion handler could be executed on any cores within a node ???
 * So in SMP mode when there's a comm thread, the completion handler should be
 * carefully dealt with.
 *
 * Given lapi also provides an internal lapi thread to deal with network progress
 * which will call this function (???), we should be careful with the following
 * situations:
 * 1) non SMP mode, with interrupt (lapi internal completion thread)
 * 2) non SMP mode, with polling (machine layer is responsible for network progress)
 * 3) SMP mode, no comm thread, with polling
 * 4) SMP mode, no comm thread, with interrupt
 * 5) SMP mode, with comm thread, with polling (not yet implemented, comm server
 *    is empty right now)
 * 6) SMP mode, with comm thread, with interrupt??
 *
 * Currently, SMP mode without comm thread is undergoing implementation.
 *
 * This function is executed by LAPI internal threads. It seems that the number
 * of internal completion handler threads could vary during the program. LAPI
 * adaptively creates more threads if there are more outstanding messages!!!!
 * This means pcqueue needs protection even in the nonsmp case!!!!
 *
 * --Chao Mei
 */
static void PumpMsgsComplete(lapi_handle_t *myLapiContext, void *am_info) {
    char *msg = am_info;

    MACHSTATE3(2,"[%d] PumpMsgsComplete with msg %p (isImm=%d) begin {",
               CmiMyNode(), msg, CmiIsImmediate(msg));

#if ENSURE_MSG_PAIRORDER
    MACHSTATE3(2,"msg %p info: srcpe=%d, seqno=%d", msg,
               CMI_MSG_SRCPE(msg), CMI_MSG_SEQNO(msg));

    /*
     * For in-order delivery: this is the handler for receiving a message,
     * and we assume cross-network msgs are always delivered to the first
     * proc (rank 0) of this node, so we select the srcpe of the bcast msgs
     * and the next msg seq no correspondingly.  --Chao Mei
     *
     * (Fix: broot/destrank/i were previously declared unconditionally;
     * `i` was unused and the others are only meaningful in this branch.)
     */
    {
        int broot = CMI_BROADCAST_ROOT(msg);
        int destrank = CMI_DEST_RANK(msg);
        /* Only check proc-level msgs */
        if (broot >= 0
#if CMK_NODE_QUEUE_AVAILABLE
            && destrank != DGRAM_NODEMESSAGE
#endif
           ) {
            MsgOrderInfo *info = &CpvAccessOther(p2pMsgSeqInfo, destrank);
            MACHSTATE1(2, "Check msg in-order for p2p msg %p", msg);
            if (checkMsgInOrder(msg, info)) {
                /* Message buffered until its predecessors arrive. */
                MACHSTATE(2,"} PumpMsgsComplete end ");
                return;
            }
        }
    }
#endif

    /* Normal path: hand the message to the upper layer (this also relays
     * spanning-tree broadcasts and delivers to local ranks). */
    handleOneRecvedMsg(CMI_MSG_SIZE(msg), msg);
    MACHSTATE(2,"} PumpMsgsComplete end ");
    return;
}
/**
 * Drain one SysV shared-memory receive buffer: walk the dgram-framed
 * messages it holds, copy each into a fresh CmiAlloc'd private buffer,
 * hand it over to the upper layer, then reset the buffer's header so the
 * sender can reuse the region.
 *
 * NOTE(review): presumably the caller holds the buffer's lock while this
 * runs — confirm against the call sites.
 */
void emptyRecvBuf(sharedBufData *recvBuf) {
    int numMessages = recvBuf->header->count;
    int i;
    char *ptr = recvBuf->data;

    for (i = 0; i < numMessages; i++) {
        int size;
        /* Fix: the original also redeclared `i` here, shadowing the loop
         * index; the shadow was never used and has been removed. */
        int rank, srcpe, seqno, magic;
        unsigned int broot;
        char *msg = ptr;
        char *newMsg;

        DgramHeaderBreak(msg, rank, srcpe, magic, seqno, broot);
        size = CmiMsgHeaderGetLength(msg);
        /* Copy out of the shared region: it is recycled as soon as
         * count/bytes are reset below. */
        newMsg = (char *)CmiAlloc(size);
        memcpy(newMsg, msg, size);
        handoverSysvshmMessage(newMsg, size, rank, broot);
        ptr += size;
        /* Fix: the format string had four %d conversions for only three
         * arguments; the dangling "bytes %d" has been removed. */
        MACHSTATE3(3,"message of size %d recvd ends at ptr-data %d total bytes %d",
                   size, ptr - recvBuf->data, recvBuf->header->bytes);
    }

    /* All counted messages must exactly account for the recorded bytes. */
    CmiAssert(ptr - recvBuf->data == recvBuf->header->bytes);
    recvBuf->header->count = 0;
    recvBuf->header->bytes = 0;
}
/**
 * Send `ogm` to another process on the same physical node through the SysV
 * shared-memory buffer for its destination rank.  If the buffer's semaphore
 * cannot be acquired without blocking, the message is queued on the per-rank
 * sendQ (with a reference taken) and sent later; otherwise the pending queue
 * is flushed first and then this message is written.
 */
void CmiSendMessageSysvshm(OutgoingMsg ogm,OtherNode node,int rank,unsigned int broot){
	/* NOTE(review): sb is passed to semop() below without any visible
	 * initialization in this function — presumably ACQUIRENW/RELEASE are
	 * macros that fill it in; confirm their definitions, otherwise this
	 * reads an uninitialized struct. */
	struct sembuf sb;
#if SYSVSHM_STATS
	double _startSendTime = CmiWallTimer();
#endif
	int dstRank = SysvshmRank(ogm->dst);
	MEMDEBUG(CmiMemoryCheck());
	/* Stamp the dgram header so the receiver can recover rank/src/seq info. */
	DgramHeaderMake(ogm->data,rank,ogm->src,Cmi_charmrun_pid,1, broot);
	MACHSTATE4(3,"Send Msg Sysvshm ogm %p size %d dst %d dstRank %d",ogm,ogm->size,ogm->dst,dstRank);
	/* Must target a different rank on this node. */
	CmiAssert(dstRank >=0 && dstRank != sysvshmContext->noderank);
	sharedBufData *dstBuf = &(sysvshmContext->sendBufs[dstRank]);
	ACQUIRENW(sysvshmContext->noderank);
	/* Non-blocking lock attempt on the destination buffer's semaphore. */
	if(semop(dstBuf->semid, &sb, 1)<0) {
		/**failed to get the lock insert into q and retain the message*/
		pushSendQ(sysvshmContext->sendQs[dstRank],ogm);
		ogm->refcount++;
		MEMDEBUG(CmiMemoryCheck());
		/* NOTE(review): this early return skips the SYSVSHM_STATS
		 * accounting at the bottom — confirm that is intended. */
		return;
	}else{
		/***
		 * We got the lock for this buffer
		 * first write all the messages in the sendQ and then write this guy
		 * */
		if(sysvshmContext->sendQs[dstRank]->numEntries == 0){
			/* send message user event */
			int ret = sendMessage(ogm,dstBuf,sysvshmContext->sendQs[dstRank]);
			MACHSTATE(3,"Sysvshm Send succeeded immediately");
		}else{
			ogm->refcount+=2;/*this message should not get deleted when the queue is flushed*/
			pushSendQ(sysvshmContext->sendQs[dstRank],ogm);
			MACHSTATE3(3,"Sysvshm ogm %p pushed to sendQ length %d refcount %d",ogm,sysvshmContext->sendQs[dstRank]->numEntries,ogm->refcount);
			int sent = flushSendQ(dstRank);
			ogm->refcount--; /*if it has been sent, can be deleted by caller, if not will be deleted when queue is flushed*/
			MACHSTATE1(3,"Sysvshm flushSendQ sent %d messages",sent);
		}
		/* unlock the recvbuffer*/
		RELEASE(sysvshmContext->noderank);
		/* NOTE(review): the unlocking semop() lives inside CmiAssert —
		 * if CmiAssert ever compiles to a no-op, the lock is never
		 * released; consider hoisting the call out of the assert. */
		CmiAssert(semop(dstBuf->semid, &sb, 1)>=0);
	}
#if SYSVSHM_STATS
	sysvshmContext->sendCount ++;
	sysvshmContext->sendTime += (CmiWallTimer()-_startSendTime);
#endif
	MEMDEBUG(CmiMemoryCheck());
};
/****************
 * Copy ogm's payload into the shared buffer; if it does not fit,
 * queue it on dstSendQ instead (taking an extra reference).
 * Returns 1 when the copy happened, 0 when the message was queued.
 * NOTE: This method is called only after obtaining the corresponding mutex.
 ********/
int sendMessage(OutgoingMsg ogm, sharedBufData *dstBuf, SysvshmSendQ *dstSendQ) {
    int used = dstBuf->header->bytes;

    if (used + ogm->size > SHMBUFLEN) {
        /* Shared buffer is too full for this message: retain and queue it.
         * (Stray stdout printf kept for behavioral parity — consider
         * routing through MACHSTATE instead.) */
        printf("send buffer is too full\n");
        pushSendQ(dstSendQ, ogm);
        ogm->refcount++;
        MACHSTATE3(3,"Sysvshm send ogm %p size %d queued refcount %d",ogm,ogm->size,ogm->refcount);
        return 0;
    }

    /* Append the message right after the bytes already pending, then
     * advance the header's bookkeeping. */
    dstBuf->header->count++;
    memcpy(dstBuf->data + used, ogm->data, ogm->size);
    dstBuf->header->bytes = used + ogm->size;
    MACHSTATE4(3,"Sysvshm send done ogm %p size %d dstBuf->header->count %d dstBuf->header->bytes %d",ogm,ogm->size,dstBuf->header->count,dstBuf->header->bytes);
    return 1;
}
/**
 * Derive this process's node geometry from the +nodesize command-line
 * option: how many cores share this node, this process's rank within the
 * node, and the first/last global ranks belonging to the node.  The last
 * node may be truncated when _Cmi_numnodes is not a multiple of nodesize.
 */
void calculateNodeSizeAndRank(char **argv) {
    int nsize, nrank, nstart, nend;

    sysvshmContext->nodesize = 1;
    MACHSTATE(3,"calculateNodeSizeAndRank start");
    CmiGetArgIntDesc(argv, "+nodesize", &(sysvshmContext->nodesize),
                     "Number of cores in this node");
    nsize = sysvshmContext->nodesize;
    MACHSTATE1(3,"calculateNodeSizeAndRank argintdesc %d", nsize);

    nrank = _Cmi_mynode % nsize;
    sysvshmContext->noderank = nrank;
    MACHSTATE1(3,"calculateNodeSizeAndRank noderank %d", nrank);

    nstart = _Cmi_mynode - nrank;
    sysvshmContext->nodestart = nstart;
    MACHSTATE(3,"calculateNodeSizeAndRank nodestart ");

    /* Clamp the last node to the actual number of nodes. */
    nend = nstart + nsize - 1;
    if (nend >= _Cmi_numnodes) {
        nend = _Cmi_numnodes - 1;
        sysvshmContext->nodesize = (nend - nstart) + 1;
    }
    sysvshmContext->nodeend = nend;

    MACHSTATE3(3,"calculateNodeSizeAndRank nodestart %d nodesize %d noderank %d",
               sysvshmContext->nodestart, sysvshmContext->nodesize,
               sysvshmContext->noderank);
}
void calculateNodeSizeAndRank(char **argv){ pxshmContext->nodesize=1; MACHSTATE(3,"calculateNodeSizeAndRank start"); //CmiGetArgIntDesc(argv, "+nodesize", &(pxshmContext->nodesize),"Number of cores in this node (for non-smp case).Used by the shared memory communication layer"); CmiGetArgIntDesc(argv, "+nodesize", &(pxshmContext->nodesize),"Number of cores in this node"); MACHSTATE1(3,"calculateNodeSizeAndRank argintdesc %d",pxshmContext->nodesize); pxshmContext->noderank = _Cmi_mynode % (pxshmContext->nodesize); MACHSTATE1(3,"calculateNodeSizeAndRank noderank %d",pxshmContext->noderank); pxshmContext->nodestart = _Cmi_mynode -pxshmContext->noderank; MACHSTATE(3,"calculateNodeSizeAndRank nodestart "); pxshmContext->nodeend = pxshmContext->nodestart + pxshmContext->nodesize -1; if(pxshmContext->nodeend >= _Cmi_numnodes){ pxshmContext->nodeend = _Cmi_numnodes-1; pxshmContext->nodesize = (pxshmContext->nodeend - pxshmContext->nodestart) +1; } MACHSTATE3(3,"calculateNodeSizeAndRank nodestart %d nodesize %d noderank %d",pxshmContext->nodestart,pxshmContext->nodesize,pxshmContext->noderank); }
/**
 * Poll every other rank's receive buffer; for each non-empty one, try to
 * acquire its lock without blocking (spinlock / semaphore / fence protocol,
 * selected at compile time) and drain it via emptyRecvBuf().
 *
 * NOTE(review): the #if branches below deliberately open an `if(` whose
 * closing brace lives outside the conditional region — exactly one lock
 * variant must be enabled for the braces to balance.
 */
inline void emptyAllRecvBufs(){
    int i;
    for(i=0;i<pxshmContext->nodesize;i++){
        if(i != pxshmContext->noderank){
            sharedBufData *recvBuf = &(pxshmContext->recvBufs[i]);
            if(recvBuf->header->count > 0){
#if PXSHM_STATS
                pxshmContext->lockRecvCount++;
#endif
#if PXSHM_OSSPINLOCK
                if(! OSSpinLockTry(&recvBuf->header->lock)){
#elif PXSHM_LOCK
                if(sem_trywait(recvBuf->mutex) < 0){
#elif PXSHM_FENCE
                /* Peterson-style handshake: announce ourselves, yield the
                 * turn, then back off if the sender is active. */
                recvBuf->header->flagReceiver = 1;
                recvBuf->header->turn = SENDER;
                CmiMemoryReadFence(0,0);
                CmiMemoryWriteFence(0,0);
                //if((recvBuf->header->flagSender && recvBuf->header->turn == SENDER)){
                if((recvBuf->header->flagSender)){
                    recvBuf->header->flagReceiver = 0;
#endif
                }else{
                    /* Lock acquired: drain the buffer, then release. */
                    MACHSTATE1(3,"emptyRecvBuf to be called for rank %d",i);
                    emptyRecvBuf(recvBuf);
#if PXSHM_OSSPINLOCK
                    OSSpinLockUnlock(&recvBuf->header->lock);
#elif PXSHM_LOCK
                    sem_post(recvBuf->mutex);
#elif PXSHM_FENCE
                    CmiMemoryReadFence(0,0);
                    CmiMemoryWriteFence(0,0);
                    recvBuf->header->flagReceiver = 0;
#endif
                }
            }
        }
    }
};

/**
 * Flush every non-empty per-rank send queue whose destination buffer's
 * lock can be acquired without blocking.  With SENDQ_LIST enabled, only
 * the linked list of queues known to be non-empty is walked (and drained
 * queues are unlinked); otherwise all ranks are scanned.
 */
inline void flushAllSendQs(){
    int i;
#if SENDQ_LIST
    int index_prev = -1;
    i = sendQ_head_index;
    while (i!= -1) {
        PxshmSendQ *sendQ = pxshmContext->sendQs[i];
        CmiAssert(i != pxshmContext->noderank);
        if(sendQ->numEntries > 0){
#else
    for(i=0;i<pxshmContext->nodesize;i++) {
        if (i == pxshmContext->noderank) continue;
        PxshmSendQ *sendQ = pxshmContext->sendQs[i];
        if(sendQ->numEntries > 0) {
#endif
#if PXSHM_OSSPINLOCK
            if(OSSpinLockTry(&pxshmContext->sendBufs[i].header->lock)){
#elif PXSHM_LOCK
            if(sem_trywait(pxshmContext->sendBufs[i].mutex) >= 0){
#elif PXSHM_FENCE
            /* Sender side of the flag/turn handshake; proceed only if the
             * receiver is not simultaneously holding the buffer. */
            pxshmContext->sendBufs[i].header->flagSender = 1;
            pxshmContext->sendBufs[i].header->turn = RECEIVER;
            CmiMemoryReadFence(0,0);
            CmiMemoryWriteFence(0,0);
            if(!(pxshmContext->sendBufs[i].header->flagReceiver &&
                 pxshmContext->sendBufs[i].header->turn == RECEIVER)){
#endif
                MACHSTATE1(3,"flushSendQ %d",i);
                flushSendQ(sendQ);
#if PXSHM_OSSPINLOCK
                OSSpinLockUnlock(&pxshmContext->sendBufs[i].header->lock);
#elif PXSHM_LOCK
                sem_post(pxshmContext->sendBufs[i].mutex);
#elif PXSHM_FENCE
                CmiMemoryReadFence(0,0);
                CmiMemoryWriteFence(0,0);
                pxshmContext->sendBufs[i].header->flagSender = 0;
#endif
            }else{
#if PXSHM_FENCE
                /* Lost the race: withdraw our claim. */
                pxshmContext->sendBufs[i].header->flagSender = 0;
#endif
            }
        }
#if SENDQ_LIST
        /* Unlink fully-drained queues from the non-empty list
         * (next == -2 marks "not on the list"). */
        if (sendQ->numEntries == 0) {
            if (index_prev != -1)
                pxshmContext->sendQs[index_prev]->next = sendQ->next;
            else
                sendQ_head_index = sendQ->next;
            i = sendQ->next;
            sendQ->next = -2;
        } else {
            index_prev = i;
            i = sendQ->next;
        }
#endif
    }
};

/**
 * Drain one POSIX-shm receive buffer: copy each framed message into a
 * private CmiAlloc'd buffer, deliver it via handleOneRecvedMsg, then reset
 * the buffer header so the sender can reuse the region.
 * Caller is expected to hold the buffer's lock (see emptyAllRecvBufs).
 */
void emptyRecvBuf(sharedBufData *recvBuf){
    int numMessages = recvBuf->header->count;
    int i=0;
    char *ptr=recvBuf->data;

    for(i=0;i<numMessages;i++){
        int size;
        /* NOTE(review): this inner `i` shadows the loop index (unused);
         * rank/srcpe/seqno/magic/broot are also unused in this variant. */
        int rank, srcpe, seqno, magic, i;
        unsigned int broot;
        char *msg = ptr;
        char *newMsg;

        size = CMI_MSG_SIZE(msg);
        /* Copy out before the shared region is recycled below. */
        newMsg = (char *)CmiAlloc(size);
        memcpy(newMsg,msg,size);
        handleOneRecvedMsg(size, newMsg);
        ptr += size;
        /* NOTE(review): four %d conversions but only three arguments
         * follow — the trailing "bytes %d" prints garbage when MACHSTATE
         * is enabled. */
        MACHSTATE3(3,"message of size %d recvd ends at ptr-data %d total bytes %d bytes %d",size,ptr-recvBuf->data,recvBuf->header->bytes);
    }
#if 1
    /* Diagnostic: report any mismatch before the assert fires. */
    if(ptr - recvBuf->data != recvBuf->header->bytes){
        CmiPrintf("[%d] ptr - recvBuf->data %d recvBuf->header->bytes %d numMessages %d \n",
                  _Cmi_mynode, ptr - recvBuf->data, recvBuf->header->bytes,numMessages);
    }
#endif
    CmiAssert(ptr - recvBuf->data == recvBuf->header->bytes);
    recvBuf->header->count=0;
    recvBuf->header->bytes=0;
}

/**************
 * sendQ helper functions
 ****************/

/**
 * Initialize a circular send queue of `size` OutgoingMsgRec slots for
 * destination rank `rank`.  With SENDQ_LIST, next == -2 means "not on the
 * non-empty list".
 */
void initSendQ(PxshmSendQ *q,int size, int rank){
    q->data = (OutgoingMsgRec *)calloc(size, sizeof(OutgoingMsgRec));
    q->size = size;
    q->numEntries = 0;
    q->begin = 0;
    q->end = 0;
    q->rank = rank;
#if SENDQ_LIST
    q->next = -2;
#endif
}

/**
 * Append (msg,size,refcount) to the circular queue, doubling its capacity
 * when full.  On resize the live entries (which wrap from begin to end)
 * are compacted to the start of the new array.
 */
void pushSendQ(PxshmSendQ *q, char *msg, int size, int *refcount){
    if(q->numEntries == q->size){
        //need to resize
        OutgoingMsgRec *oldData = q->data;
        int newSize = q->size<<1;
        q->data = (OutgoingMsgRec *)calloc(newSize, sizeof(OutgoingMsgRec));
        //copy head to the beginning of the new array
        /* Full queue implies begin == end (the ring has wrapped). */
        CmiAssert(q->begin == q->end);
        CmiAssert(q->begin < q->size);
        memcpy(&(q->data[0]),&(oldData[q->begin]),sizeof(OutgoingMsgRec)*(q->size - q->begin));
        if(q->end!=0){
            memcpy(&(q->data[(q->size - q->begin)]),&(oldData[0]),sizeof(OutgoingMsgRec)*(q->end));
        }
        free(oldData);
        q->begin = 0;
        q->end = q->size;
        q->size = newSize;
    }
    OutgoingMsgRec *omg = &q->data[q->end];
    omg->size = size;
    omg->data = msg;
    omg->refcount = refcount;
    (q->end)++;
    if(q->end >= q->size){
        q->end -= q->size;  /* wrap around */
    }
    q->numEntries++;
}

/**
 * Remove and return the oldest entry of the circular queue, or NULL when
 * empty.  The returned pointer aliases the queue's storage: it is only
 * valid until the slot is overwritten by a later push.
 */
OutgoingMsgRec * popSendQ(PxshmSendQ *q){
    OutgoingMsgRec * ret;
    if(0 == q->numEntries){
        return NULL;
    }
    ret = &q->data[q->begin];
    (q->begin)++;
    if(q->begin >= q->size){
        q->begin -= q->size;  /* wrap around */
    }
    q->numEntries--;
    return ret;
}