int MPIDI_CH3_RecvRndv( MPIDI_VC_t * vc, MPIR_Request *rreq )
{
    int mpi_errno = MPI_SUCCESS;

    /* A rendezvous request-to-send (RTS) message has arrived.  We need
       to send a CTS message to the remote process. */
    MPIR_Request * cts_req;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_rndv_clr_to_send_t * cts_pkt = &upkt.rndv_clr_to_send;

    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,
                "rndv RTS in the request, sending rndv CTS");

    MPIDI_Pkt_init(cts_pkt, MPIDI_CH3_PKT_RNDV_CLR_TO_SEND);
    cts_pkt->sender_req_id = rreq->dev.sender_req_id;
    cts_pkt->receiver_req_id = rreq->handle;
    MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
    mpi_errno = MPIDI_CH3_iStartMsg(vc, cts_pkt, sizeof(*cts_pkt), &cts_req);
    MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**ch3|ctspkt");
    }
    if (cts_req != NULL) {
        /* FIXME: Ideally we could specify that a req not be returned.
           This would avoid our having to decrement the reference count
           on a req we don't want/need. */
        MPIR_Request_free(cts_req);
    }

 fn_fail:
    return mpi_errno;
}
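/* For orientation: a minimal, self-contained sketch of the three-step
 * rendezvous exchange that MPIDI_CH3_RecvRndv participates in.  This is an
 * illustration only; the enum, struct, and function names below are
 * hypothetical and are not part of the CH3 API.  The key point it shows is
 * that each control message carries both request handles
 * (sender_req_id / receiver_req_id) so each side can match a reply to its
 * outstanding request, exactly as the CTS packet above does.
 */
#include <stdio.h>

enum rndv_msg { RNDV_RTS, RNDV_CTS, RNDV_DATA };

struct rndv_pkt {
    enum rndv_msg type;
    int sender_req_id;    /* sender's request handle, echoed back in the CTS */
    int receiver_req_id;  /* receiver's request handle, carried by CTS/DATA */
};

/* Receiver side: an RTS arrived; answer with a CTS that pairs the two
 * request handles (the analogue of MPIDI_CH3_RecvRndv above). */
static struct rndv_pkt make_cts(const struct rndv_pkt *rts, int my_req_id)
{
    struct rndv_pkt cts = { RNDV_CTS, rts->sender_req_id, my_req_id };
    return cts;
}

int main(void)
{
    struct rndv_pkt rts = { RNDV_RTS, /*sender_req_id=*/42, /*receiver_req_id=*/0 };
    struct rndv_pkt cts = make_cts(&rts, /*my_req_id=*/7);
    /* The sender would now look up request 42 and stream the payload as
     * RNDV_DATA packets tagged with receiver_req_id 7. */
    printf("CTS: sender_req=%d receiver_req=%d\n",
           cts.sender_req_id, cts.receiver_req_id);
    return 0;
}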
int MPIDI_CH3U_Post_data_receive_unexpected(MPIR_Request * rreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_UNEXPECTED);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_UNEXPECTED);

    /* FIXME: to improve performance, allocate temporary buffer from a
       specialized buffer pool. */
    /* FIXME: to avoid memory exhaustion, integrate buffer pool management
       with flow control */
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"unexpected request allocated");

    rreq->dev.tmpbuf = MPL_malloc(rreq->dev.recv_data_sz, MPL_MEM_BUFFER);
    if (!rreq->dev.tmpbuf) {
        MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %d",
                             rreq->dev.recv_data_sz);
    }
    rreq->dev.tmpbuf_sz = rreq->dev.recv_data_sz;

    rreq->dev.iov[0].MPL_IOV_BUF = (MPL_IOV_BUF_CAST)rreq->dev.tmpbuf;
    rreq->dev.iov[0].MPL_IOV_LEN = rreq->dev.recv_data_sz;
    rreq->dev.iov_count = 1;
    rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_UnpackUEBufComplete;
    rreq->dev.recv_pending_count = 2;

 fn_fail:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_UNEXPECTED);
    return mpi_errno;
}
static int do_readv(MPID_Request *rreq, int pipe_fd, MPL_IOV iov[],
                    int *iov_offset, int *iov_count, int *complete)
{
    int mpi_errno = MPI_SUCCESS;
    ssize_t nread;

    /* Drain the pipe into the IOV currently loaded in the request. */
    nread = readv(pipe_fd, &rreq->dev.iov[rreq->dev.iov_offset],
                  rreq->dev.iov_count);
    MPIR_ERR_CHKANDJUMP2(nread < 0 && errno != EAGAIN, mpi_errno,
                         MPI_ERR_OTHER, "**read", "**readv %d %s",
                         errno, MPIU_Strerror(errno));

    if (nread < 0) {
        /* only EAGAIN reaches this point (the check above jumped for any
           other error): nothing available right now, try again later */
        goto fn_exit;
    }

    *complete = adjust_partially_xferred_iov(iov, iov_offset, iov_count, nread);
    if (*complete) {
        /* look for additional data to send and reload IOV if there is more */
        mpi_errno = check_req_complete(rreq->ch.vc, rreq, complete);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        if (*complete) {
            nread = close(pipe_fd);
            MPIR_ERR_CHKANDJUMP(nread < 0, mpi_errno, MPI_ERR_OTHER, "**close");
            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");
        }
    }

 fn_fail:
 fn_exit:
    return mpi_errno;
}
int MPIDI_CH3_PktPrint_RndvClrToSend( FILE *fp, MPIDI_CH3_Pkt_t *pkt )
{
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE," type ......... CLR_TO_SEND\n");
    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST,
                    " sender_reqid . 0x%08X\n",
                    pkt->rndv_clr_to_send.sender_req_id));
    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST,
                    " recvr_reqid .. 0x%08X\n",
                    pkt->rndv_clr_to_send.receiver_req_id));
    return MPI_SUCCESS;
}
static int do_vmsplice(MPID_Request *sreq, int pipe_fd, MPL_IOV iov[],
                       int *iov_offset, int *iov_count, int *complete)
{
    int mpi_errno = MPI_SUCCESS;
    ssize_t err;

#if 1
    err = vmsplice(pipe_fd, &iov[*iov_offset], *iov_count, SPLICE_F_NONBLOCK);
#else
    err = writev(pipe_fd, &iov[*iov_offset], *iov_count);
#endif

    if (err < 0) {
        if (errno == EAGAIN) goto fn_exit;
        MPIR_ERR_CHKANDJUMP2(errno != EAGAIN, mpi_errno, MPI_ERR_OTHER,
                             "**vmsplice", "**vmsplice %d %s",
                             errno, MPIU_Strerror(errno));
    }

    *complete = adjust_partially_xferred_iov(iov, iov_offset, iov_count, err);
    if (*complete) {
        /* look for additional data to send and reload IOV if there is more */
        mpi_errno = check_req_complete(sreq->ch.vc, sreq, complete);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        if (*complete) {
            err = close(pipe_fd);
            MPIR_ERR_CHKANDJUMP(err < 0, mpi_errno, MPI_ERR_OTHER, "**close");
            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");
        }
    }

 fn_fail:
 fn_exit:
    return mpi_errno;
}
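/* Both do_readv and do_vmsplice defer partial-transfer bookkeeping to
 * adjust_partially_xferred_iov().  A minimal, self-contained sketch of that
 * classic pattern follows: after a short readv/writev/vmsplice, consume the
 * fully transferred iovec entries and trim the first partially transferred
 * one in place.  This is an assumption about what the helper does (its body
 * is not shown here), written against plain POSIX struct iovec rather than
 * MPL_IOV.
 */
#include <stddef.h>
#include <sys/uio.h>

/* Returns 1 when every entry has been fully transferred, else 0. */
static int advance_iov(struct iovec iov[], int *offset, int *count,
                       size_t nbytes)
{
    while (*count > 0 && nbytes >= iov[*offset].iov_len) {
        nbytes -= iov[*offset].iov_len;   /* entry fully consumed */
        ++*offset;
        --*count;
    }
    if (*count > 0 && nbytes > 0) {
        /* partially consumed entry: shrink it in place */
        iov[*offset].iov_base = (char *) iov[*offset].iov_base + nbytes;
        iov[*offset].iov_len -= nbytes;
    }
    return *count == 0;
}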
int MPIDI_CH3_PktHandler_RndvSend( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt,
                                   intptr_t *buflen, MPIR_Request **rreqp )
{
    MPIDI_CH3_Pkt_rndv_send_t * rs_pkt = &pkt->rndv_send;
    int mpi_errno = MPI_SUCCESS;
    int complete;
    char *data_buf;
    intptr_t data_len;
    MPIR_Request *req;

    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"received rndv send (data) pkt");

    MPIR_Request_get_ptr(rs_pkt->receiver_req_id, req);

    data_len = ((*buflen - sizeof(MPIDI_CH3_Pkt_t) >= req->dev.recv_data_sz)
                ? req->dev.recv_data_sz : *buflen - sizeof(MPIDI_CH3_Pkt_t));
    data_buf = (char *)pkt + sizeof(MPIDI_CH3_Pkt_t);

    if (req->dev.recv_data_sz == 0) {
        *buflen = sizeof(MPIDI_CH3_Pkt_t);
        mpi_errno = MPID_Request_complete(req);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
        *rreqp = NULL;
    }
    else {
        mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len,
                                                  &complete);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER, "**ch3|postrecv",
                                 "**ch3|postrecv %s", "MPIDI_CH3_PKT_RNDV_SEND");
        }
        *buflen = sizeof(MPIDI_CH3_Pkt_t) + data_len;

        if (complete) {
            mpi_errno = MPID_Request_complete(req);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
            *rreqp = NULL;
        }
        else {
            *rreqp = req;
        }
    }

 fn_fail:
    return mpi_errno;
}
int MPIDI_CH3_PktPrint_EagerSyncSend( FILE *fp, MPIDI_CH3_Pkt_t *pkt )
{
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE," type ......... EAGER_SYNC_SEND\n");
    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST,
                    " sender_reqid . 0x%08X\n",
                    pkt->eager_sync_send.sender_req_id));
    MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE," context_id ... %d\n",
                  pkt->eager_sync_send.match.parts.context_id);
    MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE," tag .......... %d\n",
                  pkt->eager_sync_send.match.parts.tag);
    MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE," rank ......... %d\n",
                  pkt->eager_sync_send.match.parts.rank);
    MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE," data_sz ...... %d\n",
                  pkt->eager_sync_send.data_sz);
#ifdef MPID_USE_SEQUENCE_NUMBERS
    MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE," seqnum ....... %d\n",
                  pkt->eager_sync_send.seqnum);
#endif
    return MPI_SUCCESS;
}
int MPIR_Thread_CS_Finalize(void)
{
    int err;

    MPL_DBG_MSG(MPIR_DBG_INIT, TYPICAL, "Freeing global mutex and private storage");
#if MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__GLOBAL
    /* There is a single, global lock, held for the duration of an MPI call */
    MPID_Thread_mutex_destroy(&MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX, &err);
    MPIR_Assert(err == 0);
#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__POBJ
    /* MPICH_THREAD_GRANULARITY__POBJ: There are multiple locks,
     * one for each logical class (e.g., each type of object) */
    MPID_Thread_mutex_destroy(&MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX, &err);
    MPIR_Assert(err == 0);

    MPID_Thread_mutex_destroy(&MPIR_THREAD_POBJ_HANDLE_MUTEX, &err);
    MPIR_Assert(err == 0);

    MPID_Thread_mutex_destroy(&MPIR_THREAD_POBJ_MSGQ_MUTEX, &err);
    MPIR_Assert(err == 0);

    MPID_Thread_mutex_destroy(&MPIR_THREAD_POBJ_COMPLETION_MUTEX, &err);
    MPIR_Assert(err == 0);

    MPID_Thread_mutex_destroy(&MPIR_THREAD_POBJ_CTX_MUTEX, &err);
    MPIR_Assert(err == 0);

    MPID_Thread_mutex_destroy(&MPIR_THREAD_POBJ_PMI_MUTEX, &err);
    MPIR_Assert(err == 0);
#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__VCI
    MPID_Thread_mutex_destroy(&MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX, &err);
    MPIR_Assert(err == 0);

    MPID_Thread_mutex_destroy(&MPIR_THREAD_POBJ_HANDLE_MUTEX, &err);
    MPIR_Assert(err == 0);
#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__LOCKFREE
    /* Updates to shared data and access to shared services are handled
       without locks wherever possible. */
#error lock-free not yet implemented
#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__SINGLE
    /* No thread support, make all operations no-ops */
#else
#error Unrecognized thread granularity
#endif
    MPID_CS_finalize();

    MPID_THREADPRIV_KEY_DESTROY;

    return MPI_SUCCESS;
}
int MPIDI_CH3U_Receive_data_unexpected(MPIR_Request * rreq, void *buf,
                                       intptr_t *buflen, int *complete)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_UNEXPECTED);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_UNEXPECTED);

    /* FIXME: to improve performance, allocate temporary buffer from a
       specialized buffer pool. */
    /* FIXME: to avoid memory exhaustion, integrate buffer pool management
       with flow control */
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"unexpected request allocated");

    rreq->dev.tmpbuf = MPL_malloc(rreq->dev.recv_data_sz, MPL_MEM_BUFFER);
    if (!rreq->dev.tmpbuf) {
        MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %d",
                             rreq->dev.recv_data_sz);
    }
    rreq->dev.tmpbuf_sz = rreq->dev.recv_data_sz;

    /* if all of the data has already been received, copy it now, otherwise
       build an iov and let the channel copy it */
    if (rreq->dev.recv_data_sz <= *buflen) {
        MPIR_Memcpy(rreq->dev.tmpbuf, buf, rreq->dev.recv_data_sz);
        *buflen = rreq->dev.recv_data_sz;
        rreq->dev.recv_pending_count = 1;
        *complete = TRUE;
    }
    else {
        rreq->dev.iov[0].MPL_IOV_BUF =
            (MPL_IOV_BUF_CAST)((char *)rreq->dev.tmpbuf);
        rreq->dev.iov[0].MPL_IOV_LEN = rreq->dev.recv_data_sz;
        rreq->dev.iov_count = 1;
        rreq->dev.recv_pending_count = 2;
        *buflen = 0;
        *complete = FALSE;
    }

    if (MPIDI_Request_get_msg_type(rreq) == MPIDI_REQUEST_EAGER_MSG)
        MPIR_T_PVAR_LEVEL_INC(RECVQ, unexpected_recvq_buffer_size,
                              rreq->dev.tmpbuf_sz);

    rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_UnpackUEBufComplete;

 fn_fail:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_UNEXPECTED);
    return mpi_errno;
}
/* Send a zero-sized message using the eager synchronous protocol.  This is
   a temporary routine, as we may want to replace this with a counterpart
   to the Eager Short message */
int MPIDI_CH3_EagerSyncZero(MPIR_Request **sreq_p, int rank, int tag,
                            MPIR_Comm * comm, int context_offset )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_eager_sync_send_t * const es_pkt = &upkt.eager_sync_send;
    MPIDI_VC_t * vc;
    MPIR_Request *sreq = *sreq_p;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;   /* needed by the seqnum macros below, as in MPID_Send */
#endif

    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"sending zero length message");

    /* MT FIXME what are the two operations we are waiting for?  the send and
     * the sync response? */
    MPIR_cc_set(&sreq->cc, 2);
    MPIDI_Request_set_msg_type(sreq, MPIDI_REQUEST_EAGER_MSG);
    sreq->dev.OnDataAvail = 0;

    MPIDI_Pkt_init(es_pkt, MPIDI_CH3_PKT_EAGER_SYNC_SEND);
    es_pkt->match.parts.rank = comm->rank;
    es_pkt->match.parts.tag = tag;
    es_pkt->match.parts.context_id = comm->context_id + context_offset;
    es_pkt->sender_req_id = sreq->handle;
    es_pkt->data_sz = 0;

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);

    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(es_pkt, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);

    MPL_DBG_MSGPKT(vc,tag,es_pkt->match.parts.context_id,rank,(intptr_t)0,
                   "EagerSync0");
    MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
    mpi_errno = MPIDI_CH3_iSend(vc, sreq, es_pkt, sizeof(*es_pkt));
    MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_Request_free(sreq);
        *sreq_p = NULL;
        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|eagermsg");
    }
    /* --END ERROR HANDLING-- */

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
/* These routines handle any thread initialization that may be required */
static int thread_cs_init(void)
{
    int err;

#if MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__GLOBAL
    /* There is a single, global lock, held for the duration of an MPI call */
    MPID_Thread_mutex_create(&MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX, &err);
    MPIR_Assert(err == 0);
#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__POBJ
    /* MPICH_THREAD_GRANULARITY__POBJ: Multiple locks */
    MPID_Thread_mutex_create(&MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX, &err);
    MPIR_Assert(err == 0);

    MPID_Thread_mutex_create(&MPIR_THREAD_POBJ_HANDLE_MUTEX, &err);
    MPIR_Assert(err == 0);

    MPID_Thread_mutex_create(&MPIR_THREAD_POBJ_MSGQ_MUTEX, &err);
    MPIR_Assert(err == 0);

    MPID_Thread_mutex_create(&MPIR_THREAD_POBJ_COMPLETION_MUTEX, &err);
    MPIR_Assert(err == 0);

    MPID_Thread_mutex_create(&MPIR_THREAD_POBJ_CTX_MUTEX, &err);
    MPIR_Assert(err == 0);

    MPID_Thread_mutex_create(&MPIR_THREAD_POBJ_PMI_MUTEX, &err);
    MPIR_Assert(err == 0);
#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__VCI
    MPID_Thread_mutex_create(&MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX, &err);
    MPIR_Assert(err == 0);

    MPID_Thread_mutex_create(&MPIR_THREAD_POBJ_HANDLE_MUTEX, &err);
    MPIR_Assert(err == 0);
#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__LOCKFREE
    /* Updates to shared data and access to shared services are handled
       without locks wherever possible. */
#error lock-free not yet implemented
#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__SINGLE
    /* No thread support, make all operations no-ops */
#else
#error Unrecognized thread granularity
#endif

    MPID_THREADPRIV_KEY_CREATE;

    MPL_DBG_MSG(MPIR_DBG_INIT, TYPICAL, "Created global mutex and private storage");
    return MPI_SUCCESS;
}
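/* A minimal, self-contained sketch of the same compile-time lock-granularity
 * pattern using plain pthreads.  The GRANULARITY_* macros and lock names here
 * are hypothetical stand-ins for the MPICH_THREAD_GRANULARITY__* settings;
 * the point is that init and finalize must create/destroy exactly the same
 * set of mutexes for the selected granularity, as thread_cs_init() and
 * MPIR_Thread_CS_Finalize() do above.
 */
#include <assert.h>
#include <pthread.h>

#define GRANULARITY_GLOBAL 1
#define GRANULARITY_POBJ   2
#define GRANULARITY        GRANULARITY_GLOBAL   /* selected at build time */

static pthread_mutex_t global_lock;
#if GRANULARITY == GRANULARITY_POBJ
static pthread_mutex_t handle_lock, msgq_lock;
#endif

static void cs_init(void)
{
    int rc = pthread_mutex_init(&global_lock, NULL);
    assert(rc == 0); (void) rc;
#if GRANULARITY == GRANULARITY_POBJ
    rc = pthread_mutex_init(&handle_lock, NULL);
    assert(rc == 0);
    rc = pthread_mutex_init(&msgq_lock, NULL);
    assert(rc == 0);
#endif
}

static void cs_finalize(void)
{
    /* destroy in the mirror image of cs_init() */
    int rc;
#if GRANULARITY == GRANULARITY_POBJ
    rc = pthread_mutex_destroy(&msgq_lock);
    assert(rc == 0);
    rc = pthread_mutex_destroy(&handle_lock);
    assert(rc == 0);
#endif
    rc = pthread_mutex_destroy(&global_lock);
    assert(rc == 0); (void) rc;
}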
int MPIDI_CH3_Connection_terminate(MPIDI_VC_t * vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *vcch = &vc->ch;

    MPL_DBG_CONNSTATECHANGE(vc,vcch->conn,CONN_STATE_CLOSING);
    vcch->conn->state = CONN_STATE_CLOSING;
    MPL_DBG_MSG(MPIDI_CH3_DBG_DISCONNECT,TYPICAL,"Closing sock (Post_close)");
    mpi_errno = MPIDI_CH3I_Sock_post_close(vcch->sock);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
/*
 * These routines are called when a receive matches an eager sync send
 */
int MPIDI_CH3_EagerSyncAck( MPIDI_VC_t *vc, MPIR_Request *rreq )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_eager_sync_ack_t * const esa_pkt = &upkt.eager_sync_ack;
    MPIR_Request * esa_req;

    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"sending eager sync ack");
    MPIDI_Pkt_init(esa_pkt, MPIDI_CH3_PKT_EAGER_SYNC_ACK);
    esa_pkt->sender_req_id = rreq->dev.sender_req_id;
    MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
    mpi_errno = MPIDI_CH3_iStartMsg(vc, esa_pkt, sizeof(*esa_pkt), &esa_req);
    MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }
    if (esa_req != NULL) {
        MPIR_Request_free(esa_req);
    }

 fn_fail:
    return mpi_errno;
}
void MPIDI_CH3I_Progress_wakeup(void)
{
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TYPICAL,"progress_wakeup called");
    MPIDI_CH3I_Sock_wakeup(MPIDI_CH3I_sock_set);
}
int MPIDI_CH3_iSend(MPIDI_VC_t * vc, MPIR_Request * sreq, void * hdr,
                    intptr_t hdr_sz)
{
    int mpi_errno = MPI_SUCCESS;
    int (*reqFn)(MPIDI_VC_t *, MPIR_Request *, int *);
    MPIDI_CH3I_VC *vcch = &vc->ch;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_ISEND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3_ISEND);

    MPIR_Assert( hdr_sz <= sizeof(MPIDI_CH3_Pkt_t) );

    /* The sock channel uses a fixed length header, the size of which is
       the maximum of all possible packet headers */
    hdr_sz = sizeof(MPIDI_CH3_Pkt_t);
    MPL_DBG_STMT(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                 MPIDI_DBG_Print_packet((MPIDI_CH3_Pkt_t*)hdr));

    if (vcch->state == MPIDI_CH3I_VC_STATE_CONNECTED) /* MT */
    {
        /* Connection already formed.  If send queue is empty attempt to send
           data, queuing any unsent data. */
        if (MPIDI_CH3I_SendQ_empty(vcch)) /* MT */
        {
            size_t nb;
            int rc;

            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                        "send queue empty, attempting to write");
            MPL_DBG_PKT(vcch->conn,hdr,"isend");
            /* MT: need some signalling to lock down our right to use the
               channel, thus ensuring that the progress engine does not
               also try to write */
            rc = MPIDI_CH3I_Sock_write(vcch->sock, hdr, hdr_sz, &nb);
            if (rc == MPI_SUCCESS)
            {
                MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                              "wrote %ld bytes", (unsigned long) nb);

                if (nb == hdr_sz)
                {
                    MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                                  "write complete %" PRIdPTR " bytes, calling OnDataAvail fcn", nb);
                    reqFn = sreq->dev.OnDataAvail;
                    if (!reqFn) {
                        MPIR_Assert(MPIDI_Request_get_type(sreq)!=MPIDI_REQUEST_TYPE_GET_RESP);
                        mpi_errno = MPID_Request_complete(sreq);
                        if (mpi_errno != MPI_SUCCESS) {
                            MPIR_ERR_POP(mpi_errno);
                        }
                    }
                    else {
                        int complete;
                        mpi_errno = reqFn( vc, sreq, &complete );
                        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
                        if (!complete) {
                            MPIDI_CH3I_SendQ_enqueue_head(vcch, sreq);
                            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                                (MPL_DBG_FDEST,
                                 "posting writev, vc=0x%p, sreq=0x%08x",
                                 vc, sreq->handle));
                            vcch->conn->send_active = sreq;
                            mpi_errno = MPIDI_CH3I_Sock_post_writev(
                                vcch->conn->sock, sreq->dev.iov,
                                sreq->dev.iov_count, NULL);
                            /* --BEGIN ERROR HANDLING-- */
                            if (mpi_errno != MPI_SUCCESS)
                            {
                                mpi_errno = MPIR_Err_create_code(mpi_errno,
                                    MPIR_ERR_FATAL, FCNAME, __LINE__,
                                    MPI_ERR_OTHER, "**ch3|sock|postwrite",
                                    "ch3|sock|postwrite %p %p %p",
                                    sreq, vcch->conn, vc);
                            }
                            /* --END ERROR HANDLING-- */
                        }
                    }
                }
                else
                {
                    MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                                  "partial write of %" PRIdPTR " bytes, request enqueued at head", nb);
                    update_request(sreq, hdr, hdr_sz, nb);
                    MPIDI_CH3I_SendQ_enqueue_head(vcch, sreq);
                    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                        (MPL_DBG_FDEST,"posting write, vc=0x%p, sreq=0x%08x",
                         vc, sreq->handle));
                    vcch->conn->send_active = sreq;
                    mpi_errno = MPIDI_CH3I_Sock_post_write(vcch->conn->sock,
                        sreq->dev.iov[0].MPL_IOV_BUF,
                        sreq->dev.iov[0].MPL_IOV_LEN,
                        sreq->dev.iov[0].MPL_IOV_LEN, NULL);
                    /* --BEGIN ERROR HANDLING-- */
                    if (mpi_errno != MPI_SUCCESS)
                    {
                        mpi_errno = MPIR_Err_create_code(mpi_errno,
                            MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER,
                            "**ch3|sock|postwrite",
                            "ch3|sock|postwrite %p %p %p",
                            sreq, vcch->conn, vc);
                    }
                    /* --END ERROR HANDLING-- */
                }
            }
            /* --BEGIN ERROR HANDLING-- */
            else if (MPIR_ERR_GET_CLASS(rc) == MPIDI_CH3I_SOCK_ERR_NOMEM)
            {
                MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL,TYPICAL,
                            "MPIDI_CH3I_Sock_write failed, out of memory");
                sreq->status.MPI_ERROR = MPIR_ERR_MEMALLOCFAILED;
            }
            else
            {
                MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL,TYPICAL,
                              "MPIDI_CH3I_Sock_write failed, rc=%d", rc);
                /* Connection just failed.  Mark the request complete and
                   return an error. */
                MPL_DBG_VCCHSTATECHANGE(vc,VC_STATE_FAILED);
                /* FIXME: Shouldn't the vc->state also change? */
                vcch->state = MPIDI_CH3I_VC_STATE_FAILED;
                sreq->status.MPI_ERROR = MPIR_Err_create_code( rc,
                    MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_INTERN,
                    "**ch3|sock|writefailed", "**ch3|sock|writefailed %d", rc );
                /* MT - CH3U_Request_complete() performs write barrier */
                MPID_Request_complete(sreq);
                /* Make sure that the caller sees this error */
                mpi_errno = sreq->status.MPI_ERROR;
            }
            /* --END ERROR HANDLING-- */
        }
        else
        {
            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                        "send queue not empty, enqueuing");
            update_request(sreq, hdr, hdr_sz, 0);
            MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
        }
    }
    else if (vcch->state == MPIDI_CH3I_VC_STATE_CONNECTING) /* MT */
    {
        /* queuing the data so it can be sent later. */
        MPL_DBG_VCUSE(vc,"connecting.  enqueuing request");
        update_request(sreq, hdr, hdr_sz, 0);
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
    }
    else if (vcch->state == MPIDI_CH3I_VC_STATE_UNCONNECTED) /* MT */
    {
        /* Form a new connection, queuing the data so it can be sent later. */
        MPL_DBG_VCUSE(vc,"unconnected.  enqueuing request");
        update_request(sreq, hdr, hdr_sz, 0);
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
        mpi_errno = MPIDI_CH3I_VC_post_connect(vc);
        if (mpi_errno) {
            MPIR_ERR_POP(mpi_errno);
        }
    }
    else if (vcch->state != MPIDI_CH3I_VC_STATE_FAILED)
    {
        /* Unable to send data at the moment, so queue it for later */
        MPL_DBG_VCUSE(vc,"still connecting. Enqueuing request");
        update_request(sreq, hdr, hdr_sz, 0);
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
    }
    /* --BEGIN ERROR HANDLING-- */
    else
    {
        /* Connection failed.  Mark the request complete and return an
           error. */
        /* TODO: Create an appropriate error message */
        sreq->status.MPI_ERROR = MPI_ERR_INTERN;
        /* MT - CH3U_Request_complete() performs write barrier */
        MPID_Request_complete(sreq);
    }
    /* --END ERROR HANDLING-- */

 fn_fail:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3_ISEND);
    return mpi_errno;
}
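/* The CONNECTED/empty-queue path above is the standard nonblocking-socket
 * send discipline: attempt the write immediately, and on a short write queue
 * the remainder and let the progress engine finish it.  A minimal,
 * self-contained sketch of that discipline over a plain POSIX socket follows;
 * the pending-send struct and names are hypothetical, not CH3's queue types,
 * and for brevity it assumes len <= sizeof p->buf.
 */
#include <errno.h>
#include <string.h>
#include <unistd.h>

struct pending_send {
    char buf[256];   /* unsent tail of the message */
    size_t len;      /* bytes still to send */
};

/* Try to send immediately; on a partial write stash the rest in *p and
 * return 1 so the caller knows the progress loop must finish the job.
 * Returns 0 on a complete write, -1 on a hard error. */
static int try_send(int fd, const char *data, size_t len, struct pending_send *p)
{
    ssize_t nb = write(fd, data, len);
    if (nb < 0) {
        if (errno != EAGAIN && errno != EWOULDBLOCK)
            return -1;          /* hard error */
        nb = 0;                 /* wrote nothing; queue everything */
    }
    if ((size_t) nb == len)
        return 0;               /* complete, nothing queued */
    memcpy(p->buf, data + nb, len - nb);
    p->len = len - nb;
    return 1;                   /* partial: progress engine writes p->buf */
}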
void MPII_Segment_manipulate(struct MPIR_Segment *segp, MPI_Aint first, MPI_Aint * lastp,
                             int (*contigfn) (MPI_Aint * blocks_p,
                                              MPI_Datatype el_type,
                                              MPI_Aint rel_off, void *bufp,
                                              void *v_paramp),
                             int (*vectorfn) (MPI_Aint * blocks_p,
                                              MPI_Aint count, MPI_Aint blklen,
                                              MPI_Aint stride,
                                              MPI_Datatype el_type,
                                              MPI_Aint rel_off, void *bufp,
                                              void *v_paramp),
                             int (*blkidxfn) (MPI_Aint * blocks_p,
                                              MPI_Aint count, MPI_Aint blklen,
                                              MPI_Aint * offsetarray,
                                              MPI_Datatype el_type,
                                              MPI_Aint rel_off, void *bufp,
                                              void *v_paramp),
                             int (*indexfn) (MPI_Aint * blocks_p,
                                             MPI_Aint count,
                                             MPI_Aint * blockarray,
                                             MPI_Aint * offsetarray,
                                             MPI_Datatype el_type,
                                             MPI_Aint rel_off, void *bufp,
                                             void *v_paramp),
                             MPI_Aint(*sizefn) (MPI_Datatype el_type),
                             void *pieceparams)
{
    /* these four are the "local values": cur_sp, valid_sp, last, stream_off */
    int cur_sp, valid_sp;
    MPI_Aint last, stream_off;
    struct MPII_Dataloop_stackelm *cur_elmp;
    enum { PF_NULL, PF_CONTIG, PF_VECTOR, PF_BLOCKINDEXED, PF_INDEXED } piecefn_type = PF_NULL;

    SEGMENT_LOAD_LOCAL_VALUES;

    if (first == *lastp) {
        /* nothing to do */
        MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                        (MPL_DBG_FDEST,
                         "dloop_segment_manipulate: warning: first == last ("
                         MPI_AINT_FMT_DEC_SPEC ")\n", first));
        return;
    }

    /* first we ensure that stream_off and first are in the same spot */
    if (first != stream_off) {
#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
        MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                        (MPL_DBG_FDEST,
                         "first=" MPI_AINT_FMT_DEC_SPEC "; stream_off="
                         MPI_AINT_FMT_DEC_SPEC "; resetting.\n",
                         first, stream_off));
#endif

        if (first < stream_off) {
            SEGMENT_RESET_VALUES;
            stream_off = 0;
        }

        if (first != stream_off) {
            MPI_Aint tmp_last = first;

            /* use manipulate function with a NULL piecefn to advance
             * stream offset */
            MPII_Segment_manipulate(segp, stream_off, &tmp_last,
                                    NULL,       /* contig fn */
                                    NULL,       /* vector fn */
                                    NULL,       /* blkidx fn */
                                    NULL,       /* index fn */
                                    sizefn, NULL);

            /* --BEGIN ERROR HANDLING-- */
            /* verify that we're in the right location */
            MPIR_Assert(tmp_last == first);
            /* --END ERROR HANDLING-- */
        }

        SEGMENT_LOAD_LOCAL_VALUES;

#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
        MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                        (MPL_DBG_FDEST,
                         "done repositioning stream_off; first="
                         MPI_AINT_FMT_DEC_SPEC ", stream_off="
                         MPI_AINT_FMT_DEC_SPEC ", last="
                         MPI_AINT_FMT_DEC_SPEC "\n",
                         first, stream_off, last));
#endif
    }

    for (;;) {
#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
#if 0
        MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                        (MPL_DBG_FDEST, "looptop; cur_sp=%d, cur_elmp=%x\n",
                         cur_sp, (unsigned) cur_elmp));
#endif
#endif

        if (cur_elmp->loop_p->kind & MPII_DATALOOP_FINAL_MASK) {
            int piecefn_indicated_exit = -1;
            MPI_Aint myblocks, local_el_size, stream_el_size;
            MPI_Datatype el_type;

            /* structs are never finals (leaves) */
            MPIR_Assert((cur_elmp->loop_p->kind & MPII_DATALOOP_KIND_MASK) !=
                        MPII_DATALOOP_KIND_STRUCT);

            /* pop immediately on zero count */
            if (cur_elmp->curcount == 0)
                SEGMENT_POP_AND_MAYBE_EXIT;

            /* size on this system of the int, double, etc. that is
             * the elementary type.
             */
            local_el_size = cur_elmp->loop_p->el_size;
            el_type = cur_elmp->loop_p->el_type;
            stream_el_size = (sizefn) ? sizefn(el_type) : local_el_size;

            /* calculate number of elem. types to work on and function to
             * use.  default is to use the contig piecefn (if there is one).
             */
            myblocks = cur_elmp->curblock;
            piecefn_type = (contigfn ? PF_CONTIG : PF_NULL);

            /* check for opportunities to use other piecefns */
            switch (cur_elmp->loop_p->kind & MPII_DATALOOP_KIND_MASK) {
                case MPII_DATALOOP_KIND_CONTIG:
                    break;
                case MPII_DATALOOP_KIND_BLOCKINDEXED:
                    /* only use blkidx piecefn if at start of blkidx type */
                    if (blkidxfn &&
                        cur_elmp->orig_block == cur_elmp->curblock &&
                        cur_elmp->orig_count == cur_elmp->curcount) {
                        /* TODO: RELAX CONSTRAINTS */
                        myblocks = cur_elmp->curblock * cur_elmp->curcount;
                        piecefn_type = PF_BLOCKINDEXED;
                    }
                    break;
                case MPII_DATALOOP_KIND_INDEXED:
                    /* only use index piecefn if at start of the index type.
                     *   count test checks that we're on first block.
                     *   block test checks that we haven't made progress on
                     *   first block.
                     */
                    if (indexfn && cur_elmp->orig_count == cur_elmp->curcount &&
                        cur_elmp->curblock == STACKELM_INDEXED_BLOCKSIZE(cur_elmp, 0)) {
                        /* TODO: RELAX CONSTRAINT ON COUNT? */
                        myblocks = cur_elmp->loop_p->loop_params.i_t.total_blocks;
                        piecefn_type = PF_INDEXED;
                    }
                    break;
                case MPII_DATALOOP_KIND_VECTOR:
                    /* only use the vector piecefn if at the start of a
                     * contiguous block.
                     */
                    if (vectorfn && cur_elmp->orig_block == cur_elmp->curblock) {
                        myblocks = cur_elmp->curblock * cur_elmp->curcount;
                        piecefn_type = PF_VECTOR;
                    }
                    break;
                default:
                    /* --BEGIN ERROR HANDLING-- */
                    MPIR_Assert(0);
                    break;
                    /* --END ERROR HANDLING-- */
            }

#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
            MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                            (MPL_DBG_FDEST,
                             "\thit leaf; cur_sp=%d, elmp=%x, piece_sz="
                             MPI_AINT_FMT_DEC_SPEC "\n",
                             cur_sp, (unsigned) cur_elmp,
                             myblocks * local_el_size));
#endif

            /* enforce the last parameter if necessary by reducing myblocks */
            if (last != MPIR_SEGMENT_IGNORE_LAST &&
                (stream_off + (myblocks * stream_el_size) > last)) {
                myblocks = ((last - stream_off) / stream_el_size);
#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
                MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                                (MPL_DBG_FDEST,
                                 "\tpartial block count=" MPI_AINT_FMT_DEC_SPEC
                                 " (" MPI_AINT_FMT_DEC_SPEC " bytes)\n",
                                 myblocks, myblocks * stream_el_size));
#endif
                if (myblocks == 0) {
                    SEGMENT_SAVE_LOCAL_VALUES;
                    return;
                }
            }

            /* call piecefn to perform data manipulation */
            switch (piecefn_type) {
                case PF_NULL:
                    piecefn_indicated_exit = 0;
#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
                    MPL_DBG_MSG(MPIR_DBG_DATATYPE, VERBOSE,
                                "\tNULL piecefn for this piece\n");
#endif
                    break;
                case PF_CONTIG:
                    MPIR_Assert(myblocks <= cur_elmp->curblock);
                    piecefn_indicated_exit =
                        contigfn(&myblocks, el_type,
                                 cur_elmp->curoffset,   /* relative to segp->ptr */
                                 segp->ptr,     /* start of buffer (from segment) */
                                 pieceparams);
                    break;
                case PF_VECTOR:
                    piecefn_indicated_exit =
                        vectorfn(&myblocks, cur_elmp->curcount,
                                 cur_elmp->orig_block,
                                 cur_elmp->loop_p->loop_params.v_t.stride,
                                 el_type, cur_elmp->curoffset, segp->ptr,
                                 pieceparams);
                    break;
                case PF_BLOCKINDEXED:
                    piecefn_indicated_exit =
                        blkidxfn(&myblocks, cur_elmp->curcount,
                                 cur_elmp->orig_block,
                                 cur_elmp->loop_p->loop_params.bi_t.offset_array,
                                 el_type,
                                 cur_elmp->orig_offset,  /* blkidxfn adds offset */
                                 segp->ptr, pieceparams);
                    break;
                case PF_INDEXED:
                    piecefn_indicated_exit =
                        indexfn(&myblocks, cur_elmp->curcount,
                                cur_elmp->loop_p->loop_params.i_t.blocksize_array,
                                cur_elmp->loop_p->loop_params.i_t.offset_array,
                                el_type,
                                cur_elmp->orig_offset,  /* indexfn adds offset value */
                                segp->ptr, pieceparams);
                    break;
            }

            /* update local values based on piecefn returns (myblocks and
             * piecefn_indicated_exit)
             */
            MPIR_Assert(piecefn_indicated_exit >= 0);
            MPIR_Assert(myblocks >= 0);
            stream_off += myblocks * stream_el_size;

            /* myblocks of 0 or less than cur_elmp->curblock indicates
             * that we should stop processing and return.
             */
            if (myblocks == 0) {
                SEGMENT_SAVE_LOCAL_VALUES;
                return;
            } else if (myblocks < (MPI_Aint) (cur_elmp->curblock)) {
                cur_elmp->curoffset += myblocks * local_el_size;
                cur_elmp->curblock -= myblocks;

                SEGMENT_SAVE_LOCAL_VALUES;
                return;
            } else {    /* myblocks >= cur_elmp->curblock */
                MPI_Aint count_index = 0;

                /* this assumes we're either *just* processing the last parts
                 * of the current block, or we're processing as many blocks as
                 * we like starting at the beginning of one.
                 */

                switch (cur_elmp->loop_p->kind & MPII_DATALOOP_KIND_MASK) {
                    case MPII_DATALOOP_KIND_INDEXED:
                        while (myblocks > 0 &&
                               myblocks >= (MPI_Aint) (cur_elmp->curblock)) {
                            myblocks -= (MPI_Aint) (cur_elmp->curblock);
                            cur_elmp->curcount--;
                            MPIR_Assert(cur_elmp->curcount >= 0);

                            count_index = cur_elmp->orig_count - cur_elmp->curcount;
                            cur_elmp->curblock =
                                STACKELM_INDEXED_BLOCKSIZE(cur_elmp, count_index);
                        }

                        if (cur_elmp->curcount == 0) {
                            /* don't bother to fill in values; we're popping
                             * anyway */
                            MPIR_Assert(myblocks == 0);
                            SEGMENT_POP_AND_MAYBE_EXIT;
                        } else {
                            cur_elmp->orig_block = cur_elmp->curblock;
                            cur_elmp->curoffset = cur_elmp->orig_offset +
                                STACKELM_INDEXED_OFFSET(cur_elmp, count_index);

                            cur_elmp->curblock -= myblocks;
                            cur_elmp->curoffset += myblocks * local_el_size;
                        }
                        break;
                    case MPII_DATALOOP_KIND_VECTOR:
                        /* this math relies on assertions at top of code
                         * block */
                        cur_elmp->curcount -=
                            myblocks / (MPI_Aint) (cur_elmp->curblock);
                        if (cur_elmp->curcount == 0) {
                            MPIR_Assert(myblocks %
                                        ((MPI_Aint) (cur_elmp->curblock)) == 0);
                            SEGMENT_POP_AND_MAYBE_EXIT;
                        } else {
                            /* this math relies on assertions at top of code
                             * block */
                            cur_elmp->curblock = cur_elmp->orig_block -
                                (myblocks % (MPI_Aint) (cur_elmp->curblock));

                            /* new offset = original offset +
                             *              stride * whole blocks +
                             *              leftover bytes
                             */
                            cur_elmp->curoffset = cur_elmp->orig_offset +
                                (((MPI_Aint) (cur_elmp->orig_count - cur_elmp->curcount)) *
                                 cur_elmp->loop_p->loop_params.v_t.stride) +
                                (((MPI_Aint) (cur_elmp->orig_block - cur_elmp->curblock)) *
                                 local_el_size);
                        }
                        break;
                    case MPII_DATALOOP_KIND_CONTIG:
                        /* contigs that reach this point have always been
                         * completely processed
                         */
                        MPIR_Assert(myblocks == (MPI_Aint) (cur_elmp->curblock) &&
                                    cur_elmp->curcount == 1);
                        SEGMENT_POP_AND_MAYBE_EXIT;
                        break;
                    case MPII_DATALOOP_KIND_BLOCKINDEXED:
                        while (myblocks > 0 &&
                               myblocks >= (MPI_Aint) (cur_elmp->curblock)) {
                            myblocks -= (MPI_Aint) (cur_elmp->curblock);
                            cur_elmp->curcount--;
                            MPIR_Assert(cur_elmp->curcount >= 0);

                            count_index = cur_elmp->orig_count - cur_elmp->curcount;
                            cur_elmp->curblock = cur_elmp->orig_block;
                        }
                        if (cur_elmp->curcount == 0) {
                            /* popping */
                            MPIR_Assert(myblocks == 0);
                            SEGMENT_POP_AND_MAYBE_EXIT;
                        } else {
                            /* cur_elmp->orig_block = cur_elmp->curblock; */
                            cur_elmp->curoffset = cur_elmp->orig_offset +
                                STACKELM_BLOCKINDEXED_OFFSET(cur_elmp, count_index);
                            cur_elmp->curblock -= myblocks;
                            cur_elmp->curoffset += myblocks * local_el_size;
                        }
                        break;
                }
            }

            if (piecefn_indicated_exit) {
                /* piece function indicated that we should quit processing */
                SEGMENT_SAVE_LOCAL_VALUES;
                return;
            }
        } /* end of if leaf */
        else if (cur_elmp->curblock == 0) {
#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
            MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                            (MPL_DBG_FDEST, "\thit end of block; elmp=%x [%d]\n",
                             (unsigned) cur_elmp, cur_sp));
#endif
            cur_elmp->curcount--;

            /* new block.  for indexed and struct reset orig_block.
             * reset curblock for all types
             */
            switch (cur_elmp->loop_p->kind & MPII_DATALOOP_KIND_MASK) {
                case MPII_DATALOOP_KIND_CONTIG:
                case MPII_DATALOOP_KIND_VECTOR:
                case MPII_DATALOOP_KIND_BLOCKINDEXED:
                    break;
                case MPII_DATALOOP_KIND_INDEXED:
                    cur_elmp->orig_block =
                        STACKELM_INDEXED_BLOCKSIZE(cur_elmp,
                                                   cur_elmp->curcount ?
                                                   cur_elmp->orig_count -
                                                   cur_elmp->curcount : 0);
                    break;
                case MPII_DATALOOP_KIND_STRUCT:
                    cur_elmp->orig_block =
                        STACKELM_STRUCT_BLOCKSIZE(cur_elmp,
                                                  cur_elmp->curcount ?
                                                  cur_elmp->orig_count -
                                                  cur_elmp->curcount : 0);
                    break;
                default:
                    /* --BEGIN ERROR HANDLING-- */
                    MPIR_Assert(0);
                    break;
                    /* --END ERROR HANDLING-- */
            }
            cur_elmp->curblock = cur_elmp->orig_block;

            if (cur_elmp->curcount == 0) {
#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
                MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                                (MPL_DBG_FDEST,
                                 "\talso hit end of count; elmp=%x [%d]\n",
                                 (unsigned) cur_elmp, cur_sp));
#endif
                SEGMENT_POP_AND_MAYBE_EXIT;
            }
        } else {        /* push the stackelm */
            MPII_Dataloop_stackelm *next_elmp;
            MPI_Aint count_index, block_index;

            count_index = cur_elmp->orig_count - cur_elmp->curcount;
            block_index = cur_elmp->orig_block - cur_elmp->curblock;

            /* reload the next stackelm if necessary */
            next_elmp = &(segp->stackelm[cur_sp + 1]);
            if (cur_elmp->may_require_reloading) {
                MPIR_Dataloop *load_dlp = NULL;
                switch (cur_elmp->loop_p->kind & MPII_DATALOOP_KIND_MASK) {
                    case MPII_DATALOOP_KIND_CONTIG:
                    case MPII_DATALOOP_KIND_VECTOR:
                    case MPII_DATALOOP_KIND_BLOCKINDEXED:
                    case MPII_DATALOOP_KIND_INDEXED:
                        load_dlp = cur_elmp->loop_p->loop_params.cm_t.dataloop;
                        break;
                    case MPII_DATALOOP_KIND_STRUCT:
                        load_dlp = STACKELM_STRUCT_DATALOOP(cur_elmp, count_index);
                        break;
                    default:
                        /* --BEGIN ERROR HANDLING-- */
                        MPIR_Assert(0);
                        break;
                        /* --END ERROR HANDLING-- */
                }

#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
                MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                                (MPL_DBG_FDEST, "\tloading dlp=%x, elmp=%x [%d]\n",
                                 (unsigned) load_dlp, (unsigned) next_elmp,
                                 cur_sp + 1));
#endif

                MPII_Dataloop_stackelm_load(next_elmp, load_dlp, 1);
            }

#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
            MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                            (MPL_DBG_FDEST,
                             "\tpushing type, elmp=%x [%d], count=%d, block=%d\n",
                             (unsigned) cur_elmp, cur_sp, count_index, block_index));
#endif
            /* set orig_offset and all cur values for new stackelm.
             * this is done in two steps: first set orig_offset based on
             * current stackelm, then set cur values based on new stackelm.
             */
            switch (cur_elmp->loop_p->kind & MPII_DATALOOP_KIND_MASK) {
                case MPII_DATALOOP_KIND_CONTIG:
                    next_elmp->orig_offset = cur_elmp->curoffset +
                        (MPI_Aint) block_index * cur_elmp->loop_p->el_extent;
                    break;
                case MPII_DATALOOP_KIND_VECTOR:
                    /* note: stride is in bytes */
                    next_elmp->orig_offset = cur_elmp->orig_offset +
                        (MPI_Aint) count_index * cur_elmp->loop_p->loop_params.v_t.stride +
                        (MPI_Aint) block_index * cur_elmp->loop_p->el_extent;
                    break;
                case MPII_DATALOOP_KIND_BLOCKINDEXED:
                    next_elmp->orig_offset = cur_elmp->orig_offset +
                        (MPI_Aint) block_index * cur_elmp->loop_p->el_extent +
                        STACKELM_BLOCKINDEXED_OFFSET(cur_elmp, count_index);
                    break;
                case MPII_DATALOOP_KIND_INDEXED:
                    next_elmp->orig_offset = cur_elmp->orig_offset +
                        (MPI_Aint) block_index * cur_elmp->loop_p->el_extent +
                        STACKELM_INDEXED_OFFSET(cur_elmp, count_index);
                    break;
                case MPII_DATALOOP_KIND_STRUCT:
                    next_elmp->orig_offset = cur_elmp->orig_offset +
                        (MPI_Aint) block_index * STACKELM_STRUCT_EL_EXTENT(cur_elmp, count_index) +
                        STACKELM_STRUCT_OFFSET(cur_elmp, count_index);
                    break;
                default:
                    /* --BEGIN ERROR HANDLING-- */
                    MPIR_Assert(0);
                    break;
                    /* --END ERROR HANDLING-- */
            }

#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
            MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                            (MPL_DBG_FDEST,
                             "\tstep 1: next orig_offset = " MPI_AINT_FMT_DEC_SPEC
                             " (0x" MPI_AINT_FMT_HEX_SPEC ")\n",
                             next_elmp->orig_offset, next_elmp->orig_offset));
#endif

            switch (next_elmp->loop_p->kind & MPII_DATALOOP_KIND_MASK) {
                case MPII_DATALOOP_KIND_CONTIG:
                case MPII_DATALOOP_KIND_VECTOR:
                    next_elmp->curcount = next_elmp->orig_count;
                    next_elmp->curblock = next_elmp->orig_block;
                    next_elmp->curoffset = next_elmp->orig_offset;
                    break;
                case MPII_DATALOOP_KIND_BLOCKINDEXED:
                    next_elmp->curcount = next_elmp->orig_count;
                    next_elmp->curblock = next_elmp->orig_block;
                    next_elmp->curoffset = next_elmp->orig_offset +
                        STACKELM_BLOCKINDEXED_OFFSET(next_elmp, 0);
                    break;
                case MPII_DATALOOP_KIND_INDEXED:
                    next_elmp->curcount = next_elmp->orig_count;
                    next_elmp->curblock = STACKELM_INDEXED_BLOCKSIZE(next_elmp, 0);
                    next_elmp->curoffset = next_elmp->orig_offset +
                        STACKELM_INDEXED_OFFSET(next_elmp, 0);
                    break;
                case MPII_DATALOOP_KIND_STRUCT:
                    next_elmp->curcount = next_elmp->orig_count;
                    next_elmp->curblock = STACKELM_STRUCT_BLOCKSIZE(next_elmp, 0);
                    next_elmp->curoffset = next_elmp->orig_offset +
                        STACKELM_STRUCT_OFFSET(next_elmp, 0);
                    break;
                default:
                    /* --BEGIN ERROR HANDLING-- */
                    MPIR_Assert(0);
                    break;
                    /* --END ERROR HANDLING-- */
            }

#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
            MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                            (MPL_DBG_FDEST,
                             "\tstep 2: next curoffset = " MPI_AINT_FMT_DEC_SPEC
                             " (0x" MPI_AINT_FMT_HEX_SPEC ")\n",
                             next_elmp->curoffset, next_elmp->curoffset));
#endif

            cur_elmp->curblock--;
            SEGMENT_PUSH;
        }       /* end of else push the stackelm */
    }   /* end of for (;;) */

#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
    MPL_DBG_MSG(MPIR_DBG_DATATYPE, VERBOSE, "hit end of datatype\n");
#endif

    SEGMENT_SAVE_LOCAL_VALUES;
    return;
}
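/* The piecefn mechanism above boils down to: walk a (possibly nested) type
 * description and hand each contiguous run to a callback, stopping early if
 * the callback asks to or takes less than offered.  A minimal, self-contained
 * sketch of that idea for a single strided-vector layout follows; the names
 * are hypothetical, and the real code additionally maintains a stack of
 * elements so dataloops can nest.
 */
#include <stddef.h>

/* Callback: handle one contiguous block at byte offset rel_off; may shrink
 * *bytes_p to take less; return nonzero to stop the traversal early. */
typedef int (*contig_cb) (size_t *bytes_p, size_t rel_off, void *buf, void *param);

static void walk_vector(void *buf, size_t count, size_t blklen, size_t stride,
                        contig_cb cb, void *param)
{
    for (size_t i = 0; i < count; i++) {
        size_t nbytes = blklen;
        if (cb(&nbytes, i * stride, buf, param) || nbytes < blklen)
            return;     /* callback asked to stop or consumed a partial block */
    }
}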
int MPID_Cancel_send(MPIR_Request * sreq)
{
    MPIDI_VC_t * vc;
    int proto;
    int flag;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_CANCEL_SEND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_CANCEL_SEND);

    MPIR_Assert(sreq->kind == MPIR_REQUEST_KIND__SEND);

    MPIDI_Request_cancel_pending(sreq, &flag);
    if (flag) {
        goto fn_exit;
    }

    /*
     * FIXME: user requests returned by MPI_Ibsend() have a NULL comm pointer
     * and no pointer to the underlying communication request.  For now, we
     * simply fail to cancel the request.  In the future, we should add a new
     * request kind to indicate that the request is a BSEND.  Then we can
     * properly cancel the request, much in the way we do persistent requests.
     */
    if (sreq->comm == NULL) {
        goto fn_exit;
    }

    MPIDI_Comm_get_vc_set_active(sreq->comm, sreq->dev.match.parts.rank, &vc);

    proto = MPIDI_Request_get_msg_type(sreq);

    if (proto == MPIDI_REQUEST_SELF_MSG) {
        MPIR_Request * rreq;

        MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,
                    "attempting to cancel message sent to self");

        MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_MSGQ_MUTEX);
        rreq = MPIDI_CH3U_Recvq_FDU(sreq->handle, &sreq->dev.match);
        MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_MSGQ_MUTEX);
        if (rreq) {
            MPIR_Assert(rreq->dev.partner_request == sreq);

            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
                "send-to-self cancellation successful, sreq=0x%08x, rreq=0x%08x",
                sreq->handle, rreq->handle));

            /* Pull the message out of the unexpected queue since it's
             * being cancelled.  The below request release drops one
             * reference.  We explicitly drop a second reference,
             * because the receive request will never be visible to
             * the user. */
            MPIR_Request_free(rreq);
            MPIR_Request_free(rreq);

            MPIR_STATUS_SET_CANCEL_BIT(sreq->status, TRUE);
            mpi_errno = MPID_Request_complete(sreq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
        }
        else {
            MPIR_STATUS_SET_CANCEL_BIT(sreq->status, FALSE);
            /* rreq is NULL in this branch, so only the send request can be
               reported */
            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
                "send-to-self cancellation failed, sreq=0x%08x",
                sreq->handle));
        }

        goto fn_exit;
    }

    /* If the message went over a netmod and it provides a cancel_send
       function, call it here. */
#ifdef ENABLE_COMM_OVERRIDES
    if (vc->comm_ops && vc->comm_ops->cancel_send) {
        mpi_errno = vc->comm_ops->cancel_send(vc, sreq);
        goto fn_exit;
    }
#endif

    /* Check to see if the send is still in the send queue.  If so, remove it,
       mark the request as cancelled and complete, and release the device's
       reference to the request object. */
    {
        int cancelled;

        if (proto == MPIDI_REQUEST_RNDV_MSG) {
            MPIR_Request * rts_sreq;
            /* The cancellation of the RTS request needs to be atomic through
               the destruction of the RTS request to avoid conflict with
               release of the RTS request if the CTS is received (see handling
               of a rendezvous CTS packet in MPIDI_CH3U_Handle_recv_pkt()).
               MPIDI_Request_fetch_and_clear_rts_sreq() is used to guarantee
               that atomicity. */
            MPIDI_Request_fetch_and_clear_rts_sreq(sreq, &rts_sreq);
            if (rts_sreq != NULL) {
                cancelled = FALSE;

                /* since we attempted to cancel a RTS request, then we are
                   responsible for releasing that request */
                MPIR_Request_free(rts_sreq);

                /* --BEGIN ERROR HANDLING-- */
                if (mpi_errno != MPI_SUCCESS) {
                    mpi_errno = MPIR_Err_create_code(mpi_errno,
                                    MPIR_ERR_RECOVERABLE, __func__, __LINE__,
                                    MPI_ERR_OTHER, "**ch3|cancelrndv", 0);
                    goto fn_exit;
                }
                /* --END ERROR HANDLING-- */

                if (cancelled) {
                    MPIR_STATUS_SET_CANCEL_BIT(sreq->status, TRUE);
                    /* no other thread should be waiting on sreq, so it is
                       safe to reset ref_count and cc */
                    MPIR_cc_set(&sreq->cc, 0);
                    /* FIXME should be a decr and assert, not a set */
                    MPIR_Object_set_ref(sreq, 1);
                    goto fn_exit;
                }
            }
        }
        else {
            cancelled = FALSE;
            if (cancelled) {
                MPIR_STATUS_SET_CANCEL_BIT(sreq->status, TRUE);
                /* no other thread should be waiting on sreq, so it is safe to
                   reset ref_count and cc */
                MPIR_cc_set(&sreq->cc, 0);
                /* FIXME should be a decr and assert, not a set */
                MPIR_Object_set_ref(sreq, 1);
                goto fn_exit;
            }
        }
    }

    /* Part or all of the message has already been sent, so we need to send a
       cancellation request to the receiver in an attempt to catch the message
       before it is matched. */
    {
        int was_incomplete;
        MPIDI_CH3_Pkt_t upkt;
        MPIDI_CH3_Pkt_cancel_send_req_t * const csr_pkt = &upkt.cancel_send_req;
        MPIR_Request * csr_sreq;

        MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
            "sending cancel request to %d for 0x%08x",
            sreq->dev.match.parts.rank, sreq->handle));

        /* The completion counter and reference count are incremented to keep
           the request around long enough to receive a response regardless of
           what the user does (free the request before waiting, etc.). */
        MPIDI_CH3U_Request_increment_cc(sreq, &was_incomplete);
        if (!was_incomplete) {
            /* The reference count is incremented only if the request was
               complete before the increment. */
            MPIR_Request_add_ref( sreq );
        }

        MPIDI_Pkt_init(csr_pkt, MPIDI_CH3_PKT_CANCEL_SEND_REQ);
        csr_pkt->match.parts.rank = sreq->comm->rank;
        csr_pkt->match.parts.tag = sreq->dev.match.parts.tag;
        csr_pkt->match.parts.context_id = sreq->dev.match.parts.context_id;
        csr_pkt->sender_req_id = sreq->handle;

        MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
        mpi_errno = MPIDI_CH3_iStartMsg(vc, csr_pkt, sizeof(*csr_pkt), &csr_sreq);
        MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|cancelreq");
        }

        if (csr_sreq != NULL) {
            MPIR_Request_free(csr_sreq);
        }
    }

    /* FIXME: if send cancellation packets are allowed to arrive out-of-order
       with respect to send packets, then we need to timestamp send and cancel
       packets to ensure that a cancellation request does not bypass the send
       packet to be cancelled and erroneously cancel a previously sent message
       with the same request handle. */
    /* FIXME: A timestamp is more than is necessary; a message sequence number
       should be adequate. */
 fn_fail:
 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_CANCEL_SEND);
    return mpi_errno;
}
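/* The completion-counter/reference-count bump above is a general keep-alive
 * pattern: an object must survive until an asynchronous reply arrives even
 * if the user frees it first.  A minimal, self-contained sketch with C11
 * atomics is below; the struct and function names are hypothetical, not the
 * MPIR request API, and the sketch shows only the reference-count half of
 * the mechanism.
 */
#include <stdatomic.h>
#include <stdlib.h>

struct req {
    atomic_int refs;          /* owners of this object */
    /* ... request state ... */
};

static void req_addref(struct req *r)
{
    atomic_fetch_add_explicit(&r->refs, 1, memory_order_relaxed);
}

static void req_release(struct req *r)
{
    if (atomic_fetch_sub_explicit(&r->refs, 1, memory_order_acq_rel) == 1)
        free(r);   /* last owner gone */
}

/* Before posting the cancel packet: take a reference on behalf of the
 * not-yet-received reply, so a user-level free cannot destroy the request. */
static void post_cancel(struct req *r)
{
    req_addref(r);
    /* ... send the cancel-request packet ... */
}

/* When the reply arrives, the pinning reference is returned. */
static void cancel_reply_arrived(struct req *r)
{
    /* ... record cancel status in r ... */
    req_release(r);
}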
int MPIDI_CH3U_Post_data_receive_found(MPIR_Request * rreq)
{
    int mpi_errno = MPI_SUCCESS;
    int dt_contig;
    MPI_Aint dt_true_lb;
    intptr_t userbuf_sz;
    MPIR_Datatype * dt_ptr = NULL;
    intptr_t data_sz;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_FOUND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_FOUND);

    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"posted request found");

    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype,
                            dt_contig, userbuf_sz, dt_ptr, dt_true_lb);

    if (rreq->dev.recv_data_sz <= userbuf_sz) {
        data_sz = rreq->dev.recv_data_sz;
    }
    else {
        MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
            "receive buffer too small; message truncated, msg_sz=%" PRIdPTR
            ", userbuf_sz=%" PRIdPTR,
            rreq->dev.recv_data_sz, userbuf_sz));
        rreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS,
            MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TRUNCATE,
            "**truncate", "**truncate %d %d %d %d",
            rreq->status.MPI_SOURCE, rreq->status.MPI_TAG,
            rreq->dev.recv_data_sz, userbuf_sz );
        MPIR_STATUS_SET_COUNT(rreq->status, userbuf_sz);
        data_sz = userbuf_sz;
    }

    if (dt_contig && data_sz == rreq->dev.recv_data_sz) {
        /* user buffer is contiguous and large enough to store the entire
           message.  However, we haven't yet *read* the data (this code
           describes how to read the data into the destination) */
        MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"IOV loaded for contiguous read");
        rreq->dev.iov[0].MPL_IOV_BUF =
            (MPL_IOV_BUF_CAST)((char*)(rreq->dev.user_buf) + dt_true_lb);
        rreq->dev.iov[0].MPL_IOV_LEN = data_sz;
        rreq->dev.iov_count = 1;
        /* FIXME: We want to set the OnDataAvail to the appropriate function,
           which depends on whether this is an RMA request or a pt-to-pt
           request. */
        rreq->dev.OnDataAvail = 0;
    }
    else {
        /* user buffer is not contiguous or is too small to hold the entire
           message */
        MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"IOV loaded for non-contiguous read");
        rreq->dev.segment_ptr = MPIR_Segment_alloc( );
        MPIR_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno,
                             MPI_ERR_OTHER, "**nomem", "**nomem %s",
                             "MPIR_Segment_alloc");
        MPIR_Segment_init(rreq->dev.user_buf, rreq->dev.user_count,
                          rreq->dev.datatype, rreq->dev.segment_ptr);
        rreq->dev.segment_first = 0;
        rreq->dev.segment_size = data_sz;
        mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_SETFATALANDJUMP(mpi_errno,MPI_ERR_OTHER,
                                     "**ch3|loadrecviov");
        }
    }

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_FOUND);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
int MPIDI_CH3_iStartMsgv(MPIDI_VC_t * vc, MPL_IOV * iov, int n_iov,
                         MPIR_Request ** sreq_ptr)
{
    MPIR_Request *sreq = NULL;
    MPIDI_CH3I_VC *vcch = &vc->ch;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_ISTARTMSGV);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3_ISTARTMSGV);

    MPIR_Assert(n_iov <= MPL_IOV_LIMIT);

    /* The SOCK channel uses a fixed length header, the size of which is the
     * maximum of all possible packet headers */
    iov[0].MPL_IOV_LEN = sizeof(MPIDI_CH3_Pkt_t);
    MPL_DBG_STMT(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                 MPIDI_DBG_Print_packet((MPIDI_CH3_Pkt_t *) iov[0].MPL_IOV_BUF));

    if (vcch->state == MPIDI_CH3I_VC_STATE_CONNECTED) { /* MT */
        /* Connection already formed.  If send queue is empty attempt to send
         * data, queuing any unsent data. */
        if (MPIDI_CH3I_SendQ_empty(vcch)) {     /* MT */
            int rc;
            size_t nb;

            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                        "send queue empty, attempting to write");
            MPL_DBG_PKT(vcch->conn, (MPIDI_CH3_Pkt_t *) iov[0].MPL_IOV_BUF,
                        "isend");

            /* MT - need some signalling to lock down our right to use the
             * channel, thus ensuring that the progress engine does not
             * also try to write */
            rc = MPIDI_CH3I_Sock_writev(vcch->sock, iov, n_iov, &nb);
            if (rc == MPI_SUCCESS) {
                int offset = 0;

                MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                              "wrote %ld bytes", (unsigned long) nb);

                while (offset < n_iov) {
                    if (nb >= (int) iov[offset].MPL_IOV_LEN) {
                        nb -= iov[offset].MPL_IOV_LEN;
                        offset++;
                    } else {
                        MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                                    "partial write, request enqueued at head");
                        sreq = create_request(iov, n_iov, offset, nb);
                        if (sreq == NULL) {
                            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER,
                                                "**nomem");
                        }
                        MPIDI_CH3I_SendQ_enqueue_head(vcch, sreq);
                        MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                            (MPL_DBG_FDEST,
                             "posting writev, vc=0x%p, sreq=0x%08x",
                             vc, sreq->handle));
                        vcch->conn->send_active = sreq;
                        mpi_errno = MPIDI_CH3I_Sock_post_writev(vcch->conn->sock,
                                        sreq->dev.iov + offset,
                                        sreq->dev.iov_count - offset, NULL);
                        /* --BEGIN ERROR HANDLING-- */
                        if (mpi_errno != MPI_SUCCESS) {
                            mpi_errno = MPIR_Err_create_code(mpi_errno,
                                            MPIR_ERR_FATAL, __func__, __LINE__,
                                            MPI_ERR_OTHER,
                                            "**ch3|sock|postwrite",
                                            "ch3|sock|postwrite %p %p %p",
                                            sreq, vcch->conn, vc);
                        }
                        /* --END ERROR HANDLING-- */
                        break;
                    }
                }

                if (offset == n_iov) {
                    MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                                "entire write complete");
                }
            }
            /* --BEGIN ERROR HANDLING-- */
            else {
                MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL, TYPICAL,
                              "ERROR - MPIDI_CH3I_Sock_writev failed, rc=%d", rc);
                sreq = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
                if (sreq == NULL) {
                    MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
                }
                MPIR_cc_set(&(sreq->cc), 0);
                sreq->status.MPI_ERROR = MPIR_Err_create_code(rc,
                                             MPIR_ERR_RECOVERABLE, __func__,
                                             __LINE__, MPI_ERR_INTERN,
                                             "**ch3|sock|writefailed",
                                             "**ch3|sock|writefailed %d", rc);
                /* Make sure that the caller sees this error */
                mpi_errno = sreq->status.MPI_ERROR;
            }
            /* --END ERROR HANDLING-- */
        }
        else {
            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                        "send in progress, request enqueued");
            sreq = create_request(iov, n_iov, 0, 0);
            if (sreq == NULL) {
                MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
            }
            MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
        }
    }
    else if (vcch->state == MPIDI_CH3I_VC_STATE_CONNECTING) {
        MPL_DBG_VCUSE(vc, "connecting.  enqueuing request");

        /* queue the data so it can be sent after the connection is formed */
        sreq = create_request(iov, n_iov, 0, 0);
        if (sreq == NULL) {
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
    }
    else if (vcch->state == MPIDI_CH3I_VC_STATE_UNCONNECTED) {
        MPL_DBG_VCUSE(vc, "unconnected.  posting connect and enqueuing request");

        /* queue the data so it can be sent after the connection is formed */
        sreq = create_request(iov, n_iov, 0, 0);
        if (sreq == NULL) {
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);

        /* Form a new connection */
        MPIDI_CH3I_VC_post_connect(vc);
    }
    else if (vcch->state != MPIDI_CH3I_VC_STATE_FAILED) {
        /* Unable to send data at the moment, so queue it for later */
        MPL_DBG_VCUSE(vc, "forming connection, request enqueued");
        sreq = create_request(iov, n_iov, 0, 0);
        if (sreq == NULL) {
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
    }
    /* --BEGIN ERROR HANDLING-- */
    else {
        /* Connection failed, so allocate a request and return an error. */
        MPL_DBG_VCUSE(vc, "ERROR - connection failed");
        sreq = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
        if (sreq == NULL) {
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIR_cc_set(&(sreq->cc), 0);
        sreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS,
                                     MPIR_ERR_RECOVERABLE, __func__, __LINE__,
                                     MPI_ERR_INTERN,
                                     "**ch3|sock|connectionfailed", 0);
        /* Make sure that the caller sees this error */
        mpi_errno = sreq->status.MPI_ERROR;
    }
    /* --END ERROR HANDLING-- */

 fn_fail:
    *sreq_ptr = sreq;
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3_ISTARTMSGV);
    return mpi_errno;
}
int MPIR_Intercomm_create_impl(MPIR_Comm *local_comm_ptr, int local_leader,
                               MPIR_Comm *peer_comm_ptr, int remote_leader,
                               int tag, MPIR_Comm **new_intercomm_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Context_id_t final_context_id, recvcontext_id;
    int remote_size = 0, *remote_lpids = NULL;
    int comm_info[3];
    int is_low_group = 0;
    int cts_tag;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPIR_COMM_KIND__INTERCOMM_CREATE_IMPL);

    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPIR_COMM_KIND__INTERCOMM_CREATE_IMPL);

    /* Shift tag into the tagged coll space (tag provided by the user
       is ignored as of MPI 3.0) */
    cts_tag = MPIR_COMM_KIND__INTERCOMM_CREATE_TAG | MPIR_Process.tagged_coll_mask;

    mpi_errno = MPID_Intercomm_exchange_map(local_comm_ptr, local_leader,
                                            peer_comm_ptr, remote_leader,
                                            &remote_size, &remote_lpids,
                                            &is_low_group);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    /*
     * Create the contexts.  Each group will have a context for sending to
     * the other group.  All processes must be involved.  Because we know
     * that the local and remote groups are disjoint, this step will
     * complete
     */
    MPL_DBG_MSG_FMT(MPIR_DBG_COMM,VERBOSE,
        (MPL_DBG_FDEST,"About to get contextid (local_size=%d) on rank %d",
         local_comm_ptr->local_size, local_comm_ptr->rank ));
    /* In the multi-threaded case, MPIR_Get_contextid_sparse assumes that the
       calling routine already holds the single critical section */
    /* TODO: Make sure this is tag-safe */
    mpi_errno = MPIR_Get_contextid_sparse( local_comm_ptr, &recvcontext_id, FALSE );
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    MPIR_Assert(recvcontext_id != 0);
    MPL_DBG_MSG_FMT(MPIR_DBG_COMM,VERBOSE,
                    (MPL_DBG_FDEST,"Got contextid=%d", recvcontext_id));

    /* Leaders can now swap context ids and then broadcast the value to the
       local group of processes */
    if (local_comm_ptr->rank == local_leader) {
        MPIR_Context_id_t remote_context_id;

        mpi_errno = MPIC_Sendrecv( &recvcontext_id, 1, MPIR_CONTEXT_ID_T_DATATYPE,
                                   remote_leader, cts_tag,
                                   &remote_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE,
                                   remote_leader, cts_tag,
                                   peer_comm_ptr, MPI_STATUS_IGNORE, &errflag );
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        final_context_id = remote_context_id;

        /* Now, send all of our local processes the remote_lpids, along with
           the final context id */
        comm_info[0] = final_context_id;
        MPL_DBG_MSG(MPIR_DBG_COMM,VERBOSE,"About to bcast on local_comm");
        mpi_errno = MPID_Bcast( comm_info, 1, MPI_INT, local_leader,
                                local_comm_ptr, &errflag );
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        MPL_DBG_MSG_D(MPIR_DBG_COMM,VERBOSE,
                      "end of bcast on local_comm of size %d",
                      local_comm_ptr->local_size );
    }
    else {
        /* we're the other processes */
        MPL_DBG_MSG(MPIR_DBG_COMM,VERBOSE,"About to receive bcast on local_comm");
        mpi_errno = MPID_Bcast( comm_info, 1, MPI_INT, local_leader,
                                local_comm_ptr, &errflag );
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* Extract the context and group sign information */
        final_context_id = comm_info[0];
    }

    /* At last, we now have the information that we need to build the
       intercommunicator */

    /* All processes in the local_comm now build the communicator */

    mpi_errno = MPIR_Comm_create( new_intercomm_ptr );
    if (mpi_errno) goto fn_fail;

    (*new_intercomm_ptr)->context_id = final_context_id;
    (*new_intercomm_ptr)->recvcontext_id = recvcontext_id;
    (*new_intercomm_ptr)->remote_size = remote_size;
    (*new_intercomm_ptr)->local_size = local_comm_ptr->local_size;
    (*new_intercomm_ptr)->pof2 = local_comm_ptr->pof2;
    (*new_intercomm_ptr)->rank = local_comm_ptr->rank;
    (*new_intercomm_ptr)->comm_kind = MPIR_COMM_KIND__INTERCOMM;
    (*new_intercomm_ptr)->local_comm = 0;
    (*new_intercomm_ptr)->is_low_group = is_low_group;

    mpi_errno = MPID_Create_intercomm_from_lpids( *new_intercomm_ptr,
                                                  remote_size, remote_lpids );
    if (mpi_errno) goto fn_fail;

    MPIR_Comm_map_dup(*new_intercomm_ptr, local_comm_ptr, MPIR_COMM_MAP_DIR__L2L);

    /* Inherit the error handler (if any) */
    MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_COMM_MUTEX(local_comm_ptr));
    (*new_intercomm_ptr)->errhandler = local_comm_ptr->errhandler;
    if (local_comm_ptr->errhandler) {
        MPIR_Errhandler_add_ref( local_comm_ptr->errhandler );
    }
    MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_COMM_MUTEX(local_comm_ptr));

    mpi_errno = MPIR_Comm_commit(*new_intercomm_ptr);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

 fn_exit:
    if (remote_lpids) {
        MPL_free(remote_lpids);
        remote_lpids = NULL;
    }
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPIR_COMM_KIND__INTERCOMM_CREATE_IMPL);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
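/* For reference, this is the user-visible operation that the routine above
 * implements.  The calling pattern below uses only the standard MPI API: it
 * splits MPI_COMM_WORLD into two halves bridged by an intercommunicator,
 * with rank 0 of each half as its leader and MPI_COMM_WORLD as the peer
 * communicator.  (Consistent with the comment above, the tag argument is
 * ignored as of MPI 3.0.)
 */
#include <mpi.h>

static MPI_Comm make_bridge(void)
{
    MPI_Comm half, inter;
    int wrank, wsize;

    MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
    MPI_Comm_size(MPI_COMM_WORLD, &wsize);

    /* split into a low group and a high group */
    int low = (wrank < wsize / 2);
    MPI_Comm_split(MPI_COMM_WORLD, low, wrank, &half);

    /* local leader is rank 0 of each half; the remote leader is addressed
     * by its rank in the peer communicator (MPI_COMM_WORLD) */
    int remote_leader = low ? wsize / 2 : 0;
    MPI_Intercomm_create(half, 0, MPI_COMM_WORLD, remote_leader,
                         /*tag=*/0, &inter);
    MPI_Comm_free(&half);
    return inter;
}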
int MPIDI_CH3U_Handle_unordered_recv_pkt(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t * pkt,
                                         MPIR_Request ** rreqp)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_HANDLE_UNORDERED_RECV_PKT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_HANDLE_UNORDERED_RECV_PKT);

    *rreqp = NULL;

    switch(pkt->type) {
        case MPIDI_CH3_PKT_EAGER_SEND:
        case MPIDI_CH3_PKT_EAGER_SYNC_SEND:
        case MPIDI_CH3_PKT_READY_SEND:
        case MPIDI_CH3_PKT_RNDV_REQ_TO_SEND:
        {
            MPIDI_CH3_Pkt_send_t * send_pkt = (MPIDI_CH3_Pkt_send_t *) pkt;
            MPIDI_CH3_Pkt_send_container_t * pc_cur;
            MPIDI_CH3_Pkt_send_container_t * pc_last;

            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,
                        "received (potentially) out-of-order send pkt");
            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
                "rank=%d, tag=%d, context=%d seqnum=%d",
                send_pkt->match.rank, send_pkt->match.tag,
                send_pkt->match.context_id, send_pkt->seqnum));
            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
                "vc - seqnum_send=%d seqnum_recv=%d reorder_msg_queue=0x%08lx",
                vc->seqnum_send, vc->seqnum_recv,
                (unsigned long) vc->msg_reorder_queue));

            if (send_pkt->seqnum == vc->seqnum_recv) {
                mpi_errno = MPIDI_CH3U_Handle_ordered_recv_pkt(vc, pkt, rreqp);
                /* --BEGIN ERROR HANDLING-- */
                if (mpi_errno != MPI_SUCCESS) {
                    goto fn_exit;
                }
                /* --END ERROR HANDLING-- */
                vc->seqnum_recv++;

                pc_cur = vc->msg_reorder_queue;
                while(pc_cur != NULL && vc->seqnum_recv == pc_cur->pkt.seqnum) {
                    pkt = (MPIDI_CH3_Pkt_t *) &pc_cur->pkt;
                    mpi_errno = MPIDI_CH3U_Handle_ordered_recv_pkt(vc, pkt, rreqp);
                    /* --BEGIN ERROR HANDLING-- */
                    if (mpi_errno != MPI_SUCCESS) {
                        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS,
                                        MPIR_ERR_FATAL, __func__, __LINE__,
                                        MPI_ERR_OTHER, "**ch3|pktordered", 0);
                        goto fn_exit;
                    }
                    /* --END ERROR HANDLING-- */
                    vc->seqnum_recv++;
                    pc_last = pc_cur;
                    pc_cur = pc_cur->next;
                    MPIDI_CH3U_Pkt_send_container_free(pc_last);
                }
                vc->msg_reorder_queue = pc_cur;
            }
            else {
                MPIDI_CH3_Pkt_send_container_t * pc_new;

                /* allocate container and copy packet */
                pc_new = MPIDI_CH3U_Pkt_send_container_alloc();
                /* --BEGIN ERROR HANDLING-- */
                if (pc_new == NULL) {
                    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS,
                                    MPIR_ERR_FATAL, __func__, __LINE__,
                                    MPI_ERR_OTHER, "**ch3|nopktcontainermem", 0);
                    goto fn_exit;
                }
                /* --END ERROR HANDLING-- */
                pc_new->pkt = *send_pkt;

                /* insert packet into reorder queue */
                pc_last = NULL;
                pc_cur = vc->msg_reorder_queue;
                while (pc_cur != NULL) {
                    /* the current recv seqnum is subtracted from both seqnums
                       prior to comparison so as to remove any wrap-around
                       effects. */
                    if (pc_new->pkt.seqnum - vc->seqnum_recv <
                        pc_cur->pkt.seqnum - vc->seqnum_recv) {
                        break;
                    }
                    pc_last = pc_cur;
                    pc_cur = pc_cur->next;
                }

                if (pc_last == NULL) {
                    pc_new->next = pc_cur;
                    vc->msg_reorder_queue = pc_new;
                }
                else {
                    pc_new->next = pc_cur;
                    pc_last->next = pc_new;
                }
            }

            break;
        }

        case MPIDI_CH3_PKT_CANCEL_SEND_REQ:
        {
            /* --BEGIN ERROR HANDLING-- */
            /* FIXME: processing send cancel requests requires that we be
               aware of pkts in the reorder queue */
            mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                             __func__, __LINE__, MPI_ERR_OTHER,
                                             "**ch3|ooocancelreq", 0);
            goto fn_exit;
            break;
            /* --END ERROR HANDLING-- */
        }

        default:
        {
            mpi_errno = MPIDI_CH3U_Handle_ordered_recv_pkt(vc, pkt, rreqp);
            break;
        }
    }

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_HANDLE_UNORDERED_RECV_PKT);
    return mpi_errno;
}
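/* The reorder-queue insertion above relies on a wraparound-safe ordering
 * trick: subtracting the current receive sequence number from both sides
 * before comparing rebases the circular sequence space so that the next
 * expected value is smallest.  A self-contained illustration with 8-bit
 * sequence numbers (so the wrap is easy to see):
 */
#include <stdint.h>
#include <stdio.h>

/* Is a before b, relative to the current expected seqnum? */
static int seq_before(uint8_t a, uint8_t b, uint8_t expected)
{
    return (uint8_t)(a - expected) < (uint8_t)(b - expected);
}

int main(void)
{
    /* expected = 250: 254 comes before 3 even though 254 > 3 numerically */
    printf("%d\n", seq_before(254, 3, 250));   /* prints 1 */
    printf("%d\n", seq_before(3, 254, 250));   /* prints 0 */
    return 0;
}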
int MPID_Send(const void * buf, MPI_Aint count, MPI_Datatype datatype, int rank, int tag, MPIR_Comm * comm, int context_offset, MPIR_Request ** request) { intptr_t data_sz; int dt_contig; MPI_Aint dt_true_lb; MPIR_Datatype* dt_ptr; MPIR_Request * sreq = NULL; MPIDI_VC_t * vc; #if defined(MPID_USE_SEQUENCE_NUMBERS) MPID_Seqnum_t seqnum; #endif int eager_threshold = -1; int mpi_errno = MPI_SUCCESS; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_SEND); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_SEND); MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST, "rank=%d, tag=%d, context=%d", rank, tag, comm->context_id + context_offset)); /* Check to make sure the communicator hasn't already been revoked */ if (comm->revoked && MPIR_AGREE_TAG != MPIR_TAG_MASK_ERROR_BITS(tag & ~MPIR_Process.tagged_coll_mask) && MPIR_SHRINK_TAG != MPIR_TAG_MASK_ERROR_BITS(tag & ~MPIR_Process.tagged_coll_mask)) { MPIR_ERR_SETANDJUMP(mpi_errno,MPIX_ERR_REVOKED,"**revoked"); } if (rank == comm->rank && comm->comm_kind != MPIR_COMM_KIND__INTERCOMM) { mpi_errno = MPIDI_Isend_self(buf, count, datatype, rank, tag, comm, context_offset, MPIDI_REQUEST_TYPE_SEND, &sreq); /* In the single threaded case, sending to yourself will cause deadlock. Note that in the runtime-thread case, this check will not be made (long-term FIXME) */ # ifndef MPICH_IS_THREADED { if (sreq != NULL && MPIR_cc_get(sreq->cc) != 0) { MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**dev|selfsenddeadlock"); } } # endif if (mpi_errno != MPI_SUCCESS) { MPIR_ERR_POP(mpi_errno); } goto fn_exit; } if (rank == MPI_PROC_NULL) { goto fn_exit; } MPIDI_Comm_get_vc_set_active(comm, rank, &vc); MPIR_ERR_CHKANDJUMP1(vc->state == MPIDI_VC_STATE_MORIBUND, mpi_errno, MPIX_ERR_PROC_FAILED, "**comm_fail", "**comm_fail %d", rank); #ifdef ENABLE_COMM_OVERRIDES if (vc->comm_ops && vc->comm_ops->send) { mpi_errno = vc->comm_ops->send( vc, buf, count, datatype, rank, tag, comm, context_offset, &sreq); goto fn_exit; } #endif MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); if (data_sz == 0) { MPIDI_CH3_Pkt_t upkt; MPIDI_CH3_Pkt_eager_send_t * const eager_pkt = &upkt.eager_send; MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"sending zero length message"); MPIDI_Pkt_init(eager_pkt, MPIDI_CH3_PKT_EAGER_SEND); eager_pkt->match.parts.rank = comm->rank; eager_pkt->match.parts.tag = tag; eager_pkt->match.parts.context_id = comm->context_id + context_offset; eager_pkt->sender_req_id = MPI_REQUEST_NULL; eager_pkt->data_sz = 0; MPIDI_VC_FAI_send_seqnum(vc, seqnum); MPIDI_Pkt_set_seqnum(eager_pkt, seqnum); MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex); mpi_errno = MPIDI_CH3_iStartMsg(vc, eager_pkt, sizeof(*eager_pkt), &sreq); MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex); /* --BEGIN ERROR HANDLING-- */ if (mpi_errno != MPI_SUCCESS) { MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|eagermsg"); } /* --END ERROR HANDLING-- */ if (sreq != NULL) { MPIDI_Request_set_seqnum(sreq, seqnum); MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND); /* sreq->comm = comm; MPIR_Comm_add_ref(comm); -- not necessary for blocking functions */ } goto fn_exit; } MPIDI_CH3_GET_EAGER_THRESHOLD(&eager_threshold, comm, vc); /* FIXME: flow control: limit number of outstanding eager messages containing data and need to be buffered by the receiver */ #ifdef USE_EAGER_SHORT if (dt_contig && data_sz <= MPIDI_EAGER_SHORT_SIZE) { mpi_errno = MPIDI_CH3_EagerContigShortSend( &sreq, MPIDI_CH3_PKT_EAGERSHORT_SEND, (char *)buf + dt_true_lb, data_sz, rank, tag, comm, context_offset ); } else #endif if 
(data_sz + sizeof(MPIDI_CH3_Pkt_eager_send_t) <= eager_threshold) { if (dt_contig) { mpi_errno = MPIDI_CH3_EagerContigSend( &sreq, MPIDI_CH3_PKT_EAGER_SEND, (char *)buf + dt_true_lb, data_sz, rank, tag, comm, context_offset ); } else { MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit); MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND); mpi_errno = MPIDI_CH3_EagerNoncontigSend( &sreq, MPIDI_CH3_PKT_EAGER_SEND, buf, count, datatype, data_sz, rank, tag, comm, context_offset ); } } else {
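/* Editor's sketch (not MPICH code): the protocol selection in MPID_Send above
   reduces to a simple rule: if the payload plus the eager packet header fits
   under the channel's eager threshold, send eagerly (copy-and-forget);
   otherwise fall through to the rendezvous (RTS/CTS) path shown elsewhere in
   this file.  The names and sizes here are illustrative assumptions. */
#include <stddef.h>
#include <stdio.h>

enum proto { PROTO_EAGER_SHORT, PROTO_EAGER, PROTO_RNDV };

static enum proto choose_proto(size_t data_sz, size_t pkt_hdr_sz,
                               size_t eagershort_sz, size_t eager_threshold,
                               int dt_contig)
{
    if (dt_contig && data_sz <= eagershort_sz)
        return PROTO_EAGER_SHORT;            /* payload rides in the packet */
    if (data_sz + pkt_hdr_sz <= eager_threshold)
        return PROTO_EAGER;                  /* receiver buffers unexpected data */
    return PROTO_RNDV;                       /* handshake first, then bulk data */
}

int main(void)
{
    /* a 64 KiB threshold and 64-byte header are made-up example values */
    printf("%d\n", choose_proto(16,          64, 128, 65536, 1)); /* EAGER_SHORT */
    printf("%d\n", choose_proto(4096,        64, 128, 65536, 0)); /* EAGER */
    printf("%d\n", choose_proto((size_t)1 << 20, 64, 128, 65536, 1)); /* RNDV */
    return 0;
}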
/* MPIDI_CH3I_SendNoncontig - Sends a message by packing directly into cells. The caller must initialize sreq->dev.segment as well as segment_first and segment_size. */ int MPIDI_CH3I_SendNoncontig( MPIDI_VC_t *vc, MPIR_Request *sreq, void *header, intptr_t hdr_sz, MPL_IOV *hdr_iov, int n_hdr_iov) { int mpi_errno = MPI_SUCCESS; int again = 0; intptr_t orig_segment_first = sreq->dev.segment_first; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_SENDNONCONTIG); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3I_SENDNONCONTIG); MPIDI_DBG_Print_packet((MPIDI_CH3_Pkt_t *)header); MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); if (n_hdr_iov > 0) { /* translate segments to iovs and combine with the extended header iov. */ mpi_errno = MPIDI_CH3_SendNoncontig_iov(vc, sreq, header, hdr_sz, hdr_iov, n_hdr_iov); if (mpi_errno) MPIR_ERR_POP(mpi_errno); goto fn_exit; } if (!MPIDI_CH3I_Sendq_empty(MPIDI_CH3I_shm_sendq)) /* MT */ { /* send queue is not empty, enqueue the request then check to see if we can send any now */ MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER, TERSE, "enqueuing"); sreq->dev.pending_pkt = *(MPIDI_CH3_Pkt_t *)header; sreq->ch.noncontig = TRUE; sreq->ch.header_sz = hdr_sz; sreq->ch.vc = vc; MPIDI_CH3I_Sendq_enqueue(&MPIDI_CH3I_shm_sendq, sreq); mpi_errno = MPIDI_CH3I_Shm_send_progress(); if (mpi_errno) MPIR_ERR_POP(mpi_errno); goto fn_exit; } /* send as many cells of data as you can */ MPID_nem_mpich_send_seg_header(sreq->dev.segment_ptr, &sreq->dev.segment_first, sreq->dev.segment_size, header, hdr_sz, vc, &again); while(!again && sreq->dev.segment_first < sreq->dev.segment_size) MPID_nem_mpich_send_seg(sreq->dev.segment_ptr, &sreq->dev.segment_first, sreq->dev.segment_size, vc, &again); if (again) { /* we didn't finish sending everything */ sreq->ch.noncontig = TRUE; sreq->ch.vc = vc; if (sreq->dev.segment_first == orig_segment_first) /* nothing was sent, save header */ { sreq->dev.pending_pkt = *(MPIDI_CH3_Pkt_t *)header; sreq->ch.header_sz = hdr_sz; } else { /* part of message was sent, make this req an active send */ MPIR_Assert(MPIDI_CH3I_shm_active_send == NULL); MPIDI_CH3I_shm_active_send = sreq; } MPIDI_CH3I_Sendq_enqueue(&MPIDI_CH3I_shm_sendq, sreq); goto fn_exit; } /* finished sending all data, complete the request */ if (!sreq->dev.OnDataAvail) { MPIR_Assert(MPIDI_Request_get_type(sreq) != MPIDI_REQUEST_TYPE_GET_RESP); mpi_errno = MPID_Request_complete(sreq); if (mpi_errno != MPI_SUCCESS) { MPIR_ERR_POP(mpi_errno); } MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete %d bytes", (int) (sreq->dev.segment_size)); } else { int complete = 0; mpi_errno = sreq->dev.OnDataAvail(vc, sreq, &complete); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_Assert(complete); /* all data has been sent, we should always complete */ MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete %d bytes", (int) (sreq->dev.segment_size)); } fn_exit: MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3I_SENDNONCONTIG); return mpi_errno; fn_fail: goto fn_exit; }
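/* Editor's sketch (illustrative only, not the nemesis code): the core
   bookkeeping in MPIDI_CH3I_SendNoncontig above is "advance segment_first
   toward segment_size one cell at a time; if the channel backs up, remember
   where we stopped and finish on a later progress call".  The cell size and
   the cell_send callback are assumptions made for this sketch. */
#include <stddef.h>

#define CELL_SZ 64          /* assumed cell payload size */

struct seg { size_t first, size; const char *buf; };

/* Returns 1 if the whole segment was sent, 0 if the channel backed up. */
int send_cells(struct seg *s, int (*cell_send)(const void *, size_t))
{
    while (s->first < s->size) {
        size_t n = s->size - s->first;
        if (n > CELL_SZ)
            n = CELL_SZ;
        if (!cell_send(s->buf + s->first, n))
            return 0;       /* "again": enqueue the request, resume from s->first */
        s->first += n;
    }
    return 1;               /* complete: run OnDataAvail / complete the request */
}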
int MPIDI_CH3_PktPrint_Close( FILE *fp, MPIDI_CH3_Pkt_t *pkt ) { MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE," type ......... MPIDI_CH3_PKT_CLOSE\n"); MPL_DBG_MSG_S(MPIDI_CH3_DBG_OTHER,TERSE," ack ......... %s\n", pkt->close.ack ? "TRUE" : "FALSE"); return MPI_SUCCESS; }
int MPIR_Get_contextid_sparse_group(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, int tag, MPIR_Context_id_t * context_id, int ignore_id)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    struct gcn_state st;
    struct gcn_state *tmp;
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPIR_GET_CONTEXTID);

    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPIR_GET_CONTEXTID);

    st.first_iter = 1;
    st.comm_ptr = comm_ptr;
    st.tag = tag;
    st.own_mask = 0;
    st.own_eager_mask = 0;

    /* Group-collective and ignore_id should never be combined */
    MPIR_Assert(!(group_ptr != NULL && ignore_id));

    *context_id = 0;

    MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST,
        "Entering; shared state is %d:%d, my ctx id is %d, tag=%d",
        mask_in_use, eager_in_use, comm_ptr->context_id, tag));

    while (*context_id == 0) {
        /* We lock only around access to the mask (except in the global locking
         * case).  If another thread is using the mask, we take a mask of zero. */
        MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);

        if (initialize_context_mask) {
            context_id_init();
        }

        if (eager_nelem < 0) {
            /* Ensure that at least one word of deadlock-free context IDs is
             * always set aside for the base protocol */
            MPIR_Assert(MPIR_CVAR_CTXID_EAGER_SIZE >= 0 &&
                        MPIR_CVAR_CTXID_EAGER_SIZE < MPIR_MAX_CONTEXT_MASK - 1);
            eager_nelem = MPIR_CVAR_CTXID_EAGER_SIZE;
        }

        if (ignore_id) {
            /* We are not participating in the resulting communicator, so our
             * context ID space doesn't matter.  Set the mask to "all available". */
            memset(st.local_mask, 0xff, MPIR_MAX_CONTEXT_MASK * sizeof(int));
            st.own_mask = 0;
            /* don't need to touch mask_in_use/lowest_context_id b/c our thread
             * doesn't ever need to "win" the mask */
        }
        /* Deadlock avoidance: Only participate in the context id loop when all
         * processes have called this routine.  On the first iteration, use the
         * "eager" allocation protocol. */
        else if (st.first_iter) {
            memset(st.local_mask, 0, MPIR_MAX_CONTEXT_MASK * sizeof(int));
            st.own_eager_mask = 0;
            /* Attempt to reserve the eager mask segment */
            if (!eager_in_use && eager_nelem > 0) {
                int i;
                for (i = 0; i < eager_nelem; i++)
                    st.local_mask[i] = context_mask[i];
                eager_in_use = 1;
                st.own_eager_mask = 1;
            }
        }
        else {
            MPIR_Assert(next_gcn != NULL);
            /* If we are here, at least one element must be in the list: at
             * least ourselves. */
            /* Only the first element in the list can own the mask.  However,
             * the mask may be in use by another thread that added an
             * allocation to the list before us, so we have to check whether
             * the mask is in use and record whether we own it. */
            if (mask_in_use || &st != next_gcn) {
                memset(st.local_mask, 0, MPIR_MAX_CONTEXT_MASK * sizeof(int));
                st.own_mask = 0;
                MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST,
                    "Mask is in use, my context_id is %d, owner context id is %d",
                    st.comm_ptr->context_id, next_gcn->comm_ptr->context_id));
            }
            else {
                int i;
                /* Copy safe mask segment to local_mask */
                for (i = 0; i < eager_nelem; i++)
                    st.local_mask[i] = 0;
                for (i = eager_nelem; i < MPIR_MAX_CONTEXT_MASK; i++)
                    st.local_mask[i] = context_mask[i];
                mask_in_use = 1;
                st.own_mask = 1;
                MPL_DBG_MSG(MPIR_DBG_COMM, VERBOSE, "Copied local_mask");
            }
        }
        MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);

        /* Note: MPIR_MAX_CONTEXT_MASK elements of local_mask are used by the
         * context ID allocation algorithm.  The additional element is ignored
         * by the context ID mask access routines and is used as a flag for
         * detecting context ID exhaustion (explained below). */
        if (st.own_mask || ignore_id)
            st.local_mask[ALL_OWN_MASK_FLAG] = 1;
        else
            st.local_mask[ALL_OWN_MASK_FLAG] = 0;

        /* Now, try to get a context id */
        MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM);
        /* In the global and brief-global cases, note that this routine will
         * release that global lock when it needs to wait.  That will allow
         * other processes to enter the global or brief global critical
         * section. */
        if (group_ptr != NULL) {
            int coll_tag = tag | MPIR_Process.tagged_coll_mask; /* Shift tag into the tagged coll space */
            mpi_errno = MPIR_Allreduce_group(MPI_IN_PLACE, st.local_mask,
                                             MPIR_MAX_CONTEXT_MASK + 1, MPI_INT,
                                             MPI_BAND, comm_ptr, group_ptr,
                                             coll_tag, &errflag);
        }
        else {
            mpi_errno = MPID_Allreduce(MPI_IN_PLACE, st.local_mask,
                                       MPIR_MAX_CONTEXT_MASK + 1, MPI_INT,
                                       MPI_BAND, comm_ptr, &errflag);
        }
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* MT FIXME 2/3 cases don't seem to need the CONTEXTID CS, check and
         * narrow this region */
        MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
        if (ignore_id) {
            /* we don't care what the value was, but make sure that everyone
             * who did care agreed on a value */
            *context_id = locate_context_bit(st.local_mask);
            /* used later in the out-of-context-ids check and the outer while
             * loop condition */
        }
        else if (st.own_eager_mask) {
            /* There is a chance that we've found a context id.
             * find_and_allocate_context_id updates the context_mask if it
             * finds a match. */
            *context_id = find_and_allocate_context_id(st.local_mask);
            MPL_DBG_MSG_D(MPIR_DBG_COMM, VERBOSE, "Context id is now %hd", *context_id);

            st.own_eager_mask = 0;
            eager_in_use = 0;
            if (*context_id <= 0) {
                /* else we did not find a context id.  Give up the mask in case
                 * there is another thread (with a lower input context id)
                 * waiting for it.  We need to ensure that any other threads
                 * have the opportunity to run, hence yielding. */
                /* FIXME: Do we need to do a GLOBAL yield here?  When we do a
                 * collective operation, we yield for others anyway. */
                MPID_THREAD_CS_YIELD(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
                MPID_THREAD_CS_YIELD(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
            }
        }
        else if (st.own_mask) {
            /* There is a chance that we've found a context id.
             * find_and_allocate_context_id updates the context_mask if it
             * finds a match. */
            *context_id = find_and_allocate_context_id(st.local_mask);
            MPL_DBG_MSG_D(MPIR_DBG_COMM, VERBOSE, "Context id is now %hd", *context_id);
            mask_in_use = 0;

            if (*context_id > 0) {
                /* If we found a new context id, we have to remove the element
                 * from the list so the next allocation can own the mask */
                if (next_gcn == &st) {
                    next_gcn = st.next;
                }
                else {
                    for (tmp = next_gcn; tmp->next != &st; tmp = tmp->next); /* avoid compiler warnings */
                    tmp->next = st.next;
                }
            }
            else {
                /* else we did not find a context id.  Give up the mask in case
                 * there is another thread in the gcn_next_list waiting for it.
                 * We need to ensure that any other threads have the
                 * opportunity to run, hence yielding. */
                /* FIXME: Do we need to do a GLOBAL yield here?  When we do a
                 * collective operation, we yield for others anyway. */
                MPID_THREAD_CS_YIELD(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
                MPID_THREAD_CS_YIELD(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
            }
        }
        else {
            /* As above, force this thread to yield */
            /* FIXME: Do we need to do a GLOBAL yield here?  When we do a
             * collective operation, we yield for others anyway. */
            MPID_THREAD_CS_YIELD(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
            MPID_THREAD_CS_YIELD(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
        }
        MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);

        /* Test for context ID exhaustion: All threads that will participate in
         * the new communicator owned the mask and could not allocate a context
         * ID.  This indicates that either some process has no context IDs
         * available, or that some are available, but the allocation cannot
         * succeed because there is no common context ID. */
        if (*context_id == 0 && st.local_mask[ALL_OWN_MASK_FLAG] == 1) {
            /* --BEGIN ERROR HANDLING-- */
            int nfree = 0;
            int ntotal = 0;
            int minfree;

            if (st.own_mask) {
                MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
                mask_in_use = 0;
                MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
            }

            context_mask_stats(&nfree, &ntotal);
            if (ignore_id)
                minfree = INT_MAX;
            else
                minfree = nfree;

            if (group_ptr != NULL) {
                int coll_tag = tag | MPIR_Process.tagged_coll_mask; /* Shift tag into the tagged coll space */
                mpi_errno = MPIR_Allreduce_group(MPI_IN_PLACE, &minfree, 1, MPI_INT,
                                                 MPI_MIN, comm_ptr, group_ptr,
                                                 coll_tag, &errflag);
            }
            else {
                mpi_errno = MPID_Allreduce(MPI_IN_PLACE, &minfree, 1, MPI_INT,
                                           MPI_MIN, comm_ptr, &errflag);
            }

            if (minfree > 0) {
                MPIR_ERR_SETANDJUMP3(mpi_errno, MPI_ERR_OTHER,
                                     "**toomanycommfrag", "**toomanycommfrag %d %d %d",
                                     nfree, ntotal, ignore_id);
            }
            else {
                MPIR_ERR_SETANDJUMP3(mpi_errno, MPI_ERR_OTHER,
                                     "**toomanycomm", "**toomanycomm %d %d %d",
                                     nfree, ntotal, ignore_id);
            }
            /* --END ERROR HANDLING-- */
        }

        if (st.first_iter == 1) {
            st.first_iter = 0;
            /* to avoid deadlocks, the element is not added to the list before
             * the first iteration */
            if (!ignore_id && *context_id == 0) {
                MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
                add_gcn_to_list(&st);
                MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
            }
        }
    }

  fn_exit:
    if (ignore_id)
        *context_id = MPIR_INVALID_CONTEXT_ID;
    MPL_DBG_MSG_S(MPIR_DBG_COMM, VERBOSE, "Context mask = %s", context_mask_to_str());
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPIR_GET_CONTEXTID);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    /* Release the masks */
    MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
    if (st.own_mask) {
        mask_in_use = 0;
    }
    /* If in list, remove it */
    if (!st.first_iter && !ignore_id) {
        if (next_gcn == &st) {
            next_gcn = st.next;
        }
        else {
            for (tmp = next_gcn; tmp->next != &st; tmp = tmp->next);
            tmp->next = st.next;
        }
    }
    MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
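/* Editor's sketch (not the MPIR implementation): the heart of the context-id
   allocation above is an allreduce with MPI_BAND over each process's bitmask
   of locally free IDs; every process then picks the lowest bit set in the
   identical result, so all of them agree without extra communication.
   MASK_WORDS is an illustrative size, not MPIR_MAX_CONTEXT_MASK. */
#include <mpi.h>
#include <stdint.h>

#define MASK_WORDS 64   /* 64 * 32 = 2048 allocatable IDs, for illustration */

static int lowest_common_id(const uint32_t local[], uint32_t shared[],
                            MPI_Comm comm)
{
    int w, b;
    /* intersect everyone's availability mask */
    MPI_Allreduce(local, shared, MASK_WORDS, MPI_UINT32_T, MPI_BAND, comm);
    for (w = 0; w < MASK_WORDS; w++)
        for (b = 0; b < 32; b++)
            if (shared[w] & (UINT32_C(1) << b))
                return w * 32 + b;      /* first ID free on every process */
    return -1;                          /* exhaustion: no common free ID */
}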
int MPIDI_CH3U_Receive_data_found(MPIR_Request *rreq, void *buf, intptr_t *buflen, int *complete)
{
    int dt_contig;
    MPI_Aint dt_true_lb;
    intptr_t userbuf_sz;
    MPIR_Datatype * dt_ptr = NULL;
    intptr_t data_sz;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_FOUND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_FOUND);

    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"posted request found");

    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype,
                            dt_contig, userbuf_sz, dt_ptr, dt_true_lb);

    if (rreq->dev.recv_data_sz <= userbuf_sz) {
        data_sz = rreq->dev.recv_data_sz;
    }
    else {
        MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
            "receive buffer too small; message truncated, msg_sz=%" PRIdPTR
            ", userbuf_sz=%" PRIdPTR, rreq->dev.recv_data_sz, userbuf_sz));
        rreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS,
            MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TRUNCATE,
            "**truncate", "**truncate %d %d %d %d",
            rreq->status.MPI_SOURCE, rreq->status.MPI_TAG,
            rreq->dev.recv_data_sz, userbuf_sz);
        MPIR_STATUS_SET_COUNT(rreq->status, userbuf_sz);
        data_sz = userbuf_sz;
    }

    if (dt_contig && data_sz == rreq->dev.recv_data_sz) {
        /* user buffer is contiguous and large enough to store the entire
           message.  However, we haven't yet *read* the data (this code
           describes how to read the data into the destination) */

        /* if all of the data has already been received, unpack it now,
           otherwise build an iov and let the channel unpack */
        if (*buflen >= data_sz) {
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"Copying contiguous data to user buffer");
            /* copy data out of the receive buffer */
            if (rreq->dev.drop_data == FALSE) {
                MPIR_Memcpy((char*)(rreq->dev.user_buf) + dt_true_lb, buf, data_sz);
            }
            *buflen = data_sz;
            *complete = TRUE;
        }
        else {
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"IOV loaded for contiguous read");
            rreq->dev.iov[0].MPL_IOV_BUF =
                (MPL_IOV_BUF_CAST)((char*)(rreq->dev.user_buf) + dt_true_lb);
            rreq->dev.iov[0].MPL_IOV_LEN = data_sz;
            rreq->dev.iov_count = 1;
            *buflen = 0;
            *complete = FALSE;
        }

        /* Trigger OnFinal when receiving the last segment */
        rreq->dev.OnDataAvail = rreq->dev.OnFinal;
    }
    else {
        /* user buffer is not contiguous or is too small to hold the entire
           message */
        rreq->dev.segment_ptr = MPIR_Segment_alloc();
        MPIR_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno,
                             MPI_ERR_OTHER, "**nomem", "**nomem %s",
                             "MPIR_Segment_alloc");
        MPIR_Segment_init(rreq->dev.user_buf, rreq->dev.user_count,
                          rreq->dev.datatype, rreq->dev.segment_ptr);
        rreq->dev.segment_first = 0;
        rreq->dev.segment_size = data_sz;

        /* if all of the data has already been received, and the message is
           not truncated, unpack it now, otherwise build an iov and let the
           channel unpack */
        if (data_sz == rreq->dev.recv_data_sz && *buflen >= data_sz) {
            intptr_t last;
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"Copying noncontiguous data to user buffer");
            last = data_sz;
            MPIR_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first,
                                &last, buf);
            /* --BEGIN ERROR HANDLING-- */
            if (last != data_sz) {
                /* If the data can't be unpacked, then we have a mismatch
                   between the datatype and the amount of data received.
                   Throw away the received data. */
                MPIR_ERR_SET(rreq->status.MPI_ERROR, MPI_ERR_TYPE, "**dtypemismatch");
                MPIR_STATUS_SET_COUNT(rreq->status, rreq->dev.segment_first);
                *buflen = data_sz;
                *complete = TRUE;
                /* FIXME: Set OnDataAvail to 0?  If not, why not? */
                goto fn_exit;
            }
            /* --END ERROR HANDLING-- */
            *buflen = data_sz;
            /* Trigger OnFinal when receiving the last segment */
            rreq->dev.OnDataAvail = rreq->dev.OnFinal;
            *complete = TRUE;
        }
        else {
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"IOV loaded for non-contiguous read");
            mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_SETFATALANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|loadrecviov");
            }
            *buflen = 0;
            *complete = FALSE;
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_FOUND);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
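/* Editor's sketch: MPIDI_CH3U_Receive_data_found above boils down to a 2x2
   decision, (contiguous datatype?) x (is the full message already in the
   channel buffer?).  Copy or unpack immediately when everything is present,
   otherwise publish an IOV and let the channel finish the read.  This is a
   restatement of the logic, not new behavior. */
enum recv_action {
    RECV_COPY_NOW,      /* contig, all data present: single memcpy */
    RECV_IOV_CONTIG,    /* contig, data still arriving: one-entry IOV */
    RECV_UNPACK_NOW,    /* noncontig, all data present: segment unpack */
    RECV_IOV_SEGMENT    /* noncontig, data still arriving: load recv IOV */
};

enum recv_action classify_recv(int dt_contig, int all_data_present)
{
    if (dt_contig)
        return all_data_present ? RECV_COPY_NOW : RECV_IOV_CONTIG;
    return all_data_present ? RECV_UNPACK_NOW : RECV_IOV_SEGMENT;
}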
int MPIDI_CH3U_Get_failed_group(int last_rank, MPIR_Group **failed_group) { char *c; int i, mpi_errno = MPI_SUCCESS, rank; UT_array *failed_procs = NULL; MPIR_Group *world_group; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_GET_FAILED_GROUP); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_GET_FAILED_GROUP); MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER, VERBOSE, "Getting failed group with %d as last acknowledged\n", last_rank); if (-1 == last_rank) { MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER, VERBOSE, "No failure acknowledged"); *failed_group = MPIR_Group_empty; goto fn_exit; } if (*MPIDI_failed_procs_string == '\0') { MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER, VERBOSE, "Found no failed ranks"); *failed_group = MPIR_Group_empty; goto fn_exit; } utarray_new(failed_procs, &ut_int_icd); /* parse list of failed processes. This is a comma separated list of ranks or ranges of ranks (e.g., "1, 3-5, 11") */ i = 0; c = MPIDI_failed_procs_string; while(1) { parse_rank(&rank); ++i; MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER, VERBOSE, "Found failed rank: %d", rank); utarray_push_back(failed_procs, &rank); MPIDI_last_known_failed = rank; MPIR_ERR_CHKINTERNAL(*c != ',' && *c != '\0', mpi_errno, "error parsing failed process list"); if (*c == '\0' || last_rank == rank) break; ++c; /* skip ',' */ } /* Create group of failed processes for comm_world. Failed groups for other communicators can be created from this one using group_intersection. */ mpi_errno = MPIR_Comm_group_impl(MPIR_Process.comm_world, &world_group); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Group_incl_impl(world_group, i, ut_int_array(failed_procs), failed_group); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Group_release(world_group); if (mpi_errno) MPIR_ERR_POP(mpi_errno); fn_exit: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_GET_FAILED_GROUP); if (failed_procs) utarray_free(failed_procs); return mpi_errno; fn_oom: MPIR_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "utarray"); fn_fail: goto fn_exit; }
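/* Editor's sketch (standalone, not the CH3 parser): the failed-process string
   handled above is a comma-separated list of ranks and ranges, e.g.
   "1, 3-5, 11".  A minimal parser using only strtol; error handling is
   deliberately simplified. */
#include <stdio.h>
#include <stdlib.h>

static int parse_failed(const char *s, int out[], int max)
{
    int n = 0;
    while (*s && n < max) {
        char *end;
        long lo = strtol(s, &end, 10), hi = lo;
        if (*end == '-')                      /* range "lo-hi" */
            hi = strtol(end + 1, &end, 10);
        for (; lo <= hi && n < max; lo++)
            out[n++] = (int)lo;
        if (*end != ',')
            break;                            /* end of list (or parse error) */
        s = end + 1;
    }
    return n;                                 /* number of ranks recovered */
}

int main(void)
{
    int r[16], i, n = parse_failed("1, 3-5, 11", r, 16);
    for (i = 0; i < n; i++)
        printf("%d ", r[i]);                  /* prints: 1 3 4 5 11 */
    printf("\n");
    return 0;
}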
void MPIDI_DBG_Print_packet(MPIDI_CH3_Pkt_t * pkt)
{
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE,"MPIDI_CH3_Pkt_t:\n");
    switch (pkt->type) {
        case MPIDI_CH3_PKT_EAGER_SEND:
            MPIDI_CH3_PktPrint_EagerSend(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_READY_SEND:
            MPIDI_CH3_PktPrint_ReadySend(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_EAGER_SYNC_SEND:
            MPIDI_CH3_PktPrint_EagerSyncSend(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_EAGER_SYNC_ACK:
            MPIDI_CH3_PktPrint_EagerSyncAck(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_RNDV_REQ_TO_SEND:
            MPIDI_CH3_PktPrint_RndvReqToSend(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_RNDV_CLR_TO_SEND:
            MPIDI_CH3_PktPrint_RndvClrToSend(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_RNDV_SEND:
            MPIDI_CH3_PktPrint_RndvSend(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_CANCEL_SEND_REQ:
            MPIDI_CH3_PktPrint_CancelSendReq(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_CANCEL_SEND_RESP:
            MPIDI_CH3_PktPrint_CancelSendResp(stdout, pkt);
            break;

        /* FIXME: Move these RMA descriptions into the RMA code files */
        case MPIDI_CH3_PKT_PUT:
            MPIDI_CH3_PktPrint_Put(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_GET:
            MPIDI_CH3_PktPrint_Get(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_GET_RESP:
            MPIDI_CH3_PktPrint_GetResp(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_ACCUMULATE:
            MPIDI_CH3_PktPrint_Accumulate(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_LOCK:
            MPIDI_CH3_PktPrint_Lock(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_ACK:
            MPIDI_CH3_PktPrint_Ack(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_LOCK_ACK:
            MPIDI_CH3_PktPrint_LockAck(stdout, pkt);
            break;
        /*
         * case MPIDI_CH3_PKT_SHARED_LOCK_OPS_DONE:
         *     MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE," type ......... MPIDI_CH3_PKT_SHARED_LOCK_OPS_DONE\n");
         *     MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST," source ....... 0x%08X\n", pkt->shared_lock_ops_done.source_win_handle));
         *     break;
         */
        case MPIDI_CH3_PKT_FLOW_CNTL_UPDATE:
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE," FLOW_CNTRL_UPDATE\n");
            break;
        case MPIDI_CH3_PKT_CLOSE:
            MPIDI_CH3_PktPrint_Close(stdout, pkt);
            break;

        default:
            /* For a packet of unknown type, only the type itself can be
               reported; the per-type field dumps are handled by the
               MPIDI_CH3_PktPrint_* routines above. */
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE," INVALID PACKET\n");
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE," unknown type ... %d\n", pkt->type);
            break;
    }
}
/* XXX DJG FIXME at some point this should poll, much like the newtcp module.
   But then we have that whole pollfd array to manage, which we don't really
   need until this proof-of-concept proves itself. */
int MPID_nem_lmt_vmsplice_progress(void)
{
    int mpi_errno = MPI_SUCCESS;
    struct lmt_vmsplice_node *prev = NULL;
    struct lmt_vmsplice_node *cur = outstanding_head;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_LMT_VMSPLICE_PROGRESS);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_LMT_VMSPLICE_PROGRESS);

    while (cur) {
        int complete = 0;

        switch (MPIDI_Request_get_type(cur->req)) {
            case MPIDI_REQUEST_TYPE_RECV:
                mpi_errno = do_readv(cur->req, cur->pipe_fd, cur->req->dev.iov,
                                     &cur->req->dev.iov_offset,
                                     &cur->req->dev.iov_count, &complete);
                /* FIXME: set the error status of the req and complete it,
                   rather than POP */
                if (mpi_errno) MPIR_ERR_POP(mpi_errno);
                break;
            case MPIDI_REQUEST_TYPE_SEND:
                mpi_errno = do_vmsplice(cur->req, cur->pipe_fd, cur->req->dev.iov,
                                        &cur->req->dev.iov_offset,
                                        &cur->req->dev.iov_count, &complete);
                /* FIXME: set the error status of the req and complete it,
                   rather than POP */
                if (mpi_errno) MPIR_ERR_POP(mpi_errno);
                break;
            default:
                MPIR_ERR_INTERNALANDJUMP(mpi_errno, "unexpected request type");
                break;
        }

        if (complete) {
            struct lmt_vmsplice_node *free_me = cur;

            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");

            /* unlink the node; prev stays where it is so that it never points
               at freed memory, and cur moves to the successor exactly once so
               that no node is skipped */
            if (cur == outstanding_head)
                outstanding_head = cur->next;
            else
                prev->next = cur->next;
            cur = cur->next;
            MPL_free(free_me);
            --MPID_nem_local_lmt_pending;
        }
        else {
            prev = cur;
            cur = cur->next;
        }
    }

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_LMT_VMSPLICE_PROGRESS);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
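/* Editor's sketch (standalone): the traversal-with-removal pattern used by the
   progress loop above, reduced to its essentials.  Removing the current node
   must not advance 'prev', and 'cur' must move to the successor exactly once. */
#include <stdio.h>
#include <stdlib.h>

struct node { int v; struct node *next; };

/* remove all nodes matching v from *head, preserving relative order */
static void remove_matching(struct node **head, int v)
{
    struct node *prev = NULL, *cur = *head;
    while (cur) {
        if (cur->v == v) {
            struct node *dead = cur;
            if (prev) prev->next = cur->next;
            else      *head     = cur->next;
            cur = cur->next;          /* advance once; prev is unchanged */
            free(dead);
        } else {
            prev = cur;
            cur = cur->next;
        }
    }
}

int main(void)
{
    struct node *head = NULL, **tail = &head;
    int vals[] = { 1, 2, 2, 3, 2 }, i;
    for (i = 0; i < 5; i++) {         /* build 1 -> 2 -> 2 -> 3 -> 2 */
        *tail = calloc(1, sizeof(**tail));
        (*tail)->v = vals[i];
        tail = &(*tail)->next;
    }
    remove_matching(&head, 2);
    for (; head; head = head->next)
        printf("%d ", head->v);       /* prints: 1 3 */
    printf("\n");
    return 0;
}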
/*@
  MPIDI_CH3U_Handle_connection - handle connection event

  Input Parameters:
+ vc - virtual connection
. event - connection event

  NOTE:
  This routine is used to transition the VC state.

  The only events currently handled are TERMINATED events.  This routine
  should be called (with TERMINATED) whenever a connection is terminated,
  whether normally (in MPIDI_CH3_Connection_terminate()) or abnormally.

  FIXME: Currently state transitions resulting from receiving CLOSE
  packets are performed in MPIDI_CH3_PktHandler_Close().  Perhaps that
  should move here.
@*/
int MPIDI_CH3U_Handle_connection(MPIDI_VC_t * vc, MPIDI_VC_Event_t event)
{
    int inuse;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_HANDLE_CONNECTION);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_HANDLE_CONNECTION);

    switch (event)
    {
        case MPIDI_VC_EVENT_TERMINATED:
        {
            switch (vc->state)
            {
                case MPIDI_VC_STATE_CLOSED:
                    /* Normal termination. */
                    MPIDI_CHANGE_VC_STATE(vc, INACTIVE);

                    /* MT: this is not thread safe */
                    MPIDI_Outstanding_close_ops -= 1;
                    MPL_DBG_MSG_D(MPIDI_CH3_DBG_DISCONNECT,TYPICAL,
                                  "outstanding close operations = %d",
                                  MPIDI_Outstanding_close_ops);

                    if (MPIDI_Outstanding_close_ops == 0)
                    {
                        MPIDI_CH3_Progress_signal_completion();
                        mpi_errno = MPIDI_CH3_Channel_close();
                        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
                    }
                    break;

                case MPIDI_VC_STATE_INACTIVE:
                    /* VC was terminated before it was activated.  This can
                       happen if a failed process was detected before the
                       process used the VC. */
                    MPL_DBG_MSG(MPIDI_CH3_DBG_DISCONNECT,TYPICAL,
                                "VC terminated before it was activated.  We probably got a failed"
                                " process notification.");
                    MPIDI_CH3U_Complete_posted_with_error(vc);
                    ++MPIDI_Failed_vc_count;
                    MPIDI_CHANGE_VC_STATE(vc, MORIBUND);
                    break;

                case MPIDI_VC_STATE_ACTIVE:
                case MPIDI_VC_STATE_REMOTE_CLOSE:
                    /* This is a premature termination.  This process has not
                       started the close protocol.  There may be outstanding
                       sends or receives on the local side, remote side or
                       both. */
                    MPL_DBG_MSG(MPIDI_CH3_DBG_DISCONNECT,TYPICAL,
                                "Connection closed prematurely.");
                    MPIDI_CH3U_Complete_posted_with_error(vc);
                    ++MPIDI_Failed_vc_count;
                    MPIDU_Ftb_publish_vc(MPIDU_FTB_EV_UNREACHABLE, vc);
                    MPIDI_CHANGE_VC_STATE(vc, MORIBUND);
                    break;

                case MPIDI_VC_STATE_LOCAL_CLOSE:
                    /* This is a premature termination.  This process has
                       started the close protocol, but hasn't received a CLOSE
                       packet from the remote side.  This process may not have
                       been able to send the CLOSE ack=F packet to the remote
                       side.  There may be outstanding sends or receives on
                       the local or remote sides. */
                case MPIDI_VC_STATE_CLOSE_ACKED:
                    /* This is a premature termination.  Both sides have
                       started the close protocol.  This process has received
                       CLOSE ack=F, but not CLOSE ack=T.  This process may not
                       have been able to send CLOSE ack=T.  There should not
                       be any outstanding sends or receives on either side. */
                    MPL_DBG_MSG_D(MPIDI_CH3_DBG_DISCONNECT,TYPICAL,
                                  "Connection closed prematurely during close protocol.  "
                                  "Outstanding close operations = %d",
                                  MPIDI_Outstanding_close_ops);
                    MPIDI_CH3U_Complete_posted_with_error(vc);
                    ++MPIDI_Failed_vc_count;
                    MPIDU_Ftb_publish_vc(MPIDU_FTB_EV_UNREACHABLE, vc);
                    MPIDI_CHANGE_VC_STATE(vc, MORIBUND);

                    /* MT: this is not thread safe */
                    MPIDI_Outstanding_close_ops -= 1;

                    if (MPIDI_Outstanding_close_ops == 0)
                    {
                        MPIDI_CH3_Progress_signal_completion();
                        mpi_errno = MPIDI_CH3_Channel_close();
                        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
                    }
                    break;

                default:
                {
                    MPL_DBG_MSG_D(MPIDI_CH3_DBG_DISCONNECT,TYPICAL,
                                  "Unhandled connection state %d when closing connection", vc->state);
                    mpi_errno = MPIR_Err_create_code(
                        MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__,
                        MPI_ERR_INTERN, "**ch3|unhandled_connection_state",
                        "**ch3|unhandled_connection_state %p %d", vc, vc->state);
                    goto fn_fail;
                    break;
                }
            }

            /* FIXME: Decrement the reference count?  Who increments? */
            /* FIXME: The reference count is often already 0.  But not always */
            /* MPIR_Object_set_ref(vc, 0); ??? */

            /*
             * FIXME: The VC used in connect/accept has a NULL
             * process group
             */
            /* XXX DJG FIXME-MT should we be checking this ref_count? */
            if (vc->pg != NULL && (MPIR_Object_get_ref(vc) == 0))
            {
                /* FIXME: Who increments the reference count that this is
                   decrementing? */
                /* When the reference count for a vc becomes zero, decrement
                   the reference count of the associated process group. */
                /* FIXME: This should be done when the reference count of the
                   vc is first decremented */
                MPIDI_PG_release_ref(vc->pg, &inuse);
                if (inuse == 0) {
                    MPIDI_PG_Destroy(vc->pg);
                }
            }
            break;
        }

        default:
        {
            break;
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_HANDLE_CONNECTION);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}