Example #1
0
int MPIDI_CH3_RecvRndv( MPIDI_VC_t * vc, MPIR_Request *rreq )
{
    int mpi_errno = MPI_SUCCESS;

    /* A rendezvous request-to-send (RTS) message has arrived.  We need
       to send a CTS message to the remote process. */
    MPIR_Request * cts_req;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_rndv_clr_to_send_t * cts_pkt = &upkt.rndv_clr_to_send;
    
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,
		 "rndv RTS in the request, sending rndv CTS");
    
    MPIDI_Pkt_init(cts_pkt, MPIDI_CH3_PKT_RNDV_CLR_TO_SEND);
    cts_pkt->sender_req_id = rreq->dev.sender_req_id;
    cts_pkt->receiver_req_id = rreq->handle;
    MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
    mpi_errno = MPIDI_CH3_iStartMsg(vc, cts_pkt, sizeof(*cts_pkt), &cts_req);
    MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
    if (mpi_errno != MPI_SUCCESS) {
	MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER, "**ch3|ctspkt");
    }
    if (cts_req != NULL)
    {
	/* FIXME: Ideally we could specify that a req not be returned.  
	   This would avoid our having to decrement the
	   reference count on a req we don't want/need. */
	MPIR_Request_free(cts_req);
    }

 fn_fail:    
    return mpi_errno;
}
int MPIDI_CH3U_Post_data_receive_unexpected(MPIR_Request * rreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_UNEXPECTED);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_UNEXPECTED);

    /* FIXME: to improve performance, allocate temporary buffer from a 
       specialized buffer pool. */
    /* FIXME: to avoid memory exhaustion, integrate buffer pool management
       with flow control */
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"unexpected request allocated");
    
    rreq->dev.tmpbuf = MPL_malloc(rreq->dev.recv_data_sz, MPL_MEM_BUFFER);
    if (!rreq->dev.tmpbuf) {
	MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %d",
			     rreq->dev.recv_data_sz);
    }
    rreq->dev.tmpbuf_sz = rreq->dev.recv_data_sz;
    
    rreq->dev.iov[0].MPL_IOV_BUF = (MPL_IOV_BUF_CAST)rreq->dev.tmpbuf;
    rreq->dev.iov[0].MPL_IOV_LEN = rreq->dev.recv_data_sz;
    rreq->dev.iov_count = 1;
    rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_UnpackUEBufComplete;
    rreq->dev.recv_pending_count = 2;

 fn_fail:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_UNEXPECTED);
    return mpi_errno;
}
Example #3
0
static int do_readv(MPID_Request *rreq, int pipe_fd, MPL_IOV iov[],
                    int *iov_offset, int *iov_count, int *complete)
{
    int mpi_errno = MPI_SUCCESS;
    ssize_t nread;

    nread = readv(pipe_fd, &rreq->dev.iov[rreq->dev.iov_offset], rreq->dev.iov_count);
    MPIR_ERR_CHKANDJUMP2(nread < 0 && errno != EAGAIN, mpi_errno, MPI_ERR_OTHER, "**read",
                         "**readv %d %s", errno, MPIU_Strerror(errno));

    if (nread < 0) {
        if (errno == EAGAIN) goto fn_exit;
        MPIR_ERR_CHKANDJUMP2(errno != EAGAIN, mpi_errno, MPI_ERR_OTHER, "**vmsplice",
                             "**vmsplice %d %s", errno, MPIU_Strerror(errno));
    }

    *complete = adjust_partially_xferred_iov(iov, iov_offset, iov_count, nread);
    if (*complete) {
        /* look for additional data to send and reload IOV if there is more */
        mpi_errno = check_req_complete(rreq->ch.vc, rreq, complete);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        if (*complete) {
            nread = close(pipe_fd);
            MPIR_ERR_CHKANDJUMP(nread < 0, mpi_errno, MPI_ERR_OTHER, "**close");
            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");
        }
    }

fn_fail:
fn_exit:
    return mpi_errno;
}
Example #4
0
int MPIDI_CH3_PktPrint_RndvClrToSend( FILE *fp, MPIDI_CH3_Pkt_t *pkt )
{
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE," type ......... CLR_TO_SEND\n");
    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST," sender_reqid . 0x%08X\n", pkt->rndv_clr_to_send.sender_req_id));
    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST," recvr_reqid .. 0x%08X\n", pkt->rndv_clr_to_send.receiver_req_id));
    return MPI_SUCCESS;
}
Example #5
0
static int do_vmsplice(MPID_Request *sreq, int pipe_fd, MPL_IOV iov[],
                       int *iov_offset, int *iov_count, int *complete)
{
    int mpi_errno = MPI_SUCCESS;
    ssize_t err;

#if 1
    err = vmsplice(pipe_fd, &iov[*iov_offset], *iov_count, SPLICE_F_NONBLOCK);
#else
    err = writev(pipe_fd, &iov[*iov_offset], *iov_count);
#endif

    if (err < 0) {
        if (errno == EAGAIN) goto fn_exit;
        MPIR_ERR_CHKANDJUMP2(errno != EAGAIN, mpi_errno, MPI_ERR_OTHER, "**vmsplice",
                             "**vmsplice %d %s", errno, MPIU_Strerror(errno));
    }

    *complete = adjust_partially_xferred_iov(iov, iov_offset, iov_count, err);
    if (*complete) {
        /* look for additional data to send and reload IOV if there is more */
        mpi_errno = check_req_complete(sreq->ch.vc, sreq, complete);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        if (*complete) {
            err = close(pipe_fd);
            MPIR_ERR_CHKANDJUMP(err < 0, mpi_errno, MPI_ERR_OTHER, "**close");
            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");
        }
    }

fn_fail:
fn_exit:
    return mpi_errno;
}
Example #6
0
int MPIDI_CH3_PktHandler_RndvSend( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, 
				   intptr_t *buflen, MPIR_Request **rreqp )
{
    MPIDI_CH3_Pkt_rndv_send_t * rs_pkt = &pkt->rndv_send;
    int mpi_errno = MPI_SUCCESS;
    int complete;
    char *data_buf;
    intptr_t data_len;
    MPIR_Request *req;
    
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"received rndv send (data) pkt");

    MPIR_Request_get_ptr(rs_pkt->receiver_req_id, req);

    data_len = ((*buflen - sizeof(MPIDI_CH3_Pkt_t) >= req->dev.recv_data_sz)
                ? req->dev.recv_data_sz : *buflen - sizeof(MPIDI_CH3_Pkt_t));
    data_buf = (char *)pkt + sizeof(MPIDI_CH3_Pkt_t);
    
    if (req->dev.recv_data_sz == 0) {
        *buflen = sizeof(MPIDI_CH3_Pkt_t);
        mpi_errno = MPID_Request_complete(req);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
	*rreqp = NULL;
    }
    else {
        mpi_errno = MPIDI_CH3U_Receive_data_found(req, data_buf, &data_len,
                                                  &complete);
	if (mpi_errno != MPI_SUCCESS) {
	    MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER, "**ch3|postrecv",
			     "**ch3|postrecv %s", "MPIDI_CH3_PKT_RNDV_SEND");
	}

        *buflen = sizeof(MPIDI_CH3_Pkt_t) + data_len;

        if (complete) 
        {
            mpi_errno = MPID_Request_complete(req);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
            *rreqp = NULL;
        }
        else
        {
            *rreqp = req;
        }
   }
	
 fn_fail:
    return mpi_errno;
}
Example #7
0
int MPIDI_CH3_PktPrint_EagerSyncSend( FILE *fp, MPIDI_CH3_Pkt_t *pkt )
{
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE," type ......... EAGER_SYNC_SEND\n");
    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST," sender_reqid . 0x%08X\n", pkt->eager_sync_send.sender_req_id));
    MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE," context_id ... %d\n", pkt->eager_sync_send.match.parts.context_id);
    MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE," tag .......... %d\n", pkt->eager_sync_send.match.parts.tag);
    MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE," rank ......... %d\n", pkt->eager_sync_send.match.parts.rank);
    MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE," data_sz ...... %d\n", pkt->eager_sync_send.data_sz);
#ifdef MPID_USE_SEQUENCE_NUMBERS
    MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE," seqnum ....... %d\n", pkt->eager_sync_send.seqnum);
#endif
    return MPI_SUCCESS;
}
Example #8
0
int MPIR_Thread_CS_Finalize(void)
{
    int err;

    MPL_DBG_MSG(MPIR_DBG_INIT, TYPICAL, "Freeing global mutex and private storage");
#if MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__GLOBAL
/* There is a single, global lock, held for the duration of an MPI call */
    MPID_Thread_mutex_destroy(&MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX, &err);
    MPIR_Assert(err == 0);

#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__POBJ
    /* MPICH_THREAD_GRANULARITY__POBJ: There are multiple locks,
     * one for each logical class (e.g., each type of object) */
    MPID_Thread_mutex_destroy(&MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX, &err);
    MPIR_Assert(err == 0);
    MPID_Thread_mutex_destroy(&MPIR_THREAD_POBJ_HANDLE_MUTEX, &err);
    MPIR_Assert(err == 0);
    MPID_Thread_mutex_destroy(&MPIR_THREAD_POBJ_MSGQ_MUTEX, &err);
    MPIR_Assert(err == 0);
    MPID_Thread_mutex_destroy(&MPIR_THREAD_POBJ_COMPLETION_MUTEX, &err);
    MPIR_Assert(err == 0);
    MPID_Thread_mutex_destroy(&MPIR_THREAD_POBJ_CTX_MUTEX, &err);
    MPIR_Assert(err == 0);
    MPID_Thread_mutex_destroy(&MPIR_THREAD_POBJ_PMI_MUTEX, &err);
    MPIR_Assert(err == 0);

#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__VCI
    MPID_Thread_mutex_destroy(&MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX, &err);
    MPIR_Assert(err == 0);
    MPID_Thread_mutex_destroy(&MPIR_THREAD_POBJ_HANDLE_MUTEX, &err);
    MPIR_Assert(err == 0);

#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__LOCKFREE
/* Updates to shared data and access to shared services is handled without
   locks where ever possible. */
#error lock-free not yet implemented

#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__SINGLE
/* No thread support, make all operations a no-op */

#else
#error Unrecognized thread granularity
#endif

    MPID_CS_finalize();

    MPID_THREADPRIV_KEY_DESTROY;

    return MPI_SUCCESS;
}
int MPIDI_CH3U_Receive_data_unexpected(MPIR_Request * rreq, void *buf, intptr_t *buflen, int *complete)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_UNEXPECTED);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_UNEXPECTED);

    /* FIXME: to improve performance, allocate temporary buffer from a 
       specialized buffer pool. */
    /* FIXME: to avoid memory exhaustion, integrate buffer pool management
       with flow control */
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"unexpected request allocated");
    
    rreq->dev.tmpbuf = MPL_malloc(rreq->dev.recv_data_sz, MPL_MEM_BUFFER);
    if (!rreq->dev.tmpbuf) {
	MPIR_ERR_SETANDJUMP1(mpi_errno,MPI_ERR_OTHER,"**nomem","**nomem %d",
			     rreq->dev.recv_data_sz);
    }
    rreq->dev.tmpbuf_sz = rreq->dev.recv_data_sz;
    
    /* if all of the data has already been received, copy it
       now, otherwise build an iov and let the channel copy it */
    if (rreq->dev.recv_data_sz <= *buflen)
    {
        MPIR_Memcpy(rreq->dev.tmpbuf, buf, rreq->dev.recv_data_sz);
        *buflen = rreq->dev.recv_data_sz;
        rreq->dev.recv_pending_count = 1;
        *complete = TRUE;
    }
    else
    {
        rreq->dev.iov[0].MPL_IOV_BUF = (MPL_IOV_BUF_CAST)((char *)rreq->dev.tmpbuf);
        rreq->dev.iov[0].MPL_IOV_LEN = rreq->dev.recv_data_sz;
        rreq->dev.iov_count = 1;
        rreq->dev.recv_pending_count = 2;
        *buflen = 0;
        *complete = FALSE;
    }

    if (MPIDI_Request_get_msg_type(rreq) == MPIDI_REQUEST_EAGER_MSG)
        MPIR_T_PVAR_LEVEL_INC(RECVQ, unexpected_recvq_buffer_size, rreq->dev.tmpbuf_sz);

    rreq->dev.OnDataAvail = MPIDI_CH3_ReqHandler_UnpackUEBufComplete;

 fn_fail:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_UNEXPECTED);
    return mpi_errno;
}
Example #10
0
/* Send a zero-sized message with eager synchronous.  This is a temporary
   routine, as we may want to replace this with a counterpart to the
   Eager Short message */
int MPIDI_CH3_EagerSyncZero(MPIR_Request **sreq_p, int rank, int tag,
			    MPIR_Comm * comm, int context_offset )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_eager_sync_send_t * const es_pkt = &upkt.eager_sync_send;
    MPIDI_VC_t * vc;
    MPIR_Request *sreq = *sreq_p;
    
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"sending zero length message");
    
    /* MT FIXME what are the two operations we are waiting for?  the send and
     * the sync response? */
    MPIR_cc_set(&sreq->cc, 2);
    MPIDI_Request_set_msg_type(sreq, MPIDI_REQUEST_EAGER_MSG);
    sreq->dev.OnDataAvail = 0;
    
    MPIDI_Pkt_init(es_pkt, MPIDI_CH3_PKT_EAGER_SYNC_SEND);
    es_pkt->match.parts.rank = comm->rank;
    es_pkt->match.parts.tag = tag;
    es_pkt->match.parts.context_id = comm->context_id + context_offset;
    es_pkt->sender_req_id = sreq->handle;
    es_pkt->data_sz = 0;
    
    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);
    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(es_pkt, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);
    
    MPL_DBG_MSGPKT(vc,tag,es_pkt->match.parts.context_id,rank,(intptr_t)0,"EagerSync0");
    MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
    mpi_errno = MPIDI_CH3_iSend(vc, sreq, es_pkt, sizeof(*es_pkt));
    MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS)
    {
	MPIR_Request_free(sreq);
	*sreq_p = NULL;
        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|eagermsg");
    }
    /* --END ERROR HANDLING-- */

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Example #11
0
/* These routine handle any thread initialization that my be required */
static int thread_cs_init(void)
{
    int err;

#if MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__GLOBAL
/* There is a single, global lock, held for the duration of an MPI call */
    MPID_Thread_mutex_create(&MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX, &err);
    MPIR_Assert(err == 0);

#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__POBJ
    /* MPICH_THREAD_GRANULARITY__POBJ: Multiple locks */
    MPID_Thread_mutex_create(&MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX, &err);
    MPIR_Assert(err == 0);
    MPID_Thread_mutex_create(&MPIR_THREAD_POBJ_HANDLE_MUTEX, &err);
    MPIR_Assert(err == 0);
    MPID_Thread_mutex_create(&MPIR_THREAD_POBJ_MSGQ_MUTEX, &err);
    MPIR_Assert(err == 0);
    MPID_Thread_mutex_create(&MPIR_THREAD_POBJ_COMPLETION_MUTEX, &err);
    MPIR_Assert(err == 0);
    MPID_Thread_mutex_create(&MPIR_THREAD_POBJ_CTX_MUTEX, &err);
    MPIR_Assert(err == 0);
    MPID_Thread_mutex_create(&MPIR_THREAD_POBJ_PMI_MUTEX, &err);
    MPIR_Assert(err == 0);

#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__VCI
    MPID_Thread_mutex_create(&MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX, &err);
    MPIR_Assert(err == 0);
    MPID_Thread_mutex_create(&MPIR_THREAD_POBJ_HANDLE_MUTEX, &err);
    MPIR_Assert(err == 0);

#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__LOCKFREE
/* Updates to shared data and access to shared services is handled without
   locks where ever possible. */
#error lock-free not yet implemented

#elif MPICH_THREAD_GRANULARITY == MPICH_THREAD_GRANULARITY__SINGLE
/* No thread support, make all operations a no-op */

#else
#error Unrecognized thread granularity
#endif

    MPID_THREADPRIV_KEY_CREATE;

    MPL_DBG_MSG(MPIR_DBG_INIT, TYPICAL, "Created global mutex and private storage");
    return MPI_SUCCESS;
}
Example #12
0
int MPIDI_CH3_Connection_terminate(MPIDI_VC_t * vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *vcch = &vc->ch;

    MPL_DBG_CONNSTATECHANGE(vc,vcch->conn,CONN_STATE_CLOSING);
    vcch->conn->state = CONN_STATE_CLOSING;
    MPL_DBG_MSG(MPIDI_CH3_DBG_DISCONNECT,TYPICAL,"Closing sock (Post_close)");
    mpi_errno = MPIDI_CH3I_Sock_post_close(vcch->sock);
    if (mpi_errno != MPI_SUCCESS) {
	MPIR_ERR_POP(mpi_errno);
    }

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Example #13
0
/* 
 * These routines are called when a receive matches an eager sync send 
 */
int MPIDI_CH3_EagerSyncAck( MPIDI_VC_t *vc, MPIR_Request *rreq )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_eager_sync_ack_t * const esa_pkt = &upkt.eager_sync_ack;
    MPIR_Request * esa_req;
    
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"sending eager sync ack");
    MPIDI_Pkt_init(esa_pkt, MPIDI_CH3_PKT_EAGER_SYNC_ACK);
    esa_pkt->sender_req_id = rreq->dev.sender_req_id;
    MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
    mpi_errno = MPIDI_CH3_iStartMsg(vc, esa_pkt, sizeof(*esa_pkt), &esa_req);
    MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
    if (mpi_errno != MPI_SUCCESS) {
	MPIR_ERR_POP(mpi_errno);
    }
    if (esa_req != NULL)
    {
	MPIR_Request_free(esa_req);
    }
 fn_fail:
    return mpi_errno;
}
Example #14
0
void MPIDI_CH3I_Progress_wakeup(void)
{
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TYPICAL,"progress_wakeup called");
    MPIDI_CH3I_Sock_wakeup(MPIDI_CH3I_sock_set);
}
Example #15
0
int MPIDI_CH3_iSend(MPIDI_VC_t * vc, MPIR_Request * sreq, void * hdr,
		    intptr_t hdr_sz)
{
    int mpi_errno = MPI_SUCCESS;
    int (*reqFn)(MPIDI_VC_t *, MPIR_Request *, int *);
    MPIDI_CH3I_VC *vcch = &vc->ch;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_ISEND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3_ISEND);

    MPIR_Assert( hdr_sz <= sizeof(MPIDI_CH3_Pkt_t) );

    /* The sock channel uses a fixed length header, the size of which is the 
       maximum of all possible packet headers */
    hdr_sz = sizeof(MPIDI_CH3_Pkt_t);
    MPL_DBG_STMT(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
		  MPIDI_DBG_Print_packet((MPIDI_CH3_Pkt_t*)hdr));

    if (vcch->state == MPIDI_CH3I_VC_STATE_CONNECTED) /* MT */
    {
	/* Connection already formed.  If send queue is empty attempt to send 
	   data, queuing any unsent data. */
	if (MPIDI_CH3I_SendQ_empty(vcch)) /* MT */
	{
	    size_t nb;
	    int rc;

	    MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
			 "send queue empty, attempting to write");
	    MPL_DBG_PKT(vcch->conn,hdr,"isend");
	    /* MT: need some signalling to lock down our right to use the 
	       channel, thus insuring that the progress engine does
               also try to write */
	    rc = MPIDI_CH3I_Sock_write(vcch->sock, hdr, hdr_sz, &nb);
	    if (rc == MPI_SUCCESS)
	    {
		MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
			       "wrote %ld bytes", (unsigned long) nb);
		
		if (nb == hdr_sz)
		{
		    MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                     "write complete %" PRIdPTR " bytes, calling OnDataAvail fcn", nb);
		    reqFn = sreq->dev.OnDataAvail;
		    if (!reqFn) {
			MPIR_Assert(MPIDI_Request_get_type(sreq)!=MPIDI_REQUEST_TYPE_GET_RESP);
                        mpi_errno = MPID_Request_complete(sreq);
                        if (mpi_errno != MPI_SUCCESS) {
                            MPIR_ERR_POP(mpi_errno);
                        }
		    }
		    else {
			int complete;
			mpi_errno = reqFn( vc, sreq, &complete );
			if (mpi_errno) MPIR_ERR_POP(mpi_errno);
			if (!complete) {
			    MPIDI_CH3I_SendQ_enqueue_head(vcch, sreq);
			    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
					     (MPL_DBG_FDEST,
                    "posting writev, vc=0x%p, sreq=0x%08x", vc, sreq->handle));
			    vcch->conn->send_active = sreq;
			    mpi_errno = MPIDI_CH3I_Sock_post_writev(
				vcch->conn->sock, sreq->dev.iov, 
				sreq->dev.iov_count, NULL);
			    /* --BEGIN ERROR HANDLING-- */
			    if (mpi_errno != MPI_SUCCESS)
			    {
				mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER,
								 "**ch3|sock|postwrite", "ch3|sock|postwrite %p %p %p",
								 sreq, vcch->conn, vc);
			    }
			    /* --END ERROR HANDLING-- */
			}
		    }
		}
		else
		{
		    MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                     "partial write of %" PRIdPTR " bytes, request enqueued at head", nb);
		    update_request(sreq, hdr, hdr_sz, nb);
		    MPIDI_CH3I_SendQ_enqueue_head(vcch, sreq);
		    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
     (MPL_DBG_FDEST,"posting write, vc=0x%p, sreq=0x%08x", vc, sreq->handle));
		    vcch->conn->send_active = sreq;
		    mpi_errno = MPIDI_CH3I_Sock_post_write(vcch->conn->sock,
					  sreq->dev.iov[0].MPL_IOV_BUF,
				          sreq->dev.iov[0].MPL_IOV_LEN, 
					  sreq->dev.iov[0].MPL_IOV_LEN, NULL);
		    /* --BEGIN ERROR HANDLING-- */
		    if (mpi_errno != MPI_SUCCESS)
		    {
			mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER,
							 "**ch3|sock|postwrite", "ch3|sock|postwrite %p %p %p",
							 sreq, vcch->conn, vc);
		    }
		    /* --END ERROR HANDLING-- */
		}
	    }
	    /* --BEGIN ERROR HANDLING-- */
	    else if (MPIR_ERR_GET_CLASS(rc) == MPIDI_CH3I_SOCK_ERR_NOMEM)
	    {
		MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL,TYPICAL,
			     "MPIDI_CH3I_Sock_write failed, out of memory");
		sreq->status.MPI_ERROR = MPIR_ERR_MEMALLOCFAILED;
	    }
	    else
	    {
		MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL,TYPICAL,
			       "MPIDI_CH3I_Sock_write failed, rc=%d", rc);
		/* Connection just failed. Mark the request complete and 
		   return an error. */
		MPL_DBG_VCCHSTATECHANGE(vc,VC_STATE_FAILED);
		/* FIXME: Shouldn't the vc->state also change? */
		vcch->state = MPIDI_CH3I_VC_STATE_FAILED;
		sreq->status.MPI_ERROR = MPIR_Err_create_code( rc,
			       MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, 
			       MPI_ERR_INTERN, "**ch3|sock|writefailed", 
			       "**ch3|sock|writefailed %d", rc );
		 /* MT -CH3U_Request_complete() performs write barrier */
		MPID_Request_complete(sreq);
		/* Make sure that the caller sees this error */
		mpi_errno = sreq->status.MPI_ERROR;
	    }
	    /* --END ERROR HANDLING-- */
	}
	else
	{
	    MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL,VERBOSE,"send queue not empty, enqueuing");
	    update_request(sreq, hdr, hdr_sz, 0);
	    MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
	}
    }
    else if (vcch->state == MPIDI_CH3I_VC_STATE_CONNECTING) /* MT */
    {
	/* queuing the data so it can be sent later. */
	MPL_DBG_VCUSE(vc,"connecting.  enqueuing request");
	update_request(sreq, hdr, hdr_sz, 0);
	MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
    }
    else if (vcch->state == MPIDI_CH3I_VC_STATE_UNCONNECTED) /* MT */
    {
	/* Form a new connection, queuing the data so it can be sent later. */
	MPL_DBG_VCUSE(vc,"unconnected.  enqueuing request");
	update_request(sreq, hdr, hdr_sz, 0);
	MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
	mpi_errno = MPIDI_CH3I_VC_post_connect(vc);
	if (mpi_errno) {
	    MPIR_ERR_POP(mpi_errno);
	}
    }
    else if (vcch->state != MPIDI_CH3I_VC_STATE_FAILED)
    {
	/* Unable to send data at the moment, so queue it for later */
	MPL_DBG_VCUSE(vc,"still connecting. Enqueuing request");
	update_request(sreq, hdr, hdr_sz, 0);
	MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
    }
    /* --BEGIN ERROR HANDLING-- */
    else
    {
	/* Connection failed.  Mark the request complete and return an error. */
	/* TODO: Create an appropriate error message */
	sreq->status.MPI_ERROR = MPI_ERR_INTERN;
	/* MT - CH3U_Request_complete() performs write barrier */
	MPID_Request_complete(sreq);
    }
    /* --END ERROR HANDLING-- */

 fn_fail:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3_ISEND);
    return mpi_errno;
}
Example #16
0
void MPII_Segment_manipulate(struct MPIR_Segment *segp,
                             MPI_Aint first,
                             MPI_Aint * lastp,
                             int (*contigfn) (MPI_Aint * blocks_p,
                                              MPI_Datatype el_type,
                                              MPI_Aint rel_off,
                                              void *bufp,
                                              void *v_paramp),
                             int (*vectorfn) (MPI_Aint * blocks_p,
                                              MPI_Aint count,
                                              MPI_Aint blklen,
                                              MPI_Aint stride,
                                              MPI_Datatype el_type,
                                              MPI_Aint rel_off,
                                              void *bufp,
                                              void *v_paramp),
                             int (*blkidxfn) (MPI_Aint * blocks_p,
                                              MPI_Aint count,
                                              MPI_Aint blklen,
                                              MPI_Aint * offsetarray,
                                              MPI_Datatype el_type,
                                              MPI_Aint rel_off,
                                              void *bufp,
                                              void *v_paramp),
                             int (*indexfn) (MPI_Aint * blocks_p,
                                             MPI_Aint count,
                                             MPI_Aint * blockarray,
                                             MPI_Aint * offsetarray,
                                             MPI_Datatype el_type,
                                             MPI_Aint rel_off,
                                             void *bufp,
                                             void *v_paramp),
                             MPI_Aint(*sizefn) (MPI_Datatype el_type), void *pieceparams)
{
    /* these four are the "local values": cur_sp, valid_sp, last, stream_off */
    int cur_sp, valid_sp;
    MPI_Aint last, stream_off;

    struct MPII_Dataloop_stackelm *cur_elmp;
    enum { PF_NULL, PF_CONTIG, PF_VECTOR, PF_BLOCKINDEXED, PF_INDEXED } piecefn_type = PF_NULL;

    SEGMENT_LOAD_LOCAL_VALUES;

    if (first == *lastp) {
        /* nothing to do */
        MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                        (MPL_DBG_FDEST,
                         "dloop_segment_manipulate: warning: first == last ("
                         MPI_AINT_FMT_DEC_SPEC ")\n", first));
        return;
    }

    /* first we ensure that stream_off and first are in the same spot */
    if (first != stream_off) {
#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
        MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                        (MPL_DBG_FDEST,
                         "first=" MPI_AINT_FMT_DEC_SPEC "; stream_off="
                         MPI_AINT_FMT_DEC_SPEC "; resetting.\n", first, stream_off));
#endif

        if (first < stream_off) {
            SEGMENT_RESET_VALUES;
            stream_off = 0;
        }

        if (first != stream_off) {
            MPI_Aint tmp_last = first;

            /* use manipulate function with a NULL piecefn to advance
             * stream offset
             */
            MPII_Segment_manipulate(segp, stream_off, &tmp_last, NULL,  /* contig fn */
                                    NULL,       /* vector fn */
                                    NULL,       /* blkidx fn */
                                    NULL,       /* index fn */
                                    sizefn, NULL);

            /* --BEGIN ERROR HANDLING-- */
            /* verify that we're in the right location */
            MPIR_Assert(tmp_last == first);
            /* --END ERROR HANDLING-- */
        }

        SEGMENT_LOAD_LOCAL_VALUES;

#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
        MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                        (MPL_DBG_FDEST,
                         "done repositioning stream_off; first=" MPI_AINT_FMT_DEC_SPEC
                         ", stream_off=" MPI_AINT_FMT_DEC_SPEC ", last="
                         MPI_AINT_FMT_DEC_SPEC "\n", first, stream_off, last));
#endif
    }

    for (;;) {
#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
#if 0
        MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                        (MPL_DBG_FDEST, "looptop; cur_sp=%d, cur_elmp=%x\n", cur_sp,
                         (unsigned) cur_elmp));
#endif
#endif

        if (cur_elmp->loop_p->kind & MPII_DATALOOP_FINAL_MASK) {
            int piecefn_indicated_exit = -1;
            MPI_Aint myblocks, local_el_size, stream_el_size;
            MPI_Datatype el_type;

            /* structs are never finals (leaves) */
            MPIR_Assert((cur_elmp->loop_p->kind & MPII_DATALOOP_KIND_MASK) !=
                        MPII_DATALOOP_KIND_STRUCT);

            /* pop immediately on zero count */
            if (cur_elmp->curcount == 0)
                SEGMENT_POP_AND_MAYBE_EXIT;

            /* size on this system of the int, double, etc. that is
             * the elementary type.
             */
            local_el_size = cur_elmp->loop_p->el_size;
            el_type = cur_elmp->loop_p->el_type;
            stream_el_size = (sizefn) ? sizefn(el_type) : local_el_size;

            /* calculate number of elem. types to work on and function to use.
             * default is to use the contig piecefn (if there is one).
             */
            myblocks = cur_elmp->curblock;
            piecefn_type = (contigfn ? PF_CONTIG : PF_NULL);

            /* check for opportunities to use other piecefns */
            switch (cur_elmp->loop_p->kind & MPII_DATALOOP_KIND_MASK) {
                case MPII_DATALOOP_KIND_CONTIG:
                    break;
                case MPII_DATALOOP_KIND_BLOCKINDEXED:
                    /* only use blkidx piecefn if at start of blkidx type */
                    if (blkidxfn &&
                        cur_elmp->orig_block == cur_elmp->curblock &&
                        cur_elmp->orig_count == cur_elmp->curcount) {
                        /* TODO: RELAX CONSTRAINTS */
                        myblocks = cur_elmp->curblock * cur_elmp->curcount;
                        piecefn_type = PF_BLOCKINDEXED;
                    }
                    break;
                case MPII_DATALOOP_KIND_INDEXED:
                    /* only use index piecefn if at start of the index type.
                     *   count test checks that we're on first block.
                     *   block test checks that we haven't made progress on first block.
                     */
                    if (indexfn &&
                        cur_elmp->orig_count == cur_elmp->curcount &&
                        cur_elmp->curblock == STACKELM_INDEXED_BLOCKSIZE(cur_elmp, 0)) {
                        /* TODO: RELAX CONSTRAINT ON COUNT? */
                        myblocks = cur_elmp->loop_p->loop_params.i_t.total_blocks;
                        piecefn_type = PF_INDEXED;
                    }
                    break;
                case MPII_DATALOOP_KIND_VECTOR:
                    /* only use the vector piecefn if at the start of a
                     * contiguous block.
                     */
                    if (vectorfn && cur_elmp->orig_block == cur_elmp->curblock) {
                        myblocks = cur_elmp->curblock * cur_elmp->curcount;
                        piecefn_type = PF_VECTOR;
                    }
                    break;
                default:
                    /* --BEGIN ERROR HANDLING-- */
                    MPIR_Assert(0);
                    break;
                    /* --END ERROR HANDLING-- */
            }

#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
            MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                            (MPL_DBG_FDEST,
                             "\thit leaf; cur_sp=%d, elmp=%x, piece_sz=" MPI_AINT_FMT_DEC_SPEC
                             "\n", cur_sp, (unsigned) cur_elmp, myblocks * local_el_size));
#endif

            /* enforce the last parameter if necessary by reducing myblocks */
            if (last != MPIR_SEGMENT_IGNORE_LAST &&
                (stream_off + (myblocks * stream_el_size) > last)) {
                myblocks = ((last - stream_off) / stream_el_size);
#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
                MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                                (MPL_DBG_FDEST,
                                 "\tpartial block count=" MPI_AINT_FMT_DEC_SPEC " ("
                                 MPI_AINT_FMT_DEC_SPEC " bytes)\n", myblocks,
                                 myblocks * stream_el_size));
#endif
                if (myblocks == 0) {
                    SEGMENT_SAVE_LOCAL_VALUES;
                    return;
                }
            }

            /* call piecefn to perform data manipulation */
            switch (piecefn_type) {
                case PF_NULL:
                    piecefn_indicated_exit = 0;
#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
                    MPL_DBG_MSG("\tNULL piecefn for this piece\n");
#endif
                    break;
                case PF_CONTIG:
                    MPIR_Assert(myblocks <= cur_elmp->curblock);
                    piecefn_indicated_exit = contigfn(&myblocks, el_type, cur_elmp->curoffset,  /* relative to segp->ptr */
                                                      segp->ptr,        /* start of buffer (from segment) */
                                                      pieceparams);
                    break;
                case PF_VECTOR:
                    piecefn_indicated_exit =
                        vectorfn(&myblocks,
                                 cur_elmp->curcount,
                                 cur_elmp->orig_block,
                                 cur_elmp->loop_p->loop_params.v_t.stride,
                                 el_type, cur_elmp->curoffset, segp->ptr, pieceparams);
                    break;
                case PF_BLOCKINDEXED:
                    piecefn_indicated_exit = blkidxfn(&myblocks, cur_elmp->curcount, cur_elmp->orig_block, cur_elmp->loop_p->loop_params.bi_t.offset_array, el_type, cur_elmp->orig_offset,     /* blkidxfn adds offset */
                                                      segp->ptr, pieceparams);
                    break;
                case PF_INDEXED:
                    piecefn_indicated_exit = indexfn(&myblocks, cur_elmp->curcount, cur_elmp->loop_p->loop_params.i_t.blocksize_array, cur_elmp->loop_p->loop_params.i_t.offset_array, el_type, cur_elmp->orig_offset,  /* indexfn adds offset value */
                                                     segp->ptr, pieceparams);
                    break;
            }

            /* update local values based on piecefn returns (myblocks and
             * piecefn_indicated_exit)
             */
            MPIR_Assert(piecefn_indicated_exit >= 0);
            MPIR_Assert(myblocks >= 0);
            stream_off += myblocks * stream_el_size;

            /* myblocks of 0 or less than cur_elmp->curblock indicates
             * that we should stop processing and return.
             */
            if (myblocks == 0) {
                SEGMENT_SAVE_LOCAL_VALUES;
                return;
            } else if (myblocks < (MPI_Aint) (cur_elmp->curblock)) {
                cur_elmp->curoffset += myblocks * local_el_size;
                cur_elmp->curblock -= myblocks;

                SEGMENT_SAVE_LOCAL_VALUES;
                return;
            } else {    /* myblocks >= cur_elmp->curblock */

                MPI_Aint count_index = 0;

                /* this assumes we're either *just* processing the last parts
                 * of the current block, or we're processing as many blocks as
                 * we like starting at the beginning of one.
                 */

                switch (cur_elmp->loop_p->kind & MPII_DATALOOP_KIND_MASK) {
                    case MPII_DATALOOP_KIND_INDEXED:
                        while (myblocks > 0 && myblocks >= (MPI_Aint) (cur_elmp->curblock)) {
                            myblocks -= (MPI_Aint) (cur_elmp->curblock);
                            cur_elmp->curcount--;
                            MPIR_Assert(cur_elmp->curcount >= 0);

                            count_index = cur_elmp->orig_count - cur_elmp->curcount;
                            cur_elmp->curblock = STACKELM_INDEXED_BLOCKSIZE(cur_elmp, count_index);
                        }

                        if (cur_elmp->curcount == 0) {
                            /* don't bother to fill in values; we're popping anyway */
                            MPIR_Assert(myblocks == 0);
                            SEGMENT_POP_AND_MAYBE_EXIT;
                        } else {
                            cur_elmp->orig_block = cur_elmp->curblock;
                            cur_elmp->curoffset = cur_elmp->orig_offset +
                                STACKELM_INDEXED_OFFSET(cur_elmp, count_index);

                            cur_elmp->curblock -= myblocks;
                            cur_elmp->curoffset += myblocks * local_el_size;
                        }
                        break;
                    case MPII_DATALOOP_KIND_VECTOR:
                        /* this math relies on assertions at top of code block */
                        cur_elmp->curcount -= myblocks / (MPI_Aint) (cur_elmp->curblock);
                        if (cur_elmp->curcount == 0) {
                            MPIR_Assert(myblocks % ((MPI_Aint) (cur_elmp->curblock)) == 0);
                            SEGMENT_POP_AND_MAYBE_EXIT;
                        } else {
                            /* this math relies on assertions at top of code
                             * block
                             */
                            cur_elmp->curblock = cur_elmp->orig_block -
                                (myblocks % (MPI_Aint) (cur_elmp->curblock));
                            /* new offset = original offset +
                             *              stride * whole blocks +
                             *              leftover bytes
                             */
                            cur_elmp->curoffset = cur_elmp->orig_offset +
                                (((MPI_Aint) (cur_elmp->orig_count - cur_elmp->curcount)) *
                                 cur_elmp->loop_p->loop_params.v_t.stride) +
                                (((MPI_Aint) (cur_elmp->orig_block - cur_elmp->curblock)) *
                                 local_el_size);
                        }
                        break;
                    case MPII_DATALOOP_KIND_CONTIG:
                        /* contigs that reach this point have always been
                         * completely processed
                         */
                        MPIR_Assert(myblocks == (MPI_Aint) (cur_elmp->curblock) &&
                                    cur_elmp->curcount == 1);
                        SEGMENT_POP_AND_MAYBE_EXIT;
                        break;
                    case MPII_DATALOOP_KIND_BLOCKINDEXED:
                        while (myblocks > 0 && myblocks >= (MPI_Aint) (cur_elmp->curblock)) {
                            myblocks -= (MPI_Aint) (cur_elmp->curblock);
                            cur_elmp->curcount--;
                            MPIR_Assert(cur_elmp->curcount >= 0);

                            count_index = cur_elmp->orig_count - cur_elmp->curcount;
                            cur_elmp->curblock = cur_elmp->orig_block;
                        }
                        if (cur_elmp->curcount == 0) {
                            /* popping */
                            MPIR_Assert(myblocks == 0);
                            SEGMENT_POP_AND_MAYBE_EXIT;
                        } else {
                            /* cur_elmp->orig_block = cur_elmp->curblock; */
                            cur_elmp->curoffset = cur_elmp->orig_offset +
                                STACKELM_BLOCKINDEXED_OFFSET(cur_elmp, count_index);
                            cur_elmp->curblock -= myblocks;
                            cur_elmp->curoffset += myblocks * local_el_size;
                        }
                        break;
                }
            }

            if (piecefn_indicated_exit) {
                /* piece function indicated that we should quit processing */
                SEGMENT_SAVE_LOCAL_VALUES;
                return;
            }
        } /* end of if leaf */
        else if (cur_elmp->curblock == 0) {
#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
            MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                            (MPL_DBG_FDEST, "\thit end of block; elmp=%x [%d]\n",
                             (unsigned) cur_elmp, cur_sp));
#endif
            cur_elmp->curcount--;

            /* new block.  for indexed and struct reset orig_block.
             * reset curblock for all types
             */
            switch (cur_elmp->loop_p->kind & MPII_DATALOOP_KIND_MASK) {
                case MPII_DATALOOP_KIND_CONTIG:
                case MPII_DATALOOP_KIND_VECTOR:
                case MPII_DATALOOP_KIND_BLOCKINDEXED:
                    break;
                case MPII_DATALOOP_KIND_INDEXED:
                    cur_elmp->orig_block =
                        STACKELM_INDEXED_BLOCKSIZE(cur_elmp,
                                                   cur_elmp->curcount ? cur_elmp->orig_count -
                                                   cur_elmp->curcount : 0);
                    break;
                case MPII_DATALOOP_KIND_STRUCT:
                    cur_elmp->orig_block =
                        STACKELM_STRUCT_BLOCKSIZE(cur_elmp,
                                                  cur_elmp->curcount ? cur_elmp->orig_count -
                                                  cur_elmp->curcount : 0);
                    break;
                default:
                    /* --BEGIN ERROR HANDLING-- */
                    MPIR_Assert(0);
                    break;
                    /* --END ERROR HANDLING-- */
            }
            cur_elmp->curblock = cur_elmp->orig_block;

            if (cur_elmp->curcount == 0) {
#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
                MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                                (MPL_DBG_FDEST, "\talso hit end of count; elmp=%x [%d]\n",
                                 (unsigned) cur_elmp, cur_sp));
#endif
                SEGMENT_POP_AND_MAYBE_EXIT;
            }
        } else {        /* push the stackelm */

            MPII_Dataloop_stackelm *next_elmp;
            MPI_Aint count_index, block_index;

            count_index = cur_elmp->orig_count - cur_elmp->curcount;
            block_index = cur_elmp->orig_block - cur_elmp->curblock;

            /* reload the next stackelm if necessary */
            next_elmp = &(segp->stackelm[cur_sp + 1]);
            if (cur_elmp->may_require_reloading) {
                MPIR_Dataloop *load_dlp = NULL;
                switch (cur_elmp->loop_p->kind & MPII_DATALOOP_KIND_MASK) {
                    case MPII_DATALOOP_KIND_CONTIG:
                    case MPII_DATALOOP_KIND_VECTOR:
                    case MPII_DATALOOP_KIND_BLOCKINDEXED:
                    case MPII_DATALOOP_KIND_INDEXED:
                        load_dlp = cur_elmp->loop_p->loop_params.cm_t.dataloop;
                        break;
                    case MPII_DATALOOP_KIND_STRUCT:
                        load_dlp = STACKELM_STRUCT_DATALOOP(cur_elmp, count_index);
                        break;
                    default:
                        /* --BEGIN ERROR HANDLING-- */
                        MPIR_Assert(0);
                        break;
                        /* --END ERROR HANDLING-- */
                }

#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
                MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                                (MPL_DBG_FDEST, "\tloading dlp=%x, elmp=%x [%d]\n",
                                 (unsigned) load_dlp, (unsigned) next_elmp, cur_sp + 1));
#endif

                MPII_Dataloop_stackelm_load(next_elmp, load_dlp, 1);
            }
#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
            MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                            (MPL_DBG_FDEST, "\tpushing type, elmp=%x [%d], count=%d, block=%d\n",
                             (unsigned) cur_elmp, cur_sp, count_index, block_index));
#endif
            /* set orig_offset and all cur values for new stackelm.
             * this is done in two steps: first set orig_offset based on
             * current stackelm, then set cur values based on new stackelm.
             */
            switch (cur_elmp->loop_p->kind & MPII_DATALOOP_KIND_MASK) {
                case MPII_DATALOOP_KIND_CONTIG:
                    next_elmp->orig_offset = cur_elmp->curoffset +
                        (MPI_Aint) block_index *cur_elmp->loop_p->el_extent;
                    break;
                case MPII_DATALOOP_KIND_VECTOR:
                    /* note: stride is in bytes */
                    next_elmp->orig_offset = cur_elmp->orig_offset +
                        (MPI_Aint) count_index *cur_elmp->loop_p->loop_params.v_t.stride +
                        (MPI_Aint) block_index *cur_elmp->loop_p->el_extent;
                    break;
                case MPII_DATALOOP_KIND_BLOCKINDEXED:
                    next_elmp->orig_offset = cur_elmp->orig_offset +
                        (MPI_Aint) block_index *cur_elmp->loop_p->el_extent +
                        STACKELM_BLOCKINDEXED_OFFSET(cur_elmp, count_index);
                    break;
                case MPII_DATALOOP_KIND_INDEXED:
                    next_elmp->orig_offset = cur_elmp->orig_offset +
                        (MPI_Aint) block_index *cur_elmp->loop_p->el_extent +
                        STACKELM_INDEXED_OFFSET(cur_elmp, count_index);
                    break;
                case MPII_DATALOOP_KIND_STRUCT:
                    next_elmp->orig_offset = cur_elmp->orig_offset +
                        (MPI_Aint) block_index *STACKELM_STRUCT_EL_EXTENT(cur_elmp,
                                                                          count_index) +
                        STACKELM_STRUCT_OFFSET(cur_elmp, count_index);
                    break;
                default:
                    /* --BEGIN ERROR HANDLING-- */
                    MPIR_Assert(0);
                    break;
                    /* --END ERROR HANDLING-- */
            }

#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
            MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                            (MPL_DBG_FDEST,
                             "\tstep 1: next orig_offset = " MPI_AINT_FMT_DEC_SPEC " (0x"
                             MPI_AINT_FMT_HEX_SPEC ")\n", next_elmp->orig_offset,
                             next_elmp->orig_offset));
#endif

            switch (next_elmp->loop_p->kind & MPII_DATALOOP_KIND_MASK) {
                case MPII_DATALOOP_KIND_CONTIG:
                case MPII_DATALOOP_KIND_VECTOR:
                    next_elmp->curcount = next_elmp->orig_count;
                    next_elmp->curblock = next_elmp->orig_block;
                    next_elmp->curoffset = next_elmp->orig_offset;
                    break;
                case MPII_DATALOOP_KIND_BLOCKINDEXED:
                    next_elmp->curcount = next_elmp->orig_count;
                    next_elmp->curblock = next_elmp->orig_block;
                    next_elmp->curoffset = next_elmp->orig_offset +
                        STACKELM_BLOCKINDEXED_OFFSET(next_elmp, 0);
                    break;
                case MPII_DATALOOP_KIND_INDEXED:
                    next_elmp->curcount = next_elmp->orig_count;
                    next_elmp->curblock = STACKELM_INDEXED_BLOCKSIZE(next_elmp, 0);
                    next_elmp->curoffset = next_elmp->orig_offset +
                        STACKELM_INDEXED_OFFSET(next_elmp, 0);
                    break;
                case MPII_DATALOOP_KIND_STRUCT:
                    next_elmp->curcount = next_elmp->orig_count;
                    next_elmp->curblock = STACKELM_STRUCT_BLOCKSIZE(next_elmp, 0);
                    next_elmp->curoffset = next_elmp->orig_offset +
                        STACKELM_STRUCT_OFFSET(next_elmp, 0);
                    break;
                default:
                    /* --BEGIN ERROR HANDLING-- */
                    MPIR_Assert(0);
                    break;
                    /* --END ERROR HANDLING-- */
            }

#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
            MPL_DBG_MSG_FMT(MPIR_DBG_DATATYPE, VERBOSE,
                            (MPL_DBG_FDEST,
                             "\tstep 2: next curoffset = " MPI_AINT_FMT_DEC_SPEC " (0x"
                             MPI_AINT_FMT_HEX_SPEC ")\n", next_elmp->curoffset,
                             next_elmp->curoffset));
#endif

            cur_elmp->curblock--;
            SEGMENT_PUSH;
        }       /* end of else push the stackelm */
    }   /* end of for (;;) */

#ifdef MPII_DATALOOP_DEBUG_MANIPULATE
    MPL_DBG_MSG("hit end of datatype\n");
#endif

    SEGMENT_SAVE_LOCAL_VALUES;
    return;
}
Example #17
0
int MPID_Cancel_send(MPIR_Request * sreq)
{
    MPIDI_VC_t * vc;
    int proto;
    int flag;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_CANCEL_SEND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_CANCEL_SEND);
    
    MPIR_Assert(sreq->kind == MPIR_REQUEST_KIND__SEND);

    MPIDI_Request_cancel_pending(sreq, &flag);
    if (flag)
    {
	goto fn_exit;
    }

    /*
     * FIXME: user requests returned by MPI_Ibsend() have a NULL comm pointer
     * and no pointer to the underlying communication
     * request.  For now, we simply fail to cancel the request.  In the future,
     * we should add a new request kind to indicate that
     * the request is a BSEND.  Then we can properly cancel the request, much 
     * in the way we do persistent requests.
     */
    if (sreq->comm == NULL)
    {
	goto fn_exit;
    }

    MPIDI_Comm_get_vc_set_active(sreq->comm, sreq->dev.match.parts.rank, &vc);

    proto = MPIDI_Request_get_msg_type(sreq);

    if (proto == MPIDI_REQUEST_SELF_MSG)
    {
	MPIR_Request * rreq;
	
	MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,
		     "attempting to cancel message sent to self");
	
	MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_MSGQ_MUTEX);
	rreq = MPIDI_CH3U_Recvq_FDU(sreq->handle, &sreq->dev.match);
	MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_MSGQ_MUTEX);
	if (rreq)
	{
	    MPIR_Assert(rreq->dev.partner_request == sreq);
	    
	    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
             "send-to-self cancellation successful, sreq=0x%08x, rreq=0x%08x",
						sreq->handle, rreq->handle));

            /* Pull the message out of the unexpected queue since it's
             * being cancelled.  The below request release drops one
             * reference.  We explicitly drop a second reference,
             * because the receive request will never be visible to
             * the user. */
            MPIR_Request_free(rreq);
            MPIR_Request_free(rreq);

	    MPIR_STATUS_SET_CANCEL_BIT(sreq->status, TRUE);
            mpi_errno = MPID_Request_complete(sreq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
	}
	else
	{
	    MPIR_STATUS_SET_CANCEL_BIT(sreq->status, FALSE);
	    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
               "send-to-self cancellation failed, sreq=0x%08x, rreq=0x%08x",
						sreq->handle, rreq->handle));
	}
	
	goto fn_exit;
    }

    /* If the message went over a netmod and it provides a cancel_send
       function, call it here. */
#ifdef ENABLE_COMM_OVERRIDES
    if (vc->comm_ops && vc->comm_ops->cancel_send)
    {
        mpi_errno = vc->comm_ops->cancel_send(vc, sreq);
        goto fn_exit;
    }
#endif

    /* Check to see if the send is still in the send queue.  If so, remove it, 
       mark the request and cancelled and complete, and
       release the device's reference to the request object.  
    */
    {
	int cancelled;
	
	if (proto == MPIDI_REQUEST_RNDV_MSG)
	{
	    MPIR_Request * rts_sreq;
	    /* The cancellation of the RTS request needs to be atomic through 
	       the destruction of the RTS request to avoid
               conflict with release of the RTS request if the CTS is received
	       (see handling of a rendezvous CTS packet in
               MPIDI_CH3U_Handle_recv_pkt()).  
	       MPID_Request_fetch_and_clear_rts_sreq() is used to gurantee 
	       that atomicity. */
	    MPIDI_Request_fetch_and_clear_rts_sreq(sreq, &rts_sreq);
	    if (rts_sreq != NULL) 
	    {
		cancelled = FALSE;
		
		/* since we attempted to cancel a RTS request, then we are 
		   responsible for releasing that request */
		MPIR_Request_free(rts_sreq);

		/* --BEGIN ERROR HANDLING-- */
		if (mpi_errno != MPI_SUCCESS)
		{
		    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__, MPI_ERR_OTHER,
						     "**ch3|cancelrndv", 0);
		    goto fn_exit;
		}
		/* --END ERROR HANDLING-- */
		
		if (cancelled)
		{
		    MPIR_STATUS_SET_CANCEL_BIT(sreq->status, TRUE);
		    /* no other thread should be waiting on sreq, so it is 
		       safe to reset ref_count and cc */
                    MPIR_cc_set(&sreq->cc, 0);
                    /* FIXME should be a decr and assert, not a set */
		    MPIR_Object_set_ref(sreq, 1);
		    goto fn_exit;
		}
	    }
	}
	else
	{
	    cancelled = FALSE;
	    if (cancelled)
	    {
		MPIR_STATUS_SET_CANCEL_BIT(sreq->status, TRUE);
		/* no other thread should be waiting on sreq, so it is safe to 
		   reset ref_count and cc */
                MPIR_cc_set(&sreq->cc, 0);
                /* FIXME should be a decr and assert, not a set */
		MPIR_Object_set_ref(sreq, 1);
		goto fn_exit;
	    }
	}
    }

    /* Part or all of the message has already been sent, so we need to send a 
       cancellation request to the receiver in an attempt
       to catch the message before it is matched. */
    {
	int was_incomplete;
	MPIDI_CH3_Pkt_t upkt;
	MPIDI_CH3_Pkt_cancel_send_req_t * const csr_pkt = &upkt.cancel_send_req;
	MPIR_Request * csr_sreq;
	
	MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
              "sending cancel request to %d for 0x%08x", 
	      sreq->dev.match.parts.rank, sreq->handle));
	
	/* The completion counter and reference count are incremented to keep 
	   the request around long enough to receive a
	   response regardless of what the user does (free the request before 
	   waiting, etc.). */
	MPIDI_CH3U_Request_increment_cc(sreq, &was_incomplete);
	if (!was_incomplete)
	{
	    /* The reference count is incremented only if the request was 
	       complete before the increment. */
	    MPIR_Request_add_ref( sreq );
	}

	MPIDI_Pkt_init(csr_pkt, MPIDI_CH3_PKT_CANCEL_SEND_REQ);
	csr_pkt->match.parts.rank = sreq->comm->rank;
	csr_pkt->match.parts.tag = sreq->dev.match.parts.tag;
	csr_pkt->match.parts.context_id = sreq->dev.match.parts.context_id;
	csr_pkt->sender_req_id = sreq->handle;
	
	MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
	mpi_errno = MPIDI_CH3_iStartMsg(vc, csr_pkt, sizeof(*csr_pkt), &csr_sreq);
	MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
	if (mpi_errno != MPI_SUCCESS) {
	    MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|cancelreq");
	}
	if (csr_sreq != NULL)
	{
	    MPIR_Request_free(csr_sreq);
	}
    }
    
    /* FIXME: if send cancellation packets are allowed to arrive out-of-order 
       with respect to send packets, then we need to
       timestamp send and cancel packets to insure that a cancellation request 
       does not bypass the send packet to be cancelled
       and erroneously cancel a previously sent message with the same request 
       handle. */
    /* FIXME: A timestamp is more than is necessary; a message sequence number
       should be adequate. */
 fn_fail:
 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_CANCEL_SEND);
    return mpi_errno;
}
Example #18
0
int MPIDI_CH3U_Post_data_receive_found(MPIR_Request * rreq)
{
    int mpi_errno = MPI_SUCCESS;	
    int dt_contig;
    MPI_Aint dt_true_lb;
    intptr_t userbuf_sz;
    MPIR_Datatype * dt_ptr = NULL;
    intptr_t data_sz;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_FOUND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_FOUND);

    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"posted request found");
	
    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, 
			    dt_contig, userbuf_sz, dt_ptr, dt_true_lb);
		
    if (rreq->dev.recv_data_sz <= userbuf_sz) {
	data_sz = rreq->dev.recv_data_sz;
    }
    else {
	MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
               "receive buffer too small; message truncated, msg_sz=%" PRIdPTR ", userbuf_sz=%"
					    PRIdPTR,
				 rreq->dev.recv_data_sz, userbuf_sz));
	rreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS, 
                     MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TRUNCATE,
		     "**truncate", "**truncate %d %d %d %d", 
		     rreq->status.MPI_SOURCE, rreq->status.MPI_TAG, 
		     rreq->dev.recv_data_sz, userbuf_sz );
	MPIR_STATUS_SET_COUNT(rreq->status, userbuf_sz);
	data_sz = userbuf_sz;
    }

    if (dt_contig && data_sz == rreq->dev.recv_data_sz)
    {
	/* user buffer is contiguous and large enough to store the
	   entire message.  However, we haven't yet *read* the data 
	   (this code describes how to read the data into the destination) */
	MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"IOV loaded for contiguous read");
	rreq->dev.iov[0].MPL_IOV_BUF = 
	    (MPL_IOV_BUF_CAST)((char*)(rreq->dev.user_buf) + dt_true_lb);
	rreq->dev.iov[0].MPL_IOV_LEN = data_sz;
	rreq->dev.iov_count = 1;
	/* FIXME: We want to set the OnDataAvail to the appropriate 
	   function, which depends on whether this is an RMA 
	   request or a pt-to-pt request. */
	rreq->dev.OnDataAvail = 0;
    }
    else {
	/* user buffer is not contiguous or is too small to hold
	   the entire message */
	MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"IOV loaded for non-contiguous read");
	rreq->dev.segment_ptr = MPIR_Segment_alloc( );
        MPIR_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPIR_Segment_alloc");
	MPIR_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, 
			  rreq->dev.datatype, rreq->dev.segment_ptr);
	rreq->dev.segment_first = 0;
	rreq->dev.segment_size = data_sz;
	mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
	if (mpi_errno != MPI_SUCCESS) {
	    MPIR_ERR_SETFATALANDJUMP(mpi_errno,MPI_ERR_OTHER,
				     "**ch3|loadrecviov");
	}
    }

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_POST_DATA_RECEIVE_FOUND);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Example #19
0
int MPIDI_CH3_iStartMsgv(MPIDI_VC_t * vc, MPL_IOV * iov, int n_iov, MPIR_Request ** sreq_ptr)
{
    MPIR_Request *sreq = NULL;
    MPIDI_CH3I_VC *vcch = &vc->ch;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_ISTARTMSGV);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3_ISTARTMSGV);

    MPIR_Assert(n_iov <= MPL_IOV_LIMIT);

    /* The SOCK channel uses a fixed length header, the size of which is the
     * maximum of all possible packet headers */
    iov[0].MPL_IOV_LEN = sizeof(MPIDI_CH3_Pkt_t);
    MPL_DBG_STMT(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                 MPIDI_DBG_Print_packet((MPIDI_CH3_Pkt_t *) iov[0].MPL_IOV_BUF));

    if (vcch->state == MPIDI_CH3I_VC_STATE_CONNECTED) { /* MT */
        /* Connection already formed.  If send queue is empty attempt to send
         * data, queuing any unsent data. */
        if (MPIDI_CH3I_SendQ_empty(vcch)) {     /* MT */
            int rc;
            size_t nb;

            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, "send queue empty, attempting to write");
            MPL_DBG_PKT(vcch->conn, (MPIDI_CH3_Pkt_t *) iov[0].MPL_IOV_BUF, "isend");

            /* MT - need some signalling to lock down our right to use the
             * channel, thus insuring that the progress engine does
             * also try to write */
            rc = MPIDI_CH3I_Sock_writev(vcch->sock, iov, n_iov, &nb);
            if (rc == MPI_SUCCESS) {
                int offset = 0;

                MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                              "wrote %ld bytes", (unsigned long) nb);

                while (offset < n_iov) {
                    if (nb >= (int) iov[offset].MPL_IOV_LEN) {
                        nb -= iov[offset].MPL_IOV_LEN;
                        offset++;
                    } else {
                        MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                                    "partial write, request enqueued at head");
                        sreq = create_request(iov, n_iov, offset, nb);
                        if (sreq == NULL) {
                            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
                        }
                        MPIDI_CH3I_SendQ_enqueue_head(vcch, sreq);
                        MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                                        (MPL_DBG_FDEST, "posting writev, vc=0x%p, sreq=0x%08x", vc,
                                         sreq->handle));
                        vcch->conn->send_active = sreq;
                        mpi_errno =
                            MPIDI_CH3I_Sock_post_writev(vcch->conn->sock, sreq->dev.iov + offset,
                                                        sreq->dev.iov_count - offset, NULL);
                        /* --BEGIN ERROR HANDLING-- */
                        if (mpi_errno != MPI_SUCCESS) {
                            mpi_errno =
                                MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, __func__, __LINE__,
                                                     MPI_ERR_OTHER, "**ch3|sock|postwrite",
                                                     "ch3|sock|postwrite %p %p %p", sreq,
                                                     vcch->conn, vc);
                        }
                        /* --END ERROR HANDLING-- */
                        break;
                    }
                }

                if (offset == n_iov) {
                    MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, "entire write complete");
                }
            }
            /* --BEGIN ERROR HANDLING-- */
            else {
                MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL, TYPICAL,
                              "ERROR - MPIDI_CH3I_Sock_writev failed, rc=%d", rc);
                sreq = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
                if (sreq == NULL) {
                    MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
                }
                MPIR_cc_set(&(sreq->cc), 0);
                sreq->status.MPI_ERROR = MPIR_Err_create_code(rc,
                                                              MPIR_ERR_RECOVERABLE, __func__,
                                                              __LINE__, MPI_ERR_INTERN,
                                                              "**ch3|sock|writefailed",
                                                              "**ch3|sock|writefailed %d", rc);
                /* Make sure that the caller sees this error */
                mpi_errno = sreq->status.MPI_ERROR;
            }
            /* --END ERROR HANDLING-- */
        } else {
            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, "send in progress, request enqueued");
            sreq = create_request(iov, n_iov, 0, 0);
            if (sreq == NULL) {
                MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
            }
            MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
        }
    } else if (vcch->state == MPIDI_CH3I_VC_STATE_CONNECTING) {
        MPL_DBG_VCUSE(vc, "connecting.  enqueuing request");

        /* queue the data so it can be sent after the connection is formed */
        sreq = create_request(iov, n_iov, 0, 0);
        if (sreq == NULL) {
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
    } else if (vcch->state == MPIDI_CH3I_VC_STATE_UNCONNECTED) {
        MPL_DBG_VCUSE(vc, "unconnected.  posting connect and enqueuing request");

        /* queue the data so it can be sent after the connection is formed */
        sreq = create_request(iov, n_iov, 0, 0);
        if (sreq == NULL) {
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);

        /* Form a new connection */
        MPIDI_CH3I_VC_post_connect(vc);
    } else if (vcch->state != MPIDI_CH3I_VC_STATE_FAILED) {
        /* Unable to send data at the moment, so queue it for later */
        MPL_DBG_VCUSE(vc, "forming connection, request enqueued");
        sreq = create_request(iov, n_iov, 0, 0);
        if (sreq == NULL) {
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
    }
    /* --BEGIN ERROR HANDLING-- */
    else {
        /* Connection failed, so allocate a request and return an error. */
        MPL_DBG_VCUSE(vc, "ERROR - connection failed");
        sreq = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
        if (sreq == NULL) {
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIR_cc_set(&(sreq->cc), 0);
        sreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS,
                                                      MPIR_ERR_RECOVERABLE, __func__, __LINE__,
                                                      MPI_ERR_INTERN, "**ch3|sock|connectionfailed",
                                                      0);
        /* Make sure that the caller sees this error */
        mpi_errno = sreq->status.MPI_ERROR;
    }
    /* --END ERROR HANDLING-- */

  fn_fail:
    *sreq_ptr = sreq;
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3_ISTARTMSGV);
    return mpi_errno;
}
Example #20
0
int MPIR_Intercomm_create_impl(MPIR_Comm *local_comm_ptr, int local_leader,
                               MPIR_Comm *peer_comm_ptr, int remote_leader, int tag,
                               MPIR_Comm **new_intercomm_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Context_id_t final_context_id, recvcontext_id;
    int remote_size = 0, *remote_lpids = NULL;
    int comm_info[3];
    int is_low_group = 0;
    int cts_tag;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPIR_COMM_KIND__INTERCOMM_CREATE_IMPL);

    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPIR_COMM_KIND__INTERCOMM_CREATE_IMPL);

    /* Shift tag into the tagged coll space (tag provided by the user 
       is ignored as of MPI 3.0) */
    cts_tag = MPIR_COMM_KIND__INTERCOMM_CREATE_TAG | MPIR_Process.tagged_coll_mask;

    mpi_errno = MPID_Intercomm_exchange_map(local_comm_ptr, local_leader,
                                            peer_comm_ptr, remote_leader,
                                            &remote_size, &remote_lpids,
                                            &is_low_group);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    /*
     * Create the contexts.  Each group will have a context for sending
     * to the other group. All processes must be involved.  Because
     * we know that the local and remote groups are disjoint, this
     * step will complete
     */
    MPL_DBG_MSG_FMT(MPIR_DBG_COMM,VERBOSE, (MPL_DBG_FDEST,"About to get contextid (local_size=%d) on rank %d",
                                    local_comm_ptr->local_size, local_comm_ptr->rank ));
    /* In the multi-threaded case, MPIR_Get_contextid_sparse assumes that the
       calling routine already holds the single criticial section */
    /* TODO: Make sure this is tag-safe */
    mpi_errno = MPIR_Get_contextid_sparse( local_comm_ptr, &recvcontext_id, FALSE );
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    MPIR_Assert(recvcontext_id != 0);
    MPL_DBG_MSG_FMT(MPIR_DBG_COMM,VERBOSE, (MPL_DBG_FDEST,"Got contextid=%d", recvcontext_id));

    /* Leaders can now swap context ids and then broadcast the value
       to the local group of processes */
    if (local_comm_ptr->rank == local_leader) {
        MPIR_Context_id_t remote_context_id;

        mpi_errno = MPIC_Sendrecv( &recvcontext_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, remote_leader, cts_tag,
                                      &remote_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, remote_leader, cts_tag,
                                      peer_comm_ptr, MPI_STATUS_IGNORE, &errflag );
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        final_context_id = remote_context_id;

        /* Now, send all of our local processes the remote_lpids,
           along with the final context id */
        comm_info[0] = final_context_id;
        MPL_DBG_MSG(MPIR_DBG_COMM,VERBOSE,"About to bcast on local_comm");
        mpi_errno = MPID_Bcast( comm_info, 1, MPI_INT, local_leader, local_comm_ptr, &errflag );
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        MPL_DBG_MSG_D(MPIR_DBG_COMM,VERBOSE,"end of bcast on local_comm of size %d",
                       local_comm_ptr->local_size );
    }
    else
    {
        /* we're the other processes */
        MPL_DBG_MSG(MPIR_DBG_COMM,VERBOSE,"About to receive bcast on local_comm");
        mpi_errno = MPID_Bcast( comm_info, 1, MPI_INT, local_leader, local_comm_ptr, &errflag );
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* Extract the context and group sign informatin */
        final_context_id = comm_info[0];
    }

    /* At last, we now have the information that we need to build the
       intercommunicator */

    /* All processes in the local_comm now build the communicator */

    mpi_errno = MPIR_Comm_create( new_intercomm_ptr );
    if (mpi_errno) goto fn_fail;

    (*new_intercomm_ptr)->context_id     = final_context_id;
    (*new_intercomm_ptr)->recvcontext_id = recvcontext_id;
    (*new_intercomm_ptr)->remote_size    = remote_size;
    (*new_intercomm_ptr)->local_size     = local_comm_ptr->local_size;
    (*new_intercomm_ptr)->pof2           = local_comm_ptr->pof2;
    (*new_intercomm_ptr)->rank           = local_comm_ptr->rank;
    (*new_intercomm_ptr)->comm_kind      = MPIR_COMM_KIND__INTERCOMM;
    (*new_intercomm_ptr)->local_comm     = 0;
    (*new_intercomm_ptr)->is_low_group   = is_low_group;

    mpi_errno = MPID_Create_intercomm_from_lpids( *new_intercomm_ptr, remote_size, remote_lpids );
    if (mpi_errno) goto fn_fail;

    MPIR_Comm_map_dup(*new_intercomm_ptr, local_comm_ptr, MPIR_COMM_MAP_DIR__L2L);

    /* Inherit the error handler (if any) */
    MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_COMM_MUTEX(local_comm_ptr));
    (*new_intercomm_ptr)->errhandler = local_comm_ptr->errhandler;
    if (local_comm_ptr->errhandler) {
        MPIR_Errhandler_add_ref( local_comm_ptr->errhandler );
    }
    MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_COMM_MUTEX(local_comm_ptr));

    mpi_errno = MPIR_Comm_commit(*new_intercomm_ptr);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);


 fn_exit:
    if (remote_lpids) {
        MPL_free(remote_lpids);
        remote_lpids = NULL;
    }
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPIR_COMM_KIND__INTERCOMM_CREATE_IMPL);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Example #21
0
int MPIDI_CH3U_Handle_unordered_recv_pkt(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t * pkt,
					 MPIR_Request ** rreqp)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_HANDLE_UNORDERED_RECV_PKT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_HANDLE_UNORDERED_RECV_PKT);

    /* FIXME: This should probably be *rreqp = NULL? */
    rreqp = NULL;
    
    switch(pkt->type)
    {
	case MPIDI_CH3_PKT_EAGER_SEND:
	case MPIDI_CH3_PKT_EAGER_SYNC_SEND:
	case MPIDI_CH3_PKT_READY_SEND:
	case MPIDI_CH3_PKT_RNDV_REQ_TO_SEND:
	{
	    MPIDI_CH3_Pkt_send_t * send_pkt = (MPIDI_CH3_Pkt_send_t *) pkt;
	    MPIDI_CH3_Pkt_send_container_t * pc_cur;
	    MPIDI_CH3_Pkt_send_container_t * pc_last;
	    
	    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,
			 "received (potentially) out-of-order send pkt");
	    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
	          "rank=%d, tag=%d, context=%d seqnum=%d",
		  send_pkt->match.rank, send_pkt->match.tag, 
		  send_pkt->match.context_id, send_pkt->seqnum));
	    MPL_DBG_MSG_FMAT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
              "vc - seqnum_send=%d seqnum_recv=%d reorder_msg_queue=0x%08lx",
	      vc->seqnum_send, vc->seqnum_recv, 
	      (unsigned long) vc->msg_reorder_queue));
	    
	    if (send_pkt->seqnum == vc->seqnum_recv)
	    {
		mpi_errno = MPIDI_CH3U_Handle_ordered_recv_pkt(vc, pkt, rreqp);
		/* --BEGIN ERROR HANDLING-- */
		if (mpi_errno != MPI_SUCCESS)
		{
		    goto fn_exit;
		}
		/* --END ERROR HANDLING-- */
		vc->seqnum_recv++;
		pc_cur = vc->msg_reorder_queue;
		while(pc_cur != NULL && vc->seqnum_recv == pc_cur->pkt.seqnum)
		{
		    pkt = (MPIDI_CH3_Pkt_t *) &pc_cur->pkt;
		    mpi_errno = MPIDI_CH3U_Handle_ordered_recv_pkt(vc, pkt, rreqp);
		    /* --BEGIN ERROR HANDLING-- */
		    if (mpi_errno != MPI_SUCCESS)
		    {
			mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, __func__, __LINE__, MPI_ERR_OTHER,
							 "**ch3|pktordered", 0);
			goto fn_exit;
		    }
		    /* --END ERROR HANDLING-- */
		    vc->seqnum_recv++;
		    pc_last = pc_cur;
		    pc_cur = pc_cur->next;
		    MPIDI_CH3U_Pkt_send_container_free(pc_last);
		}
		vc->msg_reorder_queue = pc_cur;
	    }
	    else
	    {
		MPIDI_CH3_Pkt_send_container_t * pc_new;
	
		/* allocate container and copy packet */
		pc_new = MPIDI_CH3U_Pkt_send_container_alloc();
		/* --BEGIN ERROR HANDLING-- */
		if (pc_new == NULL)
		{
		    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, __func__, __LINE__, MPI_ERR_OTHER,
						     "**ch3|nopktcontainermem", 0);
		    goto fn_exit;
		}
		/* --END ERROR HANDLING-- */
		pc_new->pkt = *send_pkt;

		/* insert packet into reorder queue */
		pc_last = NULL;
		pc_cur = vc->msg_reorder_queue;
		while (pc_cur != NULL)
		{
		    /* the current recv seqnum is subtracted from both the 
		       seqnums prior to comparision so as to remove any wrap
		       around effects. */
		    if (pc_new->pkt.seqnum - vc->seqnum_recv < 
			pc_cur->pkt.seqnum - vc->seqnum_recv)
		    {
			break;
		    }

		    pc_last = pc_cur;
		    pc_cur = pc_cur->next;
		}

		if (pc_last == NULL)
		{
		    pc_new->next = pc_cur;
		    vc->msg_reorder_queue = pc_new;
		}
		else
		{
		    pc_new->next = pc_cur;
		    pc_last->next = pc_new;
		}
	    }

	    break;
	}

	case MPIDI_CH3_PKT_CANCEL_SEND_REQ:
	{
	    /* --BEGIN ERROR HANDLING-- */
	    /* FIXME: processing send cancel requests requires that we be 
	       aware of pkts in the reorder queue */
	    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, __func__, __LINE__, MPI_ERR_OTHER,
					     "**ch3|ooocancelreq", 0);
	    goto fn_exit;
	    break;
	    /* --END ERROR HANDLING-- */
	}
	
	default:
	{
	    mpi_errno = MPIDI_CH3U_Handle_ordered_recv_pkt(vc, pkt, rreqp);
	    break;
	}
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_HANDLE_UNORDERED_RECV_PKT);
    return mpi_errno;
}
Example #22
0
int MPID_Send(const void * buf, MPI_Aint count, MPI_Datatype datatype, int rank,
	      int tag, MPIR_Comm * comm, int context_offset,
	      MPIR_Request ** request)
{
    intptr_t data_sz;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPIR_Datatype* dt_ptr;
    MPIR_Request * sreq = NULL;
    MPIDI_VC_t * vc;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif    
    int eager_threshold = -1;
    int mpi_errno = MPI_SUCCESS;    
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_SEND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_SEND);

    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
                "rank=%d, tag=%d, context=%d", 
		rank, tag, comm->context_id + context_offset));

    /* Check to make sure the communicator hasn't already been revoked */
    if (comm->revoked &&
            MPIR_AGREE_TAG != MPIR_TAG_MASK_ERROR_BITS(tag & ~MPIR_Process.tagged_coll_mask) &&
            MPIR_SHRINK_TAG != MPIR_TAG_MASK_ERROR_BITS(tag & ~MPIR_Process.tagged_coll_mask)) {
        MPIR_ERR_SETANDJUMP(mpi_errno,MPIX_ERR_REVOKED,"**revoked");
    }

    if (rank == comm->rank && comm->comm_kind != MPIR_COMM_KIND__INTERCOMM)
    {
	mpi_errno = MPIDI_Isend_self(buf, count, datatype, rank, tag, comm, 
				     context_offset, MPIDI_REQUEST_TYPE_SEND, 
				     &sreq);

	/* In the single threaded case, sending to yourself will cause 
	   deadlock.  Note that in the runtime-thread case, this check
	   will not be made (long-term FIXME) */
#       ifndef MPICH_IS_THREADED
	{
	    if (sreq != NULL && MPIR_cc_get(sreq->cc) != 0) {
		MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,
				    "**dev|selfsenddeadlock");
	    }
	}
#	endif
	if (mpi_errno != MPI_SUCCESS) { MPIR_ERR_POP(mpi_errno); }
	goto fn_exit;
    }

    if (rank == MPI_PROC_NULL)
    {
	goto fn_exit;
    }

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);
    MPIR_ERR_CHKANDJUMP1(vc->state == MPIDI_VC_STATE_MORIBUND, mpi_errno, MPIX_ERR_PROC_FAILED, "**comm_fail", "**comm_fail %d", rank);

#ifdef ENABLE_COMM_OVERRIDES
    if (vc->comm_ops && vc->comm_ops->send)
    {
	mpi_errno = vc->comm_ops->send( vc, buf, count, datatype, rank, tag, comm, context_offset, &sreq);
	goto fn_exit;
    }
#endif

    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, 
			    dt_true_lb);


    if (data_sz == 0)
    {
	MPIDI_CH3_Pkt_t upkt;
	MPIDI_CH3_Pkt_eager_send_t * const eager_pkt = &upkt.eager_send;

	MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"sending zero length message");
	MPIDI_Pkt_init(eager_pkt, MPIDI_CH3_PKT_EAGER_SEND);
	eager_pkt->match.parts.rank = comm->rank;
	eager_pkt->match.parts.tag = tag;
	eager_pkt->match.parts.context_id = comm->context_id + context_offset;
	eager_pkt->sender_req_id = MPI_REQUEST_NULL;
	eager_pkt->data_sz = 0;
	
	MPIDI_VC_FAI_send_seqnum(vc, seqnum);
	MPIDI_Pkt_set_seqnum(eager_pkt, seqnum);
	
	MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
	mpi_errno = MPIDI_CH3_iStartMsg(vc, eager_pkt, sizeof(*eager_pkt), &sreq);
	MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
	/* --BEGIN ERROR HANDLING-- */
	if (mpi_errno != MPI_SUCCESS)
	{
	    MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|eagermsg");
	}
	/* --END ERROR HANDLING-- */
	if (sreq != NULL)
	{
	    MPIDI_Request_set_seqnum(sreq, seqnum);
	    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);
	    /* sreq->comm = comm;
	      MPIR_Comm_add_ref(comm); -- not necessary for blocking functions */
	}
	
	goto fn_exit;
    }

    MPIDI_CH3_GET_EAGER_THRESHOLD(&eager_threshold, comm, vc);

    /* FIXME: flow control: limit number of outstanding eager messages
       containing data and need to be buffered by the receiver */
#ifdef USE_EAGER_SHORT
    if (dt_contig && data_sz <= MPIDI_EAGER_SHORT_SIZE) {
	mpi_errno = MPIDI_CH3_EagerContigShortSend( &sreq, 
					       MPIDI_CH3_PKT_EAGERSHORT_SEND,
					       (char *)buf + dt_true_lb,
					       data_sz, rank, tag, comm, 
					       context_offset );
    }
    else
#endif

    if (data_sz + sizeof(MPIDI_CH3_Pkt_eager_send_t) <= eager_threshold)
    {
	if (dt_contig)
        {
 	    mpi_errno = MPIDI_CH3_EagerContigSend( &sreq, 
						   MPIDI_CH3_PKT_EAGER_SEND,
						   (char *)buf + dt_true_lb,
						   data_sz, rank, tag, comm, 
						   context_offset );
	}
	else
        {
	    MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
	    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SEND);
	    mpi_errno = MPIDI_CH3_EagerNoncontigSend( &sreq, 
                                                      MPIDI_CH3_PKT_EAGER_SEND,
                                                      buf, count, datatype,
                                                      data_sz, rank, tag, 
                                                      comm, context_offset );
	}
    }
    else
    {
Example #23
0
/* MPIDI_CH3I_SendNoncontig - Sends a message by packing
   directly into cells.  The caller must initialize sreq->dev.segment
   as well as segment_first and segment_size. */
int MPIDI_CH3I_SendNoncontig( MPIDI_VC_t *vc, MPIR_Request *sreq, void *header, intptr_t hdr_sz,
                              MPL_IOV *hdr_iov, int n_hdr_iov)
{
    int mpi_errno = MPI_SUCCESS;
    int again = 0;
    intptr_t orig_segment_first = sreq->dev.segment_first;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_SENDNONCONTIG);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3I_SENDNONCONTIG);

    MPIDI_DBG_Print_packet((MPIDI_CH3_Pkt_t *)header);

    MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    if (n_hdr_iov > 0) {
        /* translate segments to iovs and combine with the extended header iov. */
        mpi_errno = MPIDI_CH3_SendNoncontig_iov(vc, sreq, header, hdr_sz,
                                                hdr_iov, n_hdr_iov);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        goto fn_exit;
    }

    if (!MPIDI_CH3I_Sendq_empty(MPIDI_CH3I_shm_sendq)) /* MT */
    {
        /* send queue is not empty, enqueue the request then check to
           see if we can send any now */

        MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER, TERSE, "enqueuing");

	sreq->dev.pending_pkt = *(MPIDI_CH3_Pkt_t *)header;
        sreq->ch.noncontig    = TRUE;
        sreq->ch.header_sz    = hdr_sz;
	sreq->ch.vc           = vc;

        MPIDI_CH3I_Sendq_enqueue(&MPIDI_CH3I_shm_sendq, sreq);
        mpi_errno = MPIDI_CH3I_Shm_send_progress();
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        goto fn_exit;
    }

    /* send as many cells of data as you can */
    MPID_nem_mpich_send_seg_header(sreq->dev.segment_ptr, &sreq->dev.segment_first, sreq->dev.segment_size,
                                   header, hdr_sz, vc, &again);
    while(!again && sreq->dev.segment_first < sreq->dev.segment_size)
        MPID_nem_mpich_send_seg(sreq->dev.segment_ptr, &sreq->dev.segment_first, sreq->dev.segment_size, vc, &again);

    if (again)
    {
        /* we didn't finish sending everything */
        sreq->ch.noncontig = TRUE;
        sreq->ch.vc = vc;
        if (sreq->dev.segment_first == orig_segment_first) /* nothing was sent, save header */
        {
            sreq->dev.pending_pkt = *(MPIDI_CH3_Pkt_t *)header;
            sreq->ch.header_sz    = hdr_sz;
        }
        else
        {
            /* part of message was sent, make this req an active send */
            MPIR_Assert(MPIDI_CH3I_shm_active_send == NULL);
            MPIDI_CH3I_shm_active_send = sreq;
        }
        MPIDI_CH3I_Sendq_enqueue(&MPIDI_CH3I_shm_sendq, sreq);
        goto fn_exit;
    }

    /* finished sending all data, complete the request */
    if (!sreq->dev.OnDataAvail)
    {
        MPIR_Assert(MPIDI_Request_get_type(sreq) != MPIDI_REQUEST_TYPE_GET_RESP);
        mpi_errno = MPID_Request_complete(sreq);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
        MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete %d bytes", (int) (sreq->dev.segment_size));
    }
    else
    {
        int complete = 0;
        mpi_errno = sreq->dev.OnDataAvail(vc, sreq, &complete);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        MPIR_Assert(complete); /* all data has been sent, we should always complete */

        MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete %d bytes", (int) (sreq->dev.segment_size));
    }

 fn_exit:
    MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3I_SENDNONCONTIG);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Example #24
0
int MPIDI_CH3_PktPrint_Close( FILE *fp, MPIDI_CH3_Pkt_t *pkt )
{
    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE," type ......... MPIDI_CH3_PKT_CLOSE\n");
    MPL_DBG_MSG_S(MPIDI_CH3_DBG_OTHER,TERSE," ack ......... %s\n", pkt->close.ack ? "TRUE" : "FALSE");
    return MPI_SUCCESS;
}
Example #25
0
int MPIR_Get_contextid_sparse_group(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, int tag,
                                    MPIR_Context_id_t * context_id, int ignore_id)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    struct gcn_state st;
    struct gcn_state *tmp;
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPIR_GET_CONTEXTID);

    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPIR_GET_CONTEXTID);

    st.first_iter = 1;
    st.comm_ptr = comm_ptr;
    st.tag = tag;
    st.own_mask = 0;
    st.own_eager_mask = 0;
    /* Group-collective and ignore_id should never be combined */
    MPIR_Assert(!(group_ptr != NULL && ignore_id));

    *context_id = 0;

    MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST,
                                     "Entering; shared state is %d:%d, my ctx id is %d, tag=%d",
                                     mask_in_use, eager_in_use, comm_ptr->context_id, tag));

    while (*context_id == 0) {
        /* We lock only around access to the mask (except in the global locking
         * case).  If another thread is using the mask, we take a mask of zero. */
        MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);

        if (initialize_context_mask) {
            context_id_init();
        }

        if (eager_nelem < 0) {
            /* Ensure that at least one word of deadlock-free context IDs is
             * always set aside for the base protocol */
            MPIR_Assert(MPIR_CVAR_CTXID_EAGER_SIZE >= 0 &&
                        MPIR_CVAR_CTXID_EAGER_SIZE < MPIR_MAX_CONTEXT_MASK - 1);
            eager_nelem = MPIR_CVAR_CTXID_EAGER_SIZE;
        }

        if (ignore_id) {
            /* We are not participating in the resulting communicator, so our
             * context ID space doesn't matter.  Set the mask to "all available". */
            memset(st.local_mask, 0xff, MPIR_MAX_CONTEXT_MASK * sizeof(int));
            st.own_mask = 0;
            /* don't need to touch mask_in_use/lowest_context_id b/c our thread
             * doesn't ever need to "win" the mask */
        }

        /* Deadlock avoidance: Only participate in context id loop when all
         * processes have called this routine.  On the first iteration, use the
         * "eager" allocation protocol.
         */
        else if (st.first_iter) {
            memset(st.local_mask, 0, MPIR_MAX_CONTEXT_MASK * sizeof(int));
            st.own_eager_mask = 0;
            /* Attempt to reserve the eager mask segment */
            if (!eager_in_use && eager_nelem > 0) {
                int i;
                for (i = 0; i < eager_nelem; i++)
                    st.local_mask[i] = context_mask[i];

                eager_in_use = 1;
                st.own_eager_mask = 1;
            }
        }

        else {
            MPIR_Assert(next_gcn != NULL);
            /*If we are here, at least one element must be in the list, at least myself */

            /* only the first element in the list can own the mask. However, maybe the mask is used
             * by another thread, which added another allcoation to the list bevore. So we have to check,
             * if the mask is used and mark, if we own it */
            if (mask_in_use || &st != next_gcn) {
                memset(st.local_mask, 0, MPIR_MAX_CONTEXT_MASK * sizeof(int));
                st.own_mask = 0;
                MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST,
                                                 "Mask is in use, my context_id is %d, owner context id is %d",
                                                 st.comm_ptr->context_id,
                                                 next_gcn->comm_ptr->context_id));
            }
            else {
                int i;
                /* Copy safe mask segment to local_mask */
                for (i = 0; i < eager_nelem; i++)
                    st.local_mask[i] = 0;
                for (i = eager_nelem; i < MPIR_MAX_CONTEXT_MASK; i++)
                    st.local_mask[i] = context_mask[i];

                mask_in_use = 1;
                st.own_mask = 1;
                MPL_DBG_MSG(MPIR_DBG_COMM, VERBOSE, "Copied local_mask");
            }
        }
        MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);

        /* Note: MPIR_MAX_CONTEXT_MASK elements of local_mask are used by the
         * context ID allocation algorithm.  The additional element is ignored
         * by the context ID mask access routines and is used as a flag for
         * detecting context ID exhaustion (explained below). */
        if (st.own_mask || ignore_id)
            st.local_mask[ALL_OWN_MASK_FLAG] = 1;
        else
            st.local_mask[ALL_OWN_MASK_FLAG] = 0;

        /* Now, try to get a context id */
        MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM);
        /* In the global and brief-global cases, note that this routine will
         * release that global lock when it needs to wait.  That will allow
         * other processes to enter the global or brief global critical section.
         */
        if (group_ptr != NULL) {
            int coll_tag = tag | MPIR_Process.tagged_coll_mask; /* Shift tag into the tagged coll space */
            mpi_errno = MPIR_Allreduce_group(MPI_IN_PLACE, st.local_mask, MPIR_MAX_CONTEXT_MASK + 1,
                                             MPI_INT, MPI_BAND, comm_ptr, group_ptr, coll_tag,
                                             &errflag);
        }
        else {
            mpi_errno = MPID_Allreduce(MPI_IN_PLACE, st.local_mask, MPIR_MAX_CONTEXT_MASK + 1,
                                            MPI_INT, MPI_BAND, comm_ptr, &errflag);
        }
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* MT FIXME 2/3 cases don't seem to need the CONTEXTID CS, check and
         * narrow this region */
        MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
        if (ignore_id) {
            /* we don't care what the value was, but make sure that everyone
             * who did care agreed on a value */
            *context_id = locate_context_bit(st.local_mask);
            /* used later in out-of-context ids check and outer while loop condition */
        }
        else if (st.own_eager_mask) {
            /* There is a chance that we've found a context id */
            /* Find_and_allocate_context_id updates the context_mask if it finds a match */
            *context_id = find_and_allocate_context_id(st.local_mask);
            MPL_DBG_MSG_D(MPIR_DBG_COMM, VERBOSE, "Context id is now %hd", *context_id);

            st.own_eager_mask = 0;
            eager_in_use = 0;
            if (*context_id <= 0) {
                /* else we did not find a context id. Give up the mask in case
                 * there is another thread (with a lower input context id)
                 * waiting for it.  We need to ensure that any other threads
                 * have the opportunity to run, hence yielding */
                /* FIXME: Do we need to do an GLOBAL yield here?
                 * When we do a collective operation, we anyway yield
                 * for other others */
                MPID_THREAD_CS_YIELD(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
                MPID_THREAD_CS_YIELD(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
            }
        }
        else if (st.own_mask) {
            /* There is a chance that we've found a context id */
            /* Find_and_allocate_context_id updates the context_mask if it finds a match */
            *context_id = find_and_allocate_context_id(st.local_mask);
            MPL_DBG_MSG_D(MPIR_DBG_COMM, VERBOSE, "Context id is now %hd", *context_id);

            mask_in_use = 0;

            if (*context_id > 0) {
                /* If we found a new context id, we have to remove the element from the list, so the
                 * next allocation can own the mask */
                if (next_gcn == &st) {
                    next_gcn = st.next;
                }
                else {
                    for (tmp = next_gcn; tmp->next != &st; tmp = tmp->next);    /* avoid compiler warnings */
                    tmp->next = st.next;
                }
            }
            else {
                /* else we did not find a context id. Give up the mask in case
                 * there is another thread in the gcn_next_list
                 * waiting for it.  We need to ensure that any other threads
                 * have the opportunity to run, hence yielding */
                /* FIXME: Do we need to do an GLOBAL yield here?
                 * When we do a collective operation, we anyway yield
                 * for other others */
                MPID_THREAD_CS_YIELD(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
                MPID_THREAD_CS_YIELD(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
            }
        }
        else {
            /* As above, force this thread to yield */
            /* FIXME: Do we need to do an GLOBAL yield here?  When we
             * do a collective operation, we anyway yield for other
             * others */
            MPID_THREAD_CS_YIELD(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
            MPID_THREAD_CS_YIELD(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
        }
        MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);

        /* Test for context ID exhaustion: All threads that will participate in
         * the new communicator owned the mask and could not allocate a context
         * ID.  This indicates that either some process has no context IDs
         * available, or that some are available, but the allocation cannot
         * succeed because there is no common context ID. */
        if (*context_id == 0 && st.local_mask[ALL_OWN_MASK_FLAG] == 1) {
            /* --BEGIN ERROR HANDLING-- */
            int nfree = 0;
            int ntotal = 0;
            int minfree;

            if (st.own_mask) {
                MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
                mask_in_use = 0;
                MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
            }

            context_mask_stats(&nfree, &ntotal);
            if (ignore_id)
                minfree = INT_MAX;
            else
                minfree = nfree;

            if (group_ptr != NULL) {
                int coll_tag = tag | MPIR_Process.tagged_coll_mask;     /* Shift tag into the tagged coll space */
                mpi_errno = MPIR_Allreduce_group(MPI_IN_PLACE, &minfree, 1, MPI_INT, MPI_MIN,
                                                 comm_ptr, group_ptr, coll_tag, &errflag);
            }
            else {
                mpi_errno = MPID_Allreduce(MPI_IN_PLACE, &minfree, 1, MPI_INT,
                                                MPI_MIN, comm_ptr, &errflag);
            }

            if (minfree > 0) {
                MPIR_ERR_SETANDJUMP3(mpi_errno, MPI_ERR_OTHER,
                                     "**toomanycommfrag", "**toomanycommfrag %d %d %d",
                                     nfree, ntotal, ignore_id);
            }
            else {
                MPIR_ERR_SETANDJUMP3(mpi_errno, MPI_ERR_OTHER,
                                     "**toomanycomm", "**toomanycomm %d %d %d",
                                     nfree, ntotal, ignore_id);
            }
            /* --END ERROR HANDLING-- */
        }
        if (st.first_iter == 1) {
            st.first_iter = 0;
            /* to avoid deadlocks, the element is not added to the list bevore the first iteration */
            if (!ignore_id && *context_id == 0) {
                MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
                add_gcn_to_list(&st);
                MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
            }
        }
    }

  fn_exit:
    if (ignore_id)
        *context_id = MPIR_INVALID_CONTEXT_ID;
    MPL_DBG_MSG_S(MPIR_DBG_COMM, VERBOSE, "Context mask = %s", context_mask_to_str());
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPIR_GET_CONTEXTID);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    /* Release the masks */
    MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
    if (st.own_mask) {
        mask_in_use = 0;
    }
    /*If in list, remove it */
    if (!st.first_iter && !ignore_id) {
        if (next_gcn == &st) {
            next_gcn = st.next;
        }
        else {
            for (tmp = next_gcn; tmp->next != &st; tmp = tmp->next);
            tmp->next = st.next;
        }
    }
    MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);


    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Example #26
0
int MPIDI_CH3U_Receive_data_found(MPIR_Request *rreq, void *buf, intptr_t *buflen, int *complete)
{
    int dt_contig;
    MPI_Aint dt_true_lb;
    intptr_t userbuf_sz;
    MPIR_Datatype * dt_ptr = NULL;
    intptr_t data_sz;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_FOUND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_FOUND);

    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"posted request found");
	
    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, 
			    dt_contig, userbuf_sz, dt_ptr, dt_true_lb);
		
    if (rreq->dev.recv_data_sz <= userbuf_sz) {
	data_sz = rreq->dev.recv_data_sz;
    }
    else {
	MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
               "receive buffer too small; message truncated, msg_sz=%" PRIdPTR ", userbuf_sz=%"
					    PRIdPTR,
				 rreq->dev.recv_data_sz, userbuf_sz));
	rreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS, 
                     MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_TRUNCATE,
		     "**truncate", "**truncate %d %d %d %d", 
		     rreq->status.MPI_SOURCE, rreq->status.MPI_TAG, 
		     rreq->dev.recv_data_sz, userbuf_sz );
	MPIR_STATUS_SET_COUNT(rreq->status, userbuf_sz);
	data_sz = userbuf_sz;
    }

    if (dt_contig && data_sz == rreq->dev.recv_data_sz)
    {
	/* user buffer is contiguous and large enough to store the
	   entire message.  However, we haven't yet *read* the data 
	   (this code describes how to read the data into the destination) */

        /* if all of the data has already been received, unpack it
           now, otherwise build an iov and let the channel unpack */
        if (*buflen >= data_sz)
        {
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"Copying contiguous data to user buffer");
            /* copy data out of the receive buffer */
            if (rreq->dev.drop_data == FALSE) {
                MPIR_Memcpy((char*)(rreq->dev.user_buf) + dt_true_lb, buf, data_sz);
            }
            *buflen = data_sz;
            *complete = TRUE;
        }
        else
        {
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"IOV loaded for contiguous read");
            
            rreq->dev.iov[0].MPL_IOV_BUF = 
                (MPL_IOV_BUF_CAST)((char*)(rreq->dev.user_buf) + dt_true_lb);
            rreq->dev.iov[0].MPL_IOV_LEN = data_sz;
            rreq->dev.iov_count = 1;
            *buflen = 0;
            *complete = FALSE;
        }
        
        /* Trigger OnFinal when receiving the last segment */
        rreq->dev.OnDataAvail = rreq->dev.OnFinal;
    }
    else {
	/* user buffer is not contiguous or is too small to hold
	   the entire message */
        
	rreq->dev.segment_ptr = MPIR_Segment_alloc( );
        MPIR_ERR_CHKANDJUMP1((rreq->dev.segment_ptr == NULL), mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPIR_Segment_alloc");

 	MPIR_Segment_init(rreq->dev.user_buf, rreq->dev.user_count, 
			  rreq->dev.datatype, rreq->dev.segment_ptr);
	rreq->dev.segment_first = 0;
	rreq->dev.segment_size  = data_sz;

        /* if all of the data has already been received, and the
           message is not truncated, unpack it now, otherwise build an
           iov and let the channel unpack */
        if (data_sz == rreq->dev.recv_data_sz && *buflen >= data_sz)
        {
            intptr_t last;
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"Copying noncontiguous data to user buffer");
            last = data_sz;
            MPIR_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, 
				&last, buf);
            /* --BEGIN ERROR HANDLING-- */
            if (last != data_sz)
            {
                /* If the data can't be unpacked, the we have a
                   mismatch between the datatype and the amount of
                   data received.  Throw away received data. */
                MPIR_ERR_SET(rreq->status.MPI_ERROR, MPI_ERR_TYPE, "**dtypemismatch");
                MPIR_STATUS_SET_COUNT(rreq->status, rreq->dev.segment_first);
                *buflen = data_sz;
                *complete = TRUE;
		/* FIXME: Set OnDataAvail to 0?  If not, why not? */
                goto fn_exit;
            }
            /* --END ERROR HANDLING-- */
            *buflen = data_sz;
            /* Trigger OnFinal when receiving the last segment */
            rreq->dev.OnDataAvail = rreq->dev.OnFinal;
            *complete = TRUE;
        }
        else
        {   
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"IOV loaded for non-contiguous read");

            mpi_errno = MPIDI_CH3U_Request_load_recv_iov(rreq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_SETFATALANDJUMP(mpi_errno,MPI_ERR_OTHER,
                                         "**ch3|loadrecviov");
            }
            *buflen = 0;
            *complete = FALSE;
        }
    }

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_RECEIVE_DATA_FOUND);
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
Example #27
0
int MPIDI_CH3U_Get_failed_group(int last_rank, MPIR_Group **failed_group)
{
    char *c;
    int i, mpi_errno = MPI_SUCCESS, rank;
    UT_array *failed_procs = NULL;
    MPIR_Group *world_group;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_GET_FAILED_GROUP);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_GET_FAILED_GROUP);

    MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER, VERBOSE, "Getting failed group with %d as last acknowledged\n", last_rank);

    if (-1 == last_rank) {
        MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER, VERBOSE, "No failure acknowledged");
        *failed_group = MPIR_Group_empty;
        goto fn_exit;
    }

    if (*MPIDI_failed_procs_string == '\0') {
        MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER, VERBOSE, "Found no failed ranks");
        *failed_group = MPIR_Group_empty;
        goto fn_exit;
    }

    utarray_new(failed_procs, &ut_int_icd);

    /* parse list of failed processes.  This is a comma separated list
       of ranks or ranges of ranks (e.g., "1, 3-5, 11") */
    i = 0;
    c = MPIDI_failed_procs_string;
    while(1) {
        parse_rank(&rank);
        ++i;
        MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER, VERBOSE, "Found failed rank: %d", rank);
        utarray_push_back(failed_procs, &rank);
        MPIDI_last_known_failed = rank;
        MPIR_ERR_CHKINTERNAL(*c != ',' && *c != '\0', mpi_errno, "error parsing failed process list");
        if (*c == '\0' || last_rank == rank)
            break;
        ++c; /* skip ',' */
    }

    /* Create group of failed processes for comm_world.  Failed groups for other
       communicators can be created from this one using group_intersection. */
    mpi_errno = MPIR_Comm_group_impl(MPIR_Process.comm_world, &world_group);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    mpi_errno = MPIR_Group_incl_impl(world_group, i, ut_int_array(failed_procs), failed_group);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    mpi_errno = MPIR_Group_release(world_group);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_GET_FAILED_GROUP);
    if (failed_procs)
        utarray_free(failed_procs);
    return mpi_errno;
fn_oom:
    MPIR_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "utarray");
fn_fail:
    goto fn_exit;
}
Example #28
0
void MPIDI_DBG_Print_packet(MPIDI_CH3_Pkt_t * pkt)
{
    {
        MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE,"MPIDI_CH3_Pkt_t:\n");
        switch (pkt->type) {
        case MPIDI_CH3_PKT_EAGER_SEND:
            MPIDI_CH3_PktPrint_EagerSend(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_READY_SEND:
            MPIDI_CH3_PktPrint_ReadySend(stdout, pkt);
            break;

        case MPIDI_CH3_PKT_EAGER_SYNC_SEND:
            MPIDI_CH3_PktPrint_EagerSyncSend(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_EAGER_SYNC_ACK:
            MPIDI_CH3_PktPrint_EagerSyncAck(stdout, pkt);
            break;

        case MPIDI_CH3_PKT_RNDV_REQ_TO_SEND:
            MPIDI_CH3_PktPrint_RndvReqToSend(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_RNDV_CLR_TO_SEND:
            MPIDI_CH3_PktPrint_RndvClrToSend(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_RNDV_SEND:
            MPIDI_CH3_PktPrint_RndvSend(stdout, pkt);
            break;

        case MPIDI_CH3_PKT_CANCEL_SEND_REQ:
            MPIDI_CH3_PktPrint_CancelSendReq(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_CANCEL_SEND_RESP:
            MPIDI_CH3_PktPrint_CancelSendResp(stdout, pkt);
            break;

            /* FIXME: Move these RMA descriptions into the RMA code files */
        case MPIDI_CH3_PKT_PUT:
            MPIDI_CH3_PktPrint_Put(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_GET:
            MPIDI_CH3_PktPrint_Get(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_GET_RESP:
            MPIDI_CH3_PktPrint_GetResp(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_ACCUMULATE:
            MPIDI_CH3_PktPrint_Accumulate(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_LOCK:
            MPIDI_CH3_PktPrint_Lock(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_ACK:
            MPIDI_CH3_PktPrint_Ack(stdout, pkt);
            break;
        case MPIDI_CH3_PKT_LOCK_ACK:
            MPIDI_CH3_PktPrint_LockAck(stdout, pkt);
            break;
            /*
             * case MPIDI_CH3_PKT_SHARED_LOCK_OPS_DONE:
             * MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE," type ......... MPIDI_CH3_PKT_SHARED_LOCK_OPS_DONE\n");
             * MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST," source ....... 0x%08X\n", pkt->shared_lock_ops_done.source_win_handle));
             * break;
             */
        case MPIDI_CH3_PKT_FLOW_CNTL_UPDATE:
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE," FLOW_CNTRL_UPDATE\n");
            break;

        case MPIDI_CH3_PKT_CLOSE:
            MPIDI_CH3_PktPrint_Close(stdout, pkt);
            break;

        default:
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE," INVALID PACKET\n");
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE," unknown type ... %d\n", pkt->type);
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE,"  type .......... EAGER_SEND\n");
            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST,"   sender_reqid . 0x%08X\n", pkt->eager_send.sender_req_id));
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE,"   context_id ... %d\n", pkt->eager_send.match.parts.context_id);
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE,"   data_sz ...... %d\n", pkt->eager_send.data_sz);
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE,"   tag .......... %d\n", pkt->eager_send.match.parts.tag);
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE,"   rank ......... %d\n", pkt->eager_send.match.parts.rank);
#ifdef MPID_USE_SEQUENCE_NUMBERS
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE,"   seqnum ....... %d\n", pkt->eager_send.seqnum);
#endif
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE,"  type .......... REQ_TO_SEND\n");
            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST,"   sender_reqid . 0x%08X\n", pkt->rndv_req_to_send.sender_req_id));
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE,"   context_id ... %d\n",
                           pkt->rndv_req_to_send.match.parts.context_id);
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE,"   data_sz ...... %d\n", pkt->rndv_req_to_send.data_sz);
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE,"   tag .......... %d\n", pkt->rndv_req_to_send.match.parts.tag);
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE,"   rank ......... %d\n", pkt->rndv_req_to_send.match.parts.rank);
#ifdef MPID_USE_SEQUENCE_NUMBERS
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE,"   seqnum ....... %d\n", pkt->rndv_req_to_send.seqnum);
#endif
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE,"  type .......... CLR_TO_SEND\n");
            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST,"   sender_reqid . 0x%08X\n", pkt->rndv_clr_to_send.sender_req_id));
            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST,"   recvr_reqid .. 0x%08X\n", pkt->rndv_clr_to_send.receiver_req_id));
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE,"  type .......... RNDV_SEND\n");
            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST,"   recvr_reqid .. 0x%08X\n", pkt->rndv_send.receiver_req_id));
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE,"  type .......... CANCEL_SEND\n");
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE,"   context_id ... %d\n",
                           pkt->cancel_send_req.match.parts.context_id);
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE,"   tag .......... %d\n", pkt->cancel_send_req.match.parts.tag);
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE,"   rank ......... %d\n", pkt->cancel_send_req.match.parts.rank);
            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST,"   sender_reqid . 0x%08X\n", pkt->cancel_send_req.sender_req_id));
            MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,TERSE,"  type .......... CANCEL_SEND_RESP\n");
            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TERSE,(MPL_DBG_FDEST,"   sender_reqid . 0x%08X\n", pkt->cancel_send_resp.sender_req_id));
            MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,TERSE,"   ack .......... %d\n", pkt->cancel_send_resp.ack);
            break;
        }
    }
}
Example #29
0
/* XXX DJG FIXME at some point this should poll, much like the newtcp module.
   But then we have that whole pollfd array to manage, which we don't really
   need until this proof-of-concept proves itself. */
int MPID_nem_lmt_vmsplice_progress(void)
{
    int mpi_errno = MPI_SUCCESS;
    struct lmt_vmsplice_node *prev = NULL;
    struct lmt_vmsplice_node *free_me = NULL;
    struct lmt_vmsplice_node *cur = outstanding_head;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_LMT_VMSPLICE_PROGRESS);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_LMT_VMSPLICE_PROGRESS);
    
    while (cur) {
        int complete = 0;

        switch (MPIDI_Request_get_type(cur->req)) {
            case MPIDI_REQUEST_TYPE_RECV:
                mpi_errno = do_readv(cur->req, cur->pipe_fd, cur->req->dev.iov,
                                     &cur->req->dev.iov_offset,
                                     &cur->req->dev.iov_count, &complete);
                /* FIXME: set the error status of the req and complete it, rather than POP */
                if (mpi_errno) MPIR_ERR_POP(mpi_errno);
                break;
            case MPIDI_REQUEST_TYPE_SEND:
                mpi_errno = do_vmsplice(cur->req, cur->pipe_fd, cur->req->dev.iov,
                                        &cur->req->dev.iov_offset,
                                        &cur->req->dev.iov_count, &complete);
                /* FIXME: set the error status of the req and complete it, rather than POP */
                if (mpi_errno) MPIR_ERR_POP(mpi_errno);
                break;
            default:
                MPIR_ERR_INTERNALANDJUMP(mpi_errno, "unexpected request type");
                break;
        }

        if (complete) {
            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, ".... complete");

            /* remove the node from the list */
            if (cur == outstanding_head) {
                outstanding_head = cur->next;
                prev = NULL;
                free_me = cur;
                cur = cur->next;
            }
            else {
                prev->next = cur->next;
                prev = cur;
                free_me = cur;
                cur = cur->next;
            }
            if (free_me) MPL_free(free_me);
            --MPID_nem_local_lmt_pending;
        }

        if (!cur) break; /* we might have made cur NULL above */

        prev = cur;
        cur = cur->next;
    }

fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_LMT_VMSPLICE_PROGRESS);
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
Example #30
0
/*@
  MPIDI_CH3U_Handle_connection - handle connection event

Input Parameters:
+ vc - virtual connection
. event - connection event

  NOTE:
  This routine is used to transition the VC state.

  The only events currently handled are TERMINATED events.  This
  routine should be called (with TERMINATED) whenever a connection is
  terminated whether normally (in MPIDI_CH3_Connection_terminate() ),
  or abnormally.

  FIXME: Currently state transitions resulting from receiving CLOSE
  packets are performed in MPIDI_CH3_PktHandler_Close().  Perhaps that
  should move here.
@*/
int MPIDI_CH3U_Handle_connection(MPIDI_VC_t * vc, MPIDI_VC_Event_t event)
{
    int inuse;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_HANDLE_CONNECTION);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_HANDLE_CONNECTION);

    switch (event)
    {
	case MPIDI_VC_EVENT_TERMINATED:
	{
	    switch (vc->state)
	    {
		case MPIDI_VC_STATE_CLOSED:
                    /* Normal termination. */
                    MPIDI_CHANGE_VC_STATE(vc, INACTIVE);

		    /* MT: this is not thread safe */
		    MPIDI_Outstanding_close_ops -= 1;
		    MPL_DBG_MSG_D(MPIDI_CH3_DBG_DISCONNECT,TYPICAL,
             "outstanding close operations = %d", MPIDI_Outstanding_close_ops);
	    
		    if (MPIDI_Outstanding_close_ops == 0)
		    {
			MPIDI_CH3_Progress_signal_completion();
                        mpi_errno = MPIDI_CH3_Channel_close();
                        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
		    }

		    break;

                case MPIDI_VC_STATE_INACTIVE:
                    /* VC was terminated before it was activated.
                       This can happen if a failed process was
                       detected before the process used the VC. */
                    MPL_DBG_MSG(MPIDI_CH3_DBG_DISCONNECT,TYPICAL, "VC terminated before it was activated.  We probably got a failed"
                                 " process notification.");
                    MPIDI_CH3U_Complete_posted_with_error(vc);
                    ++MPIDI_Failed_vc_count;
                    MPIDI_CHANGE_VC_STATE(vc, MORIBUND);

                    break;

                    
                case MPIDI_VC_STATE_ACTIVE:
                case MPIDI_VC_STATE_REMOTE_CLOSE:
                    /* This is a premature termination.  This process
                       has not started the close protocol.  There may
                       be outstanding sends or receives on the local
                       side, remote side or both. */
                    
                    MPL_DBG_MSG(MPIDI_CH3_DBG_DISCONNECT,TYPICAL, "Connection closed prematurely.");

                    MPIDI_CH3U_Complete_posted_with_error(vc);
                    ++MPIDI_Failed_vc_count;
                    
                    MPIDU_Ftb_publish_vc(MPIDU_FTB_EV_UNREACHABLE, vc);
                    MPIDI_CHANGE_VC_STATE(vc, MORIBUND);

                    break;
                    
                case MPIDI_VC_STATE_LOCAL_CLOSE:
                    /* This is a premature termination.  This process
                       has started the close protocol, but hasn't
                       received a CLOSE packet from the remote side.
                       This process may not have been able to send the
                       CLOSE ack=F packet to the remote side.  There
                       may be outstanding sends or receives on the
                       local or remote sides. */
                case MPIDI_VC_STATE_CLOSE_ACKED:
                    /* This is a premature termination.  Both sides
                       have started the close protocol.  This process
                       has received CLOSE ack=F, but not CLOSE ack=t.
                       This process may not have been able to send
                       CLOSE ack=T.  There should not be any
                       outstanding sends or receives on either
                       side. */

                    MPL_DBG_MSG_D(MPIDI_CH3_DBG_DISCONNECT,TYPICAL, "Connection closed prematurely during close protocol.  "
                                   "Outstanding close operations = %d", MPIDI_Outstanding_close_ops);

                    MPIDI_CH3U_Complete_posted_with_error(vc);
                    ++MPIDI_Failed_vc_count;

                    MPIDU_Ftb_publish_vc(MPIDU_FTB_EV_UNREACHABLE, vc);
                    MPIDI_CHANGE_VC_STATE(vc, MORIBUND);
                    
		    /* MT: this is not thread safe */
		    MPIDI_Outstanding_close_ops -= 1;
	    
		    if (MPIDI_Outstanding_close_ops == 0) {
			MPIDI_CH3_Progress_signal_completion();
                        mpi_errno = MPIDI_CH3_Channel_close();
                        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
		    }
                    
                    break;

		default:
		{
		    MPL_DBG_MSG_D(MPIDI_CH3_DBG_DISCONNECT,TYPICAL, "Unhandled connection state %d when closing connection",vc->state);
		    mpi_errno = MPIR_Err_create_code(
			MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, 
                        MPI_ERR_INTERN, "**ch3|unhandled_connection_state",
			"**ch3|unhandled_connection_state %p %d", vc, vc->state);
                    goto fn_fail;
		    break;
		}
	    }

            /* FIXME: Decrement the reference count?  Who increments? */
            /* FIXME: The reference count is often already 0.  But
               not always */
            /* MPIR_Object_set_ref(vc, 0); ??? */

            /*
             * FIXME: The VC used in connect accept has a NULL 
             * process group
             */
            /* XXX DJG FIXME-MT should we be checking this ref_count? */
            if (vc->pg != NULL && (MPIR_Object_get_ref(vc) == 0))
            {
                /* FIXME: Who increments the reference count that
                   this is decrementing? */
                /* When the reference count for a vc becomes zero, 
                   decrement the reference count
                   of the associated process group.  */
                /* FIXME: This should be done when the reference 
                   count of the vc is first decremented */
                MPIDI_PG_release_ref(vc->pg, &inuse);
                if (inuse == 0) {
                    MPIDI_PG_Destroy(vc->pg);
                }
            }

	    break;
	}
    
	default:
	{
	    break;
	}
    }

fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_HANDLE_CONNECTION);
    return mpi_errno;
fn_fail:
    goto fn_exit;
}