int MPIDI_CH3_Finalize( void )
{
    int mpi_errno = MPI_SUCCESS;

    MPIDI_DBG_PRINTF((50, FCNAME, "entering"));
    MPIDI_DBG_PRINTF((50, FCNAME, "exiting"));

    MPIDI_STATE_DECL(MPID_STATE_MPID_CH3_FINALIZE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_CH3_FINALIZE);

    mpi_errno = MPIDI_CH3I_Progress_finalize();
    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }

 fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_CH3_FINALIZE);
    return mpi_errno;
}
/* MPIDI_CH3I_SendNoncontig - Sends a message by packing
   directly into cells.  The caller must initialize sreq->dev.segment
   as well as segment_first and segment_size. */
int MPIDI_CH3I_SendNoncontig( MPIDI_VC_t *vc, MPID_Request *sreq, void *header, MPIDI_msg_sz_t hdr_sz )
{
    int mpi_errno = MPI_SUCCESS;
    int again = 0;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_SENDNONCONTIG);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_SENDNONCONTIG);

    MPIDI_DBG_Print_packet((MPIDI_CH3_Pkt_t *)header);

    MPIU_THREAD_CS_ENTER(MPIDCOMM,);

    if (!MPIDI_CH3I_Sendq_empty(MPIDI_CH3I_shm_sendq)) /* MT */
    {
        /* send queue is not empty, enqueue the request then check to
           see if we can send any now */

        MPIDI_DBG_PRINTF((55, FCNAME, "enqueuing"));

	sreq->dev.pending_pkt = *(MPIDI_CH3_Pkt_t *)header;
        sreq->ch.noncontig    = TRUE;
        sreq->ch.header_sz    = hdr_sz;
	sreq->ch.vc           = vc;

        MPIDI_CH3I_Sendq_enqueue(&MPIDI_CH3I_shm_sendq, sreq);
        mpi_errno = MPIDI_CH3I_Shm_send_progress();
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        goto fn_exit;
    }

    /* send as many cells of data as you can */
    MPID_nem_mpich_send_seg_header(sreq->dev.segment_ptr, &sreq->dev.segment_first, sreq->dev.segment_size, header, hdr_sz, vc, &again);
    while(!again && sreq->dev.segment_first < sreq->dev.segment_size)
        MPID_nem_mpich_send_seg(sreq->dev.segment_ptr, &sreq->dev.segment_first, sreq->dev.segment_size, vc, &again);

    if (again)
    {
        /* we didn't finish sending everything */
        sreq->ch.noncontig = TRUE;
        sreq->ch.vc = vc;
        if (sreq->dev.segment_first == 0) /* nothing was sent, save header */
        {
            sreq->dev.pending_pkt = *(MPIDI_CH3_Pkt_t *)header;
            sreq->ch.header_sz    = hdr_sz;
        }
        else
        {
            /* part of message was sent, make this req an active send */
            MPIU_Assert(MPIDI_CH3I_shm_active_send == NULL);
            MPIDI_CH3I_shm_active_send = sreq;
        }
        MPIDI_CH3I_Sendq_enqueue(&MPIDI_CH3I_shm_sendq, sreq);
        goto fn_exit;
    }

    /* finished sending all data, complete the request */
    if (!sreq->dev.OnDataAvail)
    {
        MPIU_Assert(MPIDI_Request_get_type(sreq) != MPIDI_REQUEST_TYPE_GET_RESP);
        MPIDI_CH3U_Request_complete(sreq);
        MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, ".... complete %d bytes", (int) (sreq->dev.segment_size));
    }
    else
    {
        int complete = 0;
        mpi_errno = sreq->dev.OnDataAvail(vc, sreq, &complete);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        MPIU_Assert(complete); /* all data has been sent, we should always complete */

        MPIU_DBG_MSG_D(CH3_CHANNEL, VERBOSE, ".... complete %d bytes", (int) (sreq->dev.segment_size));
    }

 fn_exit:
    MPIU_THREAD_CS_EXIT(MPIDCOMM,);
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_SENDNONCONTIG);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
int MPIDI_CH3_Packetized_send(MPIDI_VC_t * vc, MPID_Request * sreq)
{
    MPIDI_CH3_Pkt_packetized_send_start_t send_start;
    MPIDI_CH3_Pkt_packetized_send_data_t pkt_head;

    vbuf *buf;
    int mpi_errno = MPI_SUCCESS;
    int n_iov;
    int msg_buffered = 0;
    int nb;
    int complete;
    int pkt_len;
    int seqnum;

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_SENDV);
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_SENDV);
    MPIU_DBG_PRINTF(("ch3_isendv\n"));
    MPIDI_DBG_PRINTF((50, FCNAME, "entering"));

    MPIDI_Pkt_init(&send_start, MPIDI_CH3_PKT_PACKETIZED_SEND_START);
    iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_Pkt_packetized_send_start_t);
    iov[0].MPID_IOV_BUF = (void*) &send_start;
    MPIU_Memcpy(&iov[1], sreq->dev.iov, sreq->dev.iov_count * sizeof(MPID_IOV));
    n_iov = 1 + sreq->dev.iov_count;

    GET_SEQ_NUM(sreq->dev.iov[0].MPID_IOV_BUF, seqnum);
    
    if (-1 == seqnum) {
        MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    }
    MPIDI_Pkt_set_seqnum(&send_start, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);
    send_start.origin_head_size = sreq->dev.iov[0].MPID_IOV_LEN;

    Calculate_IOV_len(iov, n_iov, pkt_len);

    mpi_errno =
        MPIDI_CH3I_MRAILI_Eager_send(vc, iov, n_iov, pkt_len, &nb, &buf);
    DEBUG_PRINT("[pkt send] mpierr %d, nb %d\n", mpi_errno, nb);

    if (MPI_SUCCESS != mpi_errno && MPI_MRAIL_MSG_QUEUED != mpi_errno) {
        vc->ch.state = MPIDI_CH3I_VC_STATE_FAILED;
        sreq->status.MPI_ERROR = MPI_ERR_INTERN;
        MPIDI_CH3U_Request_complete(sreq);
        goto fn_exit;
    } else if (MPI_MRAIL_MSG_QUEUED == mpi_errno) {
        msg_buffered = 1;
    }
    nb -= sizeof(MPIDI_CH3_Pkt_packetized_send_start_t);

    MPIDI_Pkt_init(&pkt_head, MPIDI_CH3_PKT_PACKETIZED_SEND_DATA);
    iov[0].MPID_IOV_LEN = sizeof(MPIDI_CH3_Pkt_packetized_send_data_t);
    iov[0].MPID_IOV_BUF = (void*) &pkt_head;

    do {
        while (!MPIDI_CH3I_Request_adjust_iov(sreq, nb)) {
            MPIDI_VC_FAI_send_seqnum(vc, seqnum);
            MPIDI_Pkt_set_seqnum(&pkt_head, seqnum);
            MPIDI_Request_set_seqnum(sreq, seqnum);

            MPIU_Memcpy((void *) &iov[1],
                   &sreq->dev.iov[sreq->dev.iov_offset],
                   (sreq->dev.iov_count -
                    sreq->dev.iov_offset) * sizeof(MPID_IOV));
            n_iov = sreq->dev.iov_count - sreq->dev.iov_offset + 1;

            Calculate_IOV_len(iov, n_iov, pkt_len);

            mpi_errno =
                MPIDI_CH3I_MRAILI_Eager_send(vc, iov, n_iov, pkt_len, &nb,
                        &buf);
            DEBUG_PRINT("[istartmsgv] mpierr %d, nb %d\n", mpi_errno,
                    nb);
            MPIU_Assert(NULL == buf->sreq);

            if (MPI_SUCCESS != mpi_errno
                && MPI_MRAIL_MSG_QUEUED != mpi_errno) {
                vc->ch.state = MPIDI_CH3I_VC_STATE_FAILED;
                sreq->status.MPI_ERROR = MPI_ERR_INTERN;
                MPIDI_CH3U_Request_complete(sreq);
                goto fn_exit;
            } else if (MPI_MRAIL_MSG_QUEUED == mpi_errno) {
                msg_buffered = 1;
            }

            nb -= sizeof(MPIDI_CH3_Pkt_packetized_send_data_t);
        }
        if (sreq->dev.OnDataAvail == MPIDI_CH3_ReqHandler_SendReloadIOV) {
            MPIDI_CH3U_Handle_send_req(vc, sreq, &complete);
            nb = 0;
            complete = 0;
        } else {
            complete = 1;
        }
    } while (!complete);

    if (msg_buffered) {
        mpi_errno = MPI_MRAIL_MSG_QUEUED;
        buf->sreq = (void *) sreq;
    } else {
        MPIDI_CH3U_Handle_send_req(vc, sreq, &complete);
    }

  fn_exit:
    MPIDI_DBG_PRINTF((50, FCNAME, "exiting"));
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_SENDV);
    return mpi_errno;

}
int MPIDI_CH3_iStartMsg(MPIDI_VC_t * vc, void * hdr, MPIDI_msg_sz_t hdr_sz, MPID_Request ** sreq_ptr)
{
    MPID_Request * sreq = NULL;
    int mpi_errno = MPI_SUCCESS;
    int stream_no, ppid;
    MPIDI_CH3_Pkt_t* pkt;

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_ISTARTMSG);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_ISTARTMSG);
    
    MPIDI_DBG_PRINTF((50, FCNAME, "entering"));
#ifdef MPICH_DBG_OUTPUT
    /* --BEGIN ERROR HANDLING-- */
    if (hdr_sz > sizeof(MPIDI_CH3_Pkt_t))
    {
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**arg", 0);
	goto fn_fail;
    }
    /* --END ERROR HANDLING-- */
#endif

    /* The sctp channel uses a fixed length header, the size of which is the maximum of all possible packet headers */
    hdr_sz = sizeof(MPIDI_CH3_Pkt_t);
    MPIDI_DBG_Print_packet((MPIDI_CH3_Pkt_t*)hdr);

    /* find out stream no. */
    pkt = (MPIDI_CH3_Pkt_t*) hdr;
    stream_no = Req_Stream_from_pkt_and_req(pkt, *sreq_ptr);  /*  don't know pkt type here so pass it in */
    ppid = 0;

    if (SEND_CONNECTED(vc, stream_no) == MPIDI_CH3I_VC_STATE_CONNECTED)
    {
	/* Connection already formed.  If send queue is empty attempt to send data, queuing any unsent data. */
	if (!SEND_ACTIVE(vc, stream_no)) /* MT */
	{
	    MPIU_Assert(MPIDI_CH3I_SendQ_empty_x(vc, stream_no));

	    MPIU_Size_t nb;
	    int rc;

	    MPIDI_DBG_PRINTF((55, FCNAME, "send queue empty, attempting to write"));
	    
	    /* MT - need some signalling to lock down our right to use the channel, thus insuring that the progress engine does
               not also try to write */
	    rc = MPIDU_Sctp_write(vc, hdr, hdr_sz, stream_no, ppid, &nb);
	    
	    if (rc == MPI_SUCCESS)
	    {
		MPIDI_DBG_PRINTF((55, FCNAME, "wrote %ld bytes", (unsigned long) nb));
		
		if (nb == hdr_sz)
		{
		    MPIDI_DBG_PRINTF((55, FCNAME, "entire write complete, %d bytes", nb));
		    /* done.  get us out of here as quickly as possible. */
		}
		else
		{
		    MPIDI_DBG_PRINTF((55, FCNAME, "partial write of %d bytes, request enqueued at head", nb));
		    sreq = create_request(hdr, hdr_sz, nb);

		    /* --BEGIN ERROR HANDLING-- */
		    if (sreq == NULL)
		    {
			mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER,
							 "**nomem", 0);
			goto fn_fail;
		    }
		    /* --END ERROR HANDLING-- */
                    
		    /* put in in Global SendQ */
		    MPIDU_Sctp_post_write(vc, sreq, hdr_sz-nb, hdr_sz-nb, NULL, stream_no); 

		    MPIDI_DBG_PRINTF((55, FCNAME, "posting write, vc=0x%p, sreq=0x%08x", vc, sreq->handle));
		    
		    /* --BEGIN ERROR HANDLING-- */
		    if (mpi_errno != MPI_SUCCESS)
		    {
			mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER,
							 "**ch3|sock|postwrite", "ch3|sock|postwrite %p %p %p", /* FIXME change error code */
							 sreq, vc->ch, vc);
			goto fn_fail;
		    }
		    /* --END ERROR HANDLING-- */
		}
	    }
	    /* --BEGIN ERROR HANDLING-- */
	    else
	    {
		MPIDI_DBG_PRINTF((55, FCNAME, "ERROR - MPIDU_Sctp_write failed, rc=%d", rc));
		sreq = MPID_Request_create();
		if (sreq == NULL)
		{
		    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0);
		    goto fn_fail;
		}
		sreq->kind = MPID_REQUEST_SEND;
		sreq->cc = 0;
		sreq->status.MPI_ERROR = MPI_ERR_INTERN;
	    }
	    /* --END ERROR HANDLING-- */
	}
	else
	{
	    MPIDI_DBG_PRINTF((55, FCNAME, "send in progress, request enqueued"));
	    sreq = create_request(hdr, hdr_sz, 0);
	    /* --BEGIN ERROR HANDLING-- */
	    if (sreq == NULL)
	    {
		mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0);
		goto fn_fail;
	    }
	    /* --END ERROR HANDLING-- */
	    MPIDI_CH3I_SendQ_enqueue_x(vc, sreq, stream_no);
	}
    }
    else if (SEND_CONNECTED(vc, stream_no) == MPIDI_CH3I_VC_STATE_UNCONNECTED) /* MT */
    {
	MPIDI_DBG_PRINTF((55, FCNAME, "unconnected.  posting connect and enqueuing request"));
	
	/* queue the data so it can be sent after the connection is formed */
	sreq = create_request(hdr, hdr_sz, 0);
	/* --BEGIN ERROR HANDLING-- */
	if (sreq == NULL)
	{
	    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0);
	    goto fn_fail;
	}
	/* --END ERROR HANDLING-- */
        
	/* Form a new connection, called once per association (i.e. not per stream) */
        if(vc->ch.pkt == NULL)
        {
            mpi_errno = MPIDI_CH3I_VC_post_connect(vc);
            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno != MPI_SUCCESS)
            {
                mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**fail", 0);
                goto fn_fail;
            }
            /* --END ERROR HANDLING-- */
        }
        
	MPIDU_Sctp_stream_init(vc, sreq, stream_no);
    }
    else if (vc->ch.state != MPIDI_CH3I_VC_STATE_FAILED)
    {
	/* Unable to send data at the moment, so queue it for later */
	MPIDI_DBG_PRINTF((55, FCNAME, "forming connection, request enqueued"));
	sreq = create_request(hdr, hdr_sz, 0);
	/* --BEGIN ERROR HANDLING-- */
	if (sreq == NULL)
	{
	    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0);
	    goto fn_fail;
	}
	/* --END ERROR HANDLING-- */
	MPIDU_Sctp_stream_init(vc, sreq, stream_no);
    }
    /* --BEGIN ERROR HANDLING-- */
    else
    {
	/* Connection failed, so allocate a request and return an error. */
	MPIDI_DBG_PRINTF((55, FCNAME, "ERROR - connection failed"));
	sreq = MPID_Request_create();
	if (sreq == NULL)
	{
	    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", 0);
	    goto fn_fail;
	}
	sreq->kind = MPID_REQUEST_SEND;
	sreq->cc = 0;
	/* TODO: Create an appropriate error message */
	sreq->status.MPI_ERROR = MPI_ERR_INTERN;
    }
    /* --END ERROR HANDLING-- */

  fn_exit:
    *sreq_ptr = sreq;
    MPIDI_DBG_PRINTF((50, FCNAME, "exiting"));
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_ISTARTMSG);
    return mpi_errno;
 fn_fail:
    /* --BEGIN ERROR HANDLING-- */    
    goto fn_exit;
    /* --END ERROR HANDLING-- */    
}