Example #1
0
int MPIR_Ibsend_impl(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
                     MPID_Comm *comm_ptr, MPI_Request *request)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *request_ptr, *new_request_ptr;
    ibsend_req_info *ibinfo=0;

    /* We don't try to send immediately for MPI_Ibsend because we must
       create a request even if we could send the message right away */

    mpi_errno = MPIR_Bsend_isend( buf, count, datatype, dest, tag, comm_ptr,
                                  IBSEND, &request_ptr );
    if (mpi_errno != MPI_SUCCESS) goto fn_fail;

    /* FIXME: use the memory management macros */
    ibinfo = (ibsend_req_info *)MPIU_Malloc( sizeof(ibsend_req_info) );
    if (!ibinfo) MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
    ibinfo->req       = request_ptr;
    ibinfo->cancelled = 0;
    mpi_errno = MPIR_Grequest_start_impl( MPIR_Ibsend_query, MPIR_Ibsend_free,
                                          MPIR_Ibsend_cancel, ibinfo, &new_request_ptr );
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    /* The request is immediately complete because the MPIR_Bsend_isend has
       already moved the data out of the user's buffer */
    MPIR_Request_add_ref( request_ptr );
    /* Request count is now 2 (set to 1 in Grequest_start) */
    MPIR_Grequest_complete_impl(new_request_ptr);
    MPIU_OBJ_PUBLISH_HANDLE(*request, new_request_ptr->handle);

fn_exit:
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
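For context, MPIR_Grequest_start_impl() above takes three callbacks that follow the MPI_Grequest_start signatures. Below is a minimal sketch of what such callbacks can look like for the ibsend case; the bodies are illustrative assumptions, not the actual MPIR_Ibsend_query/free/cancel implementations.

/* Hypothetical generalized-request callbacks for ibsend; ibsend_req_info
   is the struct allocated in MPIR_Ibsend_impl() above. */
static int example_ibsend_query(void *extra_state, MPI_Status *status)
{
    ibsend_req_info *ibinfo = (ibsend_req_info *) extra_state;
    /* Report cancellation state; the data already left the user's buffer,
       so there is nothing else to wait for. */
    MPI_Status_set_cancelled(status, ibinfo->cancelled);
    MPI_Status_set_elements(status, MPI_BYTE, 0);
    return MPI_SUCCESS;
}

static int example_ibsend_free(void *extra_state)
{
    /* Release the info block allocated in MPIR_Ibsend_impl(). */
    MPIU_Free(extra_state);
    return MPI_SUCCESS;
}

static int example_ibsend_cancel(void *extra_state, int complete)
{
    ibsend_req_info *ibinfo = (ibsend_req_info *) extra_state;
    if (!complete)
        ibinfo->cancelled = 1;
    return MPI_SUCCESS;
}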
Example #2
0
int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint32_t rkey,
                                    void *write_to_buf)
{
    int mpi_errno = MPI_SUCCESS;
    int ibcom_errno;
    struct MPIDI_VC *vc = req->ch.vc;
    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);

    ibcom_errno =
        MPID_nem_ib_com_lrecv(vc_ib->sc->fd, (uint64_t) req, raddr, req->ch.lmt_data_sz, rkey,
                              write_to_buf);
    MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_lrecv");
    MPID_nem_ib_ncqe += 1;      /* count the CQE only after the post succeeded */
    //dprintf("start_recv,ncqe=%d\n", MPID_nem_ib_ncqe);
    dprintf("lmt_start_recv_core,MPID_nem_ib_ncqe=%d\n", MPID_nem_ib_ncqe);
    dprintf
        ("lmt_start_recv_core,req=%p,sz=%ld,write_to_buf=%p,lmt_pack_buf=%p,user_buf=%p,raddr=%p,rkey=%08x,tail=%p=%02x\n",
         req, req->ch.lmt_data_sz, write_to_buf, REQ_FIELD(req, lmt_pack_buf), req->dev.user_buf,
         raddr, rkey, (uint8_t *) write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t),
         *((uint8_t *) write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t)));

#ifdef MPID_NEM_IB_LMT_GET_CQE
    MPID_nem_ib_ncqe_to_drain += 1;     /* use CQE instead of polling */
#else
    /* drain_scq and ib_poll are not ordered, so either one can decrement ref_count */
    MPIR_Request_add_ref(req);

    /* register to poll list in ib_poll() */
    /* don't use req->dev.next because it causes unknown problems */
    MPID_nem_ib_lmtq_enqueue(&MPID_nem_ib_lmtq, req);
    dprintf("lmt_start_recv_core,lmtq enqueue\n");
    //volatile uint8_t* tailmagic = (uint8_t*)((void*)req->dev.user_buf + req->ch.lmt_data_sz - sizeof(uint8_t));
    //dprintf("start_recv_core,cur_tail=%02x,lmt_receiver_tail=%02x\n", *tailmagic, REQ_FIELD(req, lmt_receiver_tail));
#endif

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
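The commented-out tailmagic lines above hint at how arrival of the RDMA data is detected: the receiver polls the final byte of the destination buffer as a sentinel. Below is a minimal sketch of that idea, with illustrative names only, not the actual ib netmod protocol.

#include <stdint.h>
#include <stddef.h>

/* Hypothetical tail-byte poll: the receiver pre-fills the last byte of the
   buffer, and the payload is laid out so that its final byte overwrites it
   with a known value once the RDMA write has fully landed. */
static inline int example_tail_arrived(const void *buf, size_t sz, uint8_t sent_tail)
{
    volatile const uint8_t *tail = (const volatile uint8_t *) buf + sz - 1;
    return *tail == sent_tail;  /* nonzero once the last byte has arrived */
}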
Example #3
0
int MPII_Genutil_sched_start(MPII_Genutil_sched_t * sched, MPIR_Comm * comm, MPIR_Request ** req)
{
    int mpi_errno = MPI_SUCCESS;
    int is_complete;
    int made_progress;
    MPIR_Request *reqp;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPII_GENUTIL_SCHED_START);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPII_GENUTIL_SCHED_START);

    /* Create a request */
    reqp = MPIR_Request_create(MPIR_REQUEST_KIND__COLL);
    if (!reqp)
        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
    *req = reqp;
    MPIR_Request_add_ref(reqp);

    /* Make some progress */
    mpi_errno = MPII_Genutil_sched_poke(sched, &is_complete, &made_progress);
    MPIR_ERR_CHECK(mpi_errno);
    if (is_complete) {
        MPID_Request_complete(reqp);
        goto fn_exit;
    }

    /* Enqueue schedule and activate progress hook if not already activated */
    reqp->u.nbc.coll.sched = (void *) sched;
    if (coll_queue.head == NULL)
        MPID_Progress_activate_hook(MPII_Genutil_progress_hook_id);
    DL_APPEND(coll_queue.head, &(reqp->u.nbc.coll));

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPII_GENUTIL_SCHED_START);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
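For context, here is a sketch of the consumer side of coll_queue: a progress hook that pokes each enqueued schedule and completes finished requests, in the spirit of the MPII_Genutil_progress_hook_id referenced above. The element type, the container_of recovery, and the deactivation call are assumptions, not the actual implementation.

/* Hypothetical progress hook draining the queue appended to above. */
static int example_genutil_progress_hook(int *made_progress)
{
    int mpi_errno = MPI_SUCCESS;
    struct MPIR_Coll_req *coll, *coll_tmp;      /* assumed element type */

    *made_progress = 0;
    DL_FOREACH_SAFE(coll_queue.head, coll, coll_tmp) {
        int is_complete = 0, poked = 0;
        mpi_errno = MPII_Genutil_sched_poke((MPII_Genutil_sched_t *) coll->sched,
                                            &is_complete, &poked);
        if (mpi_errno != MPI_SUCCESS)
            return mpi_errno;
        if (poked)
            *made_progress = 1;
        if (is_complete) {
            /* Recover the request embedding this element (assumed layout,
               mirroring reqp->u.nbc.coll above) and complete it. */
            MPIR_Request *reqp = MPL_container_of(coll, MPIR_Request, u.nbc.coll);
            DL_DELETE(coll_queue.head, coll);
            MPID_Request_complete(reqp);
        }
    }
    if (coll_queue.head == NULL)
        MPID_Progress_deactivate_hook(MPII_Genutil_progress_hook_id);
    return mpi_errno;
}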
Example #4
0
int ADD_SUFFIX(MPID_nem_ofi_iprobe_impl)(struct MPIDI_VC *vc,
                             int source,
                             int tag,
                             MPIR_Comm * comm,
                             int context_offset,
                             int *flag, MPI_Status * status, MPIR_Request ** rreq_ptr)
{
    int ret, mpi_errno = MPI_SUCCESS;
    fi_addr_t remote_proc = 0;
    uint64_t match_bits, mask_bits;
    size_t len;
    MPIR_Request rreq_s, *rreq;

    BEGIN_FUNC(FCNAME);
    if (rreq_ptr) {
        MPIDI_CH3I_NM_OFI_RC(MPID_nem_ofi_create_req(&rreq, 1));
        rreq->kind = MPIR_REQUEST_KIND__RECV;

        *rreq_ptr = rreq;
        rreq->comm = comm;
        rreq->dev.match.parts.rank = source;
        rreq->dev.match.parts.tag = tag;
        rreq->dev.match.parts.context_id = comm->context_id;
        MPIR_Comm_add_ref(comm);
    }
    else {
        rreq = &rreq_s;
        rreq->dev.OnDataAvail = NULL;
    }

    REQ_OFI(rreq)->pack_buffer    = NULL;
    REQ_OFI(rreq)->event_callback = ADD_SUFFIX(peek_callback);
    REQ_OFI(rreq)->match_state    = PEEK_INIT;
    OFI_ADDR_INIT(source, vc, remote_proc);
#if API_SET == API_SET_1
    match_bits = init_recvtag(&mask_bits, comm->context_id + context_offset, source, tag);
#elif API_SET == API_SET_2
    match_bits = init_recvtag_2(&mask_bits, comm->context_id + context_offset, tag);
#endif

    /* ------------------------------------------------------------------------- */
    /* fi_recvmsg with FI_PEEK:                                                  */
    /* Initiate a search for a match in the hardware or software queue.          */
    /* The search can complete immediately with -ENOMSG.                         */
    /* If successful, libfabric will enqueue a context entry into the completion */
    /* queue to make the search nonblocking.  This code will poll until the      */
    /* entry is enqueued.                                                        */
    /* ------------------------------------------------------------------------- */
    msg_tagged_t msg;
    uint64_t     msgflags = FI_PEEK;
    msg.msg_iov   = NULL;
    msg.desc      = NULL;
    msg.iov_count = 0;
    msg.addr      = remote_proc;
    msg.tag       = match_bits;
    msg.ignore    = mask_bits;
    msg.context   = (void *) &(REQ_OFI(rreq)->ofi_context);
    msg.data      = 0;
    if (*flag == CLAIM_PEEK)
        msgflags |= FI_CLAIM;
    ret = fi_trecvmsg(gl_data.endpoint, &msg, msgflags);
    if (ret == -ENOMSG) {
        if (rreq_ptr) {
            MPIR_Request_free(rreq);
            *rreq_ptr = NULL;
            *flag = 0;
        }
        MPID_nem_ofi_poll(MPID_NONBLOCKING_POLL);
        goto fn_exit;
    }
    MPIR_ERR_CHKANDJUMP4((ret < 0), mpi_errno, MPI_ERR_OTHER,
                         "**ofi_peek", "**ofi_peek %s %d %s %s",
                         __SHORT_FILE__, __LINE__, FCNAME, fi_strerror(-ret));

    while (PEEK_INIT == REQ_OFI(rreq)->match_state)
        MPID_nem_ofi_poll(MPID_BLOCKING_POLL);

    if (PEEK_NOT_FOUND == REQ_OFI(rreq)->match_state) {
        if (rreq_ptr) {
            MPIR_Request_free(rreq);
            *rreq_ptr = NULL;
            *flag = 0;
        }
        MPID_nem_ofi_poll(MPID_NONBLOCKING_POLL);
        goto fn_exit;
    }

    if (status != MPI_STATUS_IGNORE)
        *status = rreq->status;

    MPIR_Request_add_ref(rreq);
    *flag = 1;
    END_FUNC_RC(FCNAME);
}
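The FI_PEEK path above can be reduced to a small self-contained helper. Here is a sketch of the same nonblocking tagged peek, assuming an already-initialized endpoint ep and a caller-provided context; names other than the libfabric calls are illustrative.

#include <rdma/fi_tagged.h>
#include <rdma/fi_errno.h>

/* Hypothetical minimal peek: returns 0 if no match is queued, 1 if a search
   was initiated (result arrives on the completion queue), or a negative
   libfabric error code. */
static int example_tagged_peek(struct fid_ep *ep, fi_addr_t src,
                               uint64_t tag, uint64_t ignore, void *context)
{
    struct fi_msg_tagged msg = {
        .msg_iov = NULL, .desc = NULL, .iov_count = 0,
        .addr = src, .tag = tag, .ignore = ignore,
        .context = context, .data = 0,
    };
    ssize_t ret = fi_trecvmsg(ep, &msg, FI_PEEK);
    if (ret == -FI_ENOMSG)
        return 0;               /* search completed immediately: no message */
    return (ret < 0) ? (int) ret : 1;
}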
Example #5
0
int MPID_nem_ib_lmt_start_recv_core(struct MPID_Request *req, void *raddr, uint32_t rkey, long len,
                                    void *write_to_buf, uint32_t max_msg_sz, int end)
{
    int mpi_errno = MPI_SUCCESS;
    int ibcom_errno;
    struct MPIDI_VC *vc = req->ch.vc;
    MPID_nem_ib_vc_area *vc_ib = VC_IB(vc);
    int i;
    int divide;
    int posted_num;
    int last;
    uint32_t r_max_msg_sz;      /* responder's max_msg_sz */
    void *write_pos;
    void *addr;
    long data_sz;
    MPIDI_msg_sz_t rest_data_sz;

    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);

    MPID_nem_ib_com_get_info_conn(vc_ib->sc->fd, MPID_NEM_IB_COM_INFOKEY_PATTR_MAX_MSG_SZ,
                                  &r_max_msg_sz, sizeof(uint32_t));

    divide = (max_msg_sz + r_max_msg_sz - 1) / r_max_msg_sz;

    write_pos = write_to_buf;
    posted_num = 0;
    last = MPID_NEM_IB_LMT_PART_OF_SEGMENT;
    rest_data_sz = len;
    addr = raddr;

    for (i = 0; i < divide; i++) {
        if (i == divide - 1) {
            data_sz = max_msg_sz - i * r_max_msg_sz;

            if (end)
                last = MPID_NEM_IB_LMT_LAST_PKT;        /* last part of last segment packet */
            else
                last = MPID_NEM_IB_LMT_SEGMENT_LAST;    /* last part of this segment */

            /* last data may be smaller than initiator's max_msg_sz */
            if (rest_data_sz < max_msg_sz)
                data_sz = rest_data_sz;
        }
        else {
            data_sz = r_max_msg_sz;
        }

        ibcom_errno =
            MPID_nem_ib_com_lrecv(vc_ib->sc->fd, (uint64_t) req, addr, data_sz, rkey,
                                  write_pos, last);
        MPIU_ERR_CHKANDJUMP(ibcom_errno, mpi_errno, MPI_ERR_OTHER, "**MPID_nem_ib_com_lrecv");

        /* update position */
        write_pos = (void *) ((char *) write_pos + data_sz);
        addr = (void *) ((char *) addr + data_sz);

        /* update rest data size */
        rest_data_sz -= data_sz;

        /* count request number */
        posted_num++;
    }

    MPIU_Assert(rest_data_sz == 0);
    MPID_nem_ib_ncqe += posted_num;
    //dprintf("start_recv,ncqe=%d\n", MPID_nem_ib_ncqe);
    dprintf("lmt_start_recv_core,MPID_nem_ib_ncqe=%d\n", MPID_nem_ib_ncqe);
    dprintf
        ("lmt_start_recv_core,req=%p,sz=%ld,write_to_buf=%p,lmt_pack_buf=%p,user_buf=%p,raddr=%p,rkey=%08x,tail=%p=%02x\n",
         req, req->ch.lmt_data_sz, write_to_buf, REQ_FIELD(req, lmt_pack_buf), req->dev.user_buf,
         raddr, rkey, (uint8_t *) write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t),
         *((uint8_t *) write_to_buf + req->ch.lmt_data_sz - sizeof(uint8_t)));
    //fflush(stdout);

#ifdef MPID_NEM_IB_LMT_GET_CQE
    MPID_nem_ib_ncqe_to_drain += posted_num;    /* use CQE instead of polling */
#else
    /* drain_scq and ib_poll are not ordered, so either one can decrement ref_count */
    MPIR_Request_add_ref(req);

    /* register to poll list in ib_poll() */
    /* don't use req->dev.next because it causes unknown problems */
    MPID_nem_ib_lmtq_enqueue(&MPID_nem_ib_lmtq, req);
    dprintf("lmt_start_recv_core,lmtq enqueue\n");
    //volatile uint8_t* tailmagic = (uint8_t*)((void*)req->dev.user_buf + req->ch.lmt_data_sz - sizeof(uint8_t));
    //dprintf("start_recv_core,cur_tail=%02x,lmt_receiver_tail=%02x\n", *tailmagic, REQ_FIELD(req, lmt_receiver_tail));
#endif

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_IB_LMT_START_RECV_CORE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
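The chunking arithmetic above is worth seeing in isolation: the transfer is cut into ceil(max_msg_sz / r_max_msg_sz) pieces, and only the final piece is trimmed to the remaining data. A standalone sketch with assumed sizes follows; the constants are illustrative, not taken from the code above.

#include <stdio.h>

int main(void)
{
    long len = 10000;           /* bytes to transfer (assumed) */
    long max_msg_sz = 10000;    /* initiator's segment size (assumed) */
    long r_max_msg_sz = 4096;   /* responder's max message size (assumed) */

    long divide = (max_msg_sz + r_max_msg_sz - 1) / r_max_msg_sz;   /* = 3 */
    long rest = len, off = 0;
    for (long i = 0; i < divide; i++) {
        long sz = (i == divide - 1) ? max_msg_sz - i * r_max_msg_sz
                                    : r_max_msg_sz;
        if (i == divide - 1 && rest < max_msg_sz)
            sz = rest;          /* last chunk may be short */
        printf("chunk %ld: offset=%ld size=%ld\n", i, off, sz);
        off += sz;
        rest -= sz;
    }
    /* rest is now 0, matching the MPIU_Assert(rest_data_sz == 0) above */
    return 0;
}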
Example #6
0
int MPID_Put_generic(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
		     int target_rank, MPI_Aint target_disp, int target_count,
		     MPI_Datatype target_datatype, MPIR_Win *win_ptr, MPIR_Request **request)
{
	int mpi_error = MPI_SUCCESS;
	MPID_PSP_Datatype_info dt_info;
	MPID_PSP_packed_msg_t msg;
	MPID_Win_rank_info *ri = win_ptr->rank_info + target_rank;
	char *target_buf;
#if 0
	fprintf(stderr, "int MPID_Put(origin_addr: %p, origin_count: %d, origin_datatype: %08x,"
		" target_rank: %d, target_disp: %d, target_count: %d, target_datatype: %08x,"
		" *win_ptr: %p)\n",
		origin_addr, origin_count, origin_datatype,
		target_rank, target_disp, target_count,
		target_datatype, win_ptr);
#endif
	/* Datatype */
	MPID_PSP_Datatype_get_info(target_datatype, &dt_info);

	if(request) {
		*request = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
		(*request)->comm = win_ptr->comm_ptr;
		MPIR_Comm_add_ref(win_ptr->comm_ptr);
	}

	if (unlikely(target_rank == MPI_PROC_NULL)) {
		goto fn_completed;
	}


	/* Request-based RMA operations are only valid within a passive target epoch! */
	if(request && win_ptr->epoch_state != MPID_PSP_EPOCH_LOCK && win_ptr->epoch_state != MPID_PSP_EPOCH_LOCK_ALL) {
		mpi_error = MPI_ERR_RMA_SYNC;
		goto err_sync_rma;
	}

	/* Check that we are within an access/exposure epoch: */
	if (win_ptr->epoch_state == MPID_PSP_EPOCH_NONE) {
		mpi_error = MPI_ERR_RMA_SYNC;
		goto err_sync_rma;
	}

	/* Track access epoch state: */
	if (win_ptr->epoch_state == MPID_PSP_EPOCH_FENCE_ISSUED) {
		win_ptr->epoch_state = MPID_PSP_EPOCH_FENCE;
	}


	/* If the put is a local operation, do it here */
	if (target_rank == win_ptr->rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {
		void *base;
		int disp_unit;

		if (win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED) {

			MPID_PSP_shm_rma_get_base(win_ptr, target_rank, &disp_unit, &base);
		}
		else {
			base = win_ptr->base;
			disp_unit = win_ptr->disp_unit;
		}

		mpi_error = MPIR_Localcopy(origin_addr, origin_count, origin_datatype,
				     (char *) base + disp_unit * target_disp,
				     target_count, target_datatype);

		if (mpi_error) {
			goto err_local_copy;
		}

		goto fn_completed;
	}

	/* Data */
	mpi_error = MPID_PSP_packed_msg_prepare(origin_addr, origin_count, origin_datatype, &msg);
	if (unlikely(mpi_error != MPI_SUCCESS)) goto err_create_packed_msg;

	MPID_PSP_packed_msg_pack(origin_addr, origin_count, origin_datatype, &msg);

	target_buf = (char *) ri->base_addr + ri->disp_unit * target_disp;


	if (0 && MPID_PSP_Datatype_is_contig(&dt_info)) { /* ToDo: re-enable pscom built-in rma_write */
		/* Contiguous message. Use pscom built-in rma */
		pscom_request_t *req = pscom_request_create(0, 0);

		req->data_len = msg.msg_sz;
		req->data = msg.msg;
		req->connection = ri->con;

		/* ToDo: need a new io_done that calls MPID_PSP_packed_msg_cleanup(msg)
		   before freeing the request! */
		req->ops.io_done = pscom_request_free;
		req->xheader.rma_write.dest = target_buf;

		pscom_post_rma_write(req);

		/* win_ptr->rma_puts_accs[target_rank]++;  ToDo: how to receive this? */
	} else {
		unsigned int	encode_dt_size	= MPID_PSP_Datatype_get_size(&dt_info);
		unsigned int	xheader_len	= sizeof(MPID_PSCOM_XHeader_Rma_put_t) + encode_dt_size;
		pscom_request_t *req = pscom_request_create(xheader_len, sizeof(pscom_request_put_send_t));
		MPID_PSCOM_XHeader_Rma_put_t *xheader = &req->xheader.user.put;

		/* encoded datatype too large for xheader? */
		assert(xheader_len < (1<<(8*sizeof(((struct PSCOM_header_net*)0)->xheader_len))));

		req->user->type.put_send.msg = msg;
		req->user->type.put_send.win_ptr = win_ptr;

		MPID_PSP_Datatype_encode(&dt_info, &xheader->encoded_type);

		xheader->common.tag = 0;
		xheader->common.context_id = 0;
		xheader->common.type = MPID_PSP_MSGTYPE_RMA_PUT;
		xheader->common._reserved_ = 0;
		xheader->common.src_rank = win_ptr->rank;

		/* xheader->target_disp = target_disp; */
		xheader->target_count = target_count;
		xheader->target_buf = target_buf;
		/* xheader->epoch = ri->epoch_origin; */
		xheader->win_ptr = ri->win_ptr; /* remote win_ptr */

		req->xheader_len = xheader_len;

		req->data = msg.msg;
		req->data_len = msg.msg_sz;

		req->ops.io_done = rma_put_done;
		req->user->type.put_send.target_rank = target_rank;
		req->connection = ri->con;

		win_ptr->rma_local_pending_cnt++;
		win_ptr->rma_local_pending_rank[target_rank]++;
		win_ptr->rma_puts_accs[target_rank]++;

		if(request) {
			MPIR_Request *mpid_req = *request;
			/* TODO: Use a new and 'put_send'-dedicated MPID_DEV_Request_create() */
			/*       instead of allocating and overloading a common send request. */
			pscom_request_free(mpid_req->dev.kind.common.pscom_req);
			mpid_req->dev.kind.common.pscom_req = req;
			MPIR_Request_add_ref(mpid_req);
			req->user->type.put_send.mpid_req = mpid_req;
		} else {
			req->user->type.put_send.mpid_req = NULL;
		}

		pscom_post_send(req);
	}
fn_exit:
	return MPI_SUCCESS;
fn_completed:
	if(request) {
		MPIDI_PSP_Request_set_completed(*request);
	}
	return MPI_SUCCESS;
	/* --- */
err_exit:
	if(request) {
		MPIDI_PSP_Request_set_completed(*request);
		MPIR_Request_free(*request);
	}
	return mpi_error;
	/* --- */
err_create_packed_msg:
	goto err_exit;
err_local_copy:
	goto err_exit;
err_sync_rma:
	goto err_exit;
}
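The non-contiguous path above defers cleanup to rma_put_done(). Given the fields set on the request, a plausible shape for that callback is sketched below; the body is an assumption based on the ToDo comments and counters in the function, not the actual psp implementation.

/* Hypothetical io_done callback for the put-send request built above. */
static void example_rma_put_done(pscom_request_t *req)
{
	pscom_request_put_send_t *ps = &req->user->type.put_send;
	MPIR_Win *win_ptr = ps->win_ptr;

	/* release the pack buffer prepared by MPID_PSP_packed_msg_prepare() */
	MPID_PSP_packed_msg_cleanup(&ps->msg);

	win_ptr->rma_local_pending_cnt--;
	win_ptr->rma_local_pending_rank[ps->target_rank]--;

	if (ps->mpid_req) {
		/* request-based put: complete and release the MPID request */
		MPIDI_PSP_Request_set_completed(ps->mpid_req);
		MPIR_Request_free(ps->mpid_req);
	}
	pscom_request_free(req);
}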
Example #7
0
int MPID_Cancel_send(MPIR_Request * sreq)
{
    MPIDI_VC_t * vc;
    int proto;
    int flag;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_CANCEL_SEND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_CANCEL_SEND);
    
    MPIR_Assert(sreq->kind == MPIR_REQUEST_KIND__SEND);

    MPIDI_Request_cancel_pending(sreq, &flag);
    if (flag)
    {
	goto fn_exit;
    }

    /*
     * FIXME: user requests returned by MPI_Ibsend() have a NULL comm pointer
     * and no pointer to the underlying communication
     * request.  For now, we simply fail to cancel the request.  In the future,
     * we should add a new request kind to indicate that
     * the request is a BSEND.  Then we can properly cancel the request, much 
     * in the way we do persistent requests.
     */
    if (sreq->comm == NULL)
    {
	goto fn_exit;
    }

    MPIDI_Comm_get_vc_set_active(sreq->comm, sreq->dev.match.parts.rank, &vc);

    proto = MPIDI_Request_get_msg_type(sreq);

    if (proto == MPIDI_REQUEST_SELF_MSG)
    {
	MPIR_Request * rreq;
	
	MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,
		     "attempting to cancel message sent to self");
	
	MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_MSGQ_MUTEX);
	rreq = MPIDI_CH3U_Recvq_FDU(sreq->handle, &sreq->dev.match);
	MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_MSGQ_MUTEX);
	if (rreq)
	{
	    MPIR_Assert(rreq->dev.partner_request == sreq);
	    
	    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
             "send-to-self cancellation successful, sreq=0x%08x, rreq=0x%08x",
						sreq->handle, rreq->handle));

            /* Pull the message out of the unexpected queue since it's
             * being cancelled.  The below request release drops one
             * reference.  We explicitly drop a second reference,
             * because the receive request will never be visible to
             * the user. */
            MPIR_Request_free(rreq);
            MPIR_Request_free(rreq);

	    MPIR_STATUS_SET_CANCEL_BIT(sreq->status, TRUE);
            mpi_errno = MPID_Request_complete(sreq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
	}
	else
	{
	    MPIR_STATUS_SET_CANCEL_BIT(sreq->status, FALSE);
	    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
               "send-to-self cancellation failed, sreq=0x%08x",
						sreq->handle));
	}
	
	goto fn_exit;
    }

    /* If the message went over a netmod and it provides a cancel_send
       function, call it here. */
#ifdef ENABLE_COMM_OVERRIDES
    if (vc->comm_ops && vc->comm_ops->cancel_send)
    {
        mpi_errno = vc->comm_ops->cancel_send(vc, sreq);
        goto fn_exit;
    }
#endif

    /* Check to see if the send is still in the send queue.  If so, remove it,
       mark the request as cancelled and complete, and
       release the device's reference to the request object.
    */
    {
	int cancelled;
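	/* Note: nothing below ever sets `cancelled` to TRUE, so the
	   cancel-from-send-queue fast paths are currently placeholders. */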
	
	if (proto == MPIDI_REQUEST_RNDV_MSG)
	{
	    MPIR_Request * rts_sreq;
	    /* The cancellation of the RTS request needs to be atomic through 
	       the destruction of the RTS request to avoid
               conflict with release of the RTS request if the CTS is received
	       (see handling of a rendezvous CTS packet in
               MPIDI_CH3U_Handle_recv_pkt()).  
	       MPIDI_Request_fetch_and_clear_rts_sreq() is used to guarantee 
	       that atomicity. */
	    MPIDI_Request_fetch_and_clear_rts_sreq(sreq, &rts_sreq);
	    if (rts_sreq != NULL) 
	    {
		cancelled = FALSE;
		
		/* since we attempted to cancel an RTS request, we are 
		   responsible for releasing that request */
		MPIR_Request_free(rts_sreq);

		/* --BEGIN ERROR HANDLING-- */
		if (mpi_errno != MPI_SUCCESS)
		{
		    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__, MPI_ERR_OTHER,
						     "**ch3|cancelrndv", 0);
		    goto fn_exit;
		}
		/* --END ERROR HANDLING-- */
		
		if (cancelled)
		{
		    MPIR_STATUS_SET_CANCEL_BIT(sreq->status, TRUE);
		    /* no other thread should be waiting on sreq, so it is 
		       safe to reset ref_count and cc */
                    MPIR_cc_set(&sreq->cc, 0);
                    /* FIXME should be a decr and assert, not a set */
		    MPIR_Object_set_ref(sreq, 1);
		    goto fn_exit;
		}
	    }
	}
	else
	{
	    cancelled = FALSE;
	    if (cancelled)
	    {
		MPIR_STATUS_SET_CANCEL_BIT(sreq->status, TRUE);
		/* no other thread should be waiting on sreq, so it is safe to 
		   reset ref_count and cc */
                MPIR_cc_set(&sreq->cc, 0);
                /* FIXME should be a decr and assert, not a set */
		MPIR_Object_set_ref(sreq, 1);
		goto fn_exit;
	    }
	}
    }

    /* Part or all of the message has already been sent, so we need to send a 
       cancellation request to the receiver in an attempt
       to catch the message before it is matched. */
    {
	int was_incomplete;
	MPIDI_CH3_Pkt_t upkt;
	MPIDI_CH3_Pkt_cancel_send_req_t * const csr_pkt = &upkt.cancel_send_req;
	MPIR_Request * csr_sreq;
	
	MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
              "sending cancel request to %d for 0x%08x", 
	      sreq->dev.match.parts.rank, sreq->handle));
	
	/* The completion counter and reference count are incremented to keep 
	   the request around long enough to receive a
	   response regardless of what the user does (free the request before 
	   waiting, etc.). */
	MPIDI_CH3U_Request_increment_cc(sreq, &was_incomplete);
	if (!was_incomplete)
	{
	    /* The reference count is incremented only if the request was 
	       complete before the increment. */
	    MPIR_Request_add_ref( sreq );
	}

	MPIDI_Pkt_init(csr_pkt, MPIDI_CH3_PKT_CANCEL_SEND_REQ);
	csr_pkt->match.parts.rank = sreq->comm->rank;
	csr_pkt->match.parts.tag = sreq->dev.match.parts.tag;
	csr_pkt->match.parts.context_id = sreq->dev.match.parts.context_id;
	csr_pkt->sender_req_id = sreq->handle;
	
	MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
	mpi_errno = MPIDI_CH3_iStartMsg(vc, csr_pkt, sizeof(*csr_pkt), &csr_sreq);
	MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
	if (mpi_errno != MPI_SUCCESS) {
	    MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|cancelreq");
	}
	if (csr_sreq != NULL)
	{
	    MPIR_Request_free(csr_sreq);
	}
    }
    
    /* FIXME: if send cancellation packets are allowed to arrive out-of-order 
       with respect to send packets, then we need to
       timestamp send and cancel packets to ensure that a cancellation request 
       does not bypass the send packet to be cancelled
       and erroneously cancel a previously sent message with the same request 
       handle. */
    /* FIXME: A timestamp is more than is necessary; a message sequence number
       should be adequate. */
 fn_fail:
 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_CANCEL_SEND);
    return mpi_errno;
}