Пример #1
0
/* MPIDI_CH3_EagerSyncZero - Send a zero-sized message with eager synchronous.

   This is a temporary routine, as we may want to replace this with a
   counterpart to the Eager Short message.

   Parameters:
     sreq_p         - in/out; points at the already-created send request.
                      If the send fails the request is freed and *sreq_p is
                      set to NULL.
     rank           - destination rank, used to look up the VC in comm
     tag            - tag stored in the packet's match fields
     comm           - communicator; supplies the sender rank and context id
     context_offset - added to comm->context_id to form the match context id

   Returns MPI_SUCCESS, or an error code built with class MPI_ERR_OTHER
   ("**ch3|eagermsg") when MPIDI_CH3_iSend reports a failure. */
int MPIDI_CH3_EagerSyncZero(MPIR_Request **sreq_p, int rank, int tag,
                            MPIR_Comm * comm, int context_offset )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_eager_sync_send_t * const es_pkt = &upkt.eager_sync_send;
    MPIDI_VC_t * vc;
    MPIR_Request *sreq = *sreq_p;
    /* seqnum is handed to the sequence-number macros below.  Declare it under
     * the same guard MPID_Issend uses so the name exists whenever those
     * macros actually reference it.
     * NOTE(review): presumably the macros expand to nothing when
     * MPID_USE_SEQUENCE_NUMBERS is undefined - confirm against their
     * definitions. */
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif

    MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,"sending zero length message");

    /* MT FIXME what are the two operations we are waiting for?  the send and
     * the sync response? */
    MPIR_cc_set(&sreq->cc, 2);
    MPIDI_Request_set_msg_type(sreq, MPIDI_REQUEST_EAGER_MSG);
    sreq->dev.OnDataAvail = 0;

    /* Build the eager-sync header; there is no payload, so data_sz is 0 and
     * the header is the only thing handed to iSend below. */
    MPIDI_Pkt_init(es_pkt, MPIDI_CH3_PKT_EAGER_SYNC_SEND);
    es_pkt->match.parts.rank = comm->rank;
    es_pkt->match.parts.tag = tag;
    es_pkt->match.parts.context_id = comm->context_id + context_offset;
    es_pkt->sender_req_id = sreq->handle;
    es_pkt->data_sz = 0;

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);
    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(es_pkt, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);

    MPL_DBG_MSGPKT(vc,tag,es_pkt->match.parts.context_id,rank,(intptr_t)0,"EagerSync0");
    MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
    mpi_errno = MPIDI_CH3_iSend(vc, sreq, es_pkt, sizeof(*es_pkt));
    MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS)
    {
        /* Free the request before clearing the caller's pointer, otherwise
         * the only handle on the request would be lost. */
        MPIR_Request_free(sreq);
        *sreq_p = NULL;
        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|eagermsg");
    }
    /* --END ERROR HANDLING-- */

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Пример #2
0
/* MPID_Issend - device-level send that creates a request of type
 * MPIDI_REQUEST_TYPE_SSEND and hands it back through *request.
 *
 * The visible code takes one of these paths, in order:
 *   - communicator revoked (and the tag is not the agree/shrink tag): error
 *   - destination is this process on a non-intercommunicator:
 *     MPIDI_Isend_self
 *   - the VC supplies an issend override (ENABLE_COMM_OVERRIDES builds): the
 *     override creates its own request
 *   - rank == MPI_PROC_NULL: the request is created with its completion
 *     counter already at 0
 *   - data_sz == 0: MPIDI_CH3_EagerSyncZero
 *   - message plus eager-sync header fits in the eager threshold:
 *     MPIDI_CH3_EagerSyncNoncontigSend
 *   - otherwise: the VC's rndvSend_fn
 *
 * buf, count, datatype describe the send buffer; rank and tag address the
 * message; context_offset is added to comm->context_id.
 */
int MPID_Issend(const void * buf, int count, MPI_Datatype datatype, int rank, int tag, MPIR_Comm * comm, int context_offset,
		MPIR_Request ** request)
{
    intptr_t data_sz;
    int dt_contig;
    MPI_Aint dt_true_lb;
    MPIR_Datatype* dt_ptr;
    MPIR_Request * sreq;
    MPIDI_VC_t * vc=0;
#if defined(MPID_USE_SEQUENCE_NUMBERS)
    MPID_Seqnum_t seqnum;
#endif    
    int eager_threshold = -1;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_ISSEND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_ISSEND);

    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
                 "rank=%d, tag=%d, context=%d", 
                 rank, tag, comm->context_id + context_offset));

    /* Check to make sure the communicator hasn't already been revoked */
    /* The agree and shrink tags are exempt from the check, so those two
     * operations can still send on a revoked communicator. */
    if (comm->revoked &&
            MPIR_AGREE_TAG != MPIR_TAG_MASK_ERROR_BITS(tag & ~MPIR_TAG_COLL_BIT) &&
            MPIR_SHRINK_TAG != MPIR_TAG_MASK_ERROR_BITS(tag & ~MPIR_TAG_COLL_BIT)) {
        MPIR_ERR_SETANDJUMP(mpi_errno,MPIX_ERR_REVOKED,"**revoked");
    }
    
    /* Sending to ourselves: handled entirely by the send-to-self routine,
     * which also produces the request. */
    if (rank == comm->rank && comm->comm_kind != MPIR_COMM_KIND__INTERCOMM)
    {
	mpi_errno = MPIDI_Isend_self(buf, count, datatype, rank, tag, comm, context_offset, MPIDI_REQUEST_TYPE_SSEND, &sreq);
	goto fn_exit;
    }

    if (rank != MPI_PROC_NULL)
    {
       MPIDI_Comm_get_vc_set_active(comm, rank, &vc);
        /* this needs to come before the sreq is created, since the override */
        /* function is responsible for creating its own request */       
#ifdef ENABLE_COMM_OVERRIDES
       if (vc->comm_ops && vc->comm_ops->issend)
       {
	  mpi_errno = vc->comm_ops->issend( vc, buf, count, datatype, rank, tag, comm, context_offset, &sreq);
	  goto fn_exit;
       }
#endif
    }   
   
    MPIDI_Request_create_sreq(sreq, mpi_errno, goto fn_exit);
    MPIDI_Request_set_type(sreq, MPIDI_REQUEST_TYPE_SSEND);
    
    /* Nothing is sent to MPI_PROC_NULL: hand back a request whose completion
     * counter is already zero. */
    if (rank == MPI_PROC_NULL)
    {
	MPIR_Object_set_ref(sreq, 1);
        MPIR_cc_set(&sreq->cc, 0);
	goto fn_exit;
    }
    
    MPIDI_Datatype_get_info(count, datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);
    
    if (data_sz == 0)
    {
	mpi_errno = MPIDI_CH3_EagerSyncZero( &sreq, rank, tag, comm, 
					     context_offset );
	goto fn_exit;
    }

    MPIDI_CH3_GET_EAGER_THRESHOLD(&eager_threshold, comm, vc);

    /* NOTE(review): the left-hand side includes a sizeof term while
     * eager_threshold is a plain int, so this is presumably an
     * unsigned comparison after the usual arithmetic conversions - confirm
     * that a negative threshold cannot reach this point. */
    if (data_sz + sizeof(MPIDI_CH3_Pkt_eager_sync_send_t) <= eager_threshold)
    {
	mpi_errno = MPIDI_CH3_EagerSyncNoncontigSend( &sreq, buf, count,
                                                      datatype, data_sz, 
                                                      dt_contig, dt_true_lb,
                                                      rank, tag, comm, 
                                                      context_offset );
	/* If we're not complete and this is a derived datatype
         * communication, then add a reference to the datatype */
	if (sreq && (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN)) {
	    sreq->dev.datatype_ptr = dt_ptr;
        MPIR_Datatype_ptr_add_ref(dt_ptr);
	}
    }
    else
    {
	/* Note that the sreq was created above */
	MPIDI_Request_set_msg_type(sreq, MPIDI_REQUEST_RNDV_MSG);
	mpi_errno = vc->rndvSend_fn( &sreq, buf, count, datatype, dt_contig,
                                     data_sz, dt_true_lb, rank, tag, comm, 
                                     context_offset );
	
	/* FIXME: fill temporary IOV or pack temporary buffer after send to 
	   hide some latency.  This requires synchronization
           because the CTS packet could arrive and be processed before the 
	   above iStartmsg completes (depending on the progress
           engine, threads, etc.). */
	
	/* The send routine may have cleared sreq; only a surviving request
	 * takes a reference on the datatype. */
	if (sreq && dt_ptr != NULL)
	{
	    sreq->dev.datatype_ptr = dt_ptr;
        MPIR_Datatype_ptr_add_ref(dt_ptr);
	}
    }

  fn_exit:
    /* Every path that jumps here publishes its request (possibly NULL) to
     * the caller. */
    *request = sreq;
    
    MPL_DBG_STMT(MPIDI_CH3_DBG_OTHER,VERBOSE,
    {
	if (sreq != NULL) {
	    MPL_DBG_MSG_P(MPIDI_CH3_DBG_OTHER,VERBOSE,
			   "request allocated, handle=0x%08x", sreq->handle);
	}
    }
		  )
Пример #3
0
int MPIR_Init_thread(int *argc, char ***argv, int required, int *provided)
{
    int mpi_errno = MPI_SUCCESS;
    int has_args;
    int has_env;
    int thread_provided = 0;
    int exit_init_cs_on_failure = 0;
    MPIR_Info *info_ptr;
#if defined(MPICH_IS_THREADED)
    bool cs_initialized = false;
#endif

    /* The threading library must be initialized at the very beginning because
     * it manages all synchronization objects (e.g., mutexes) that will be
     * initialized later */
    {
        int thread_err;
        MPL_thread_init(&thread_err);
        if (thread_err)
            goto fn_fail;
    }

#ifdef HAVE_HWLOC
    MPIR_Process.bindset = hwloc_bitmap_alloc();
    hwloc_topology_init(&MPIR_Process.hwloc_topology);
    MPIR_Process.bindset_is_valid = 0;
    hwloc_topology_set_io_types_filter(MPIR_Process.hwloc_topology, HWLOC_TYPE_FILTER_KEEP_ALL);
    if (!hwloc_topology_load(MPIR_Process.hwloc_topology)) {
        MPIR_Process.bindset_is_valid =
            !hwloc_get_proc_cpubind(MPIR_Process.hwloc_topology, getpid(), MPIR_Process.bindset,
                                    HWLOC_CPUBIND_PROCESS);
    }
#endif

#ifdef HAVE_NETLOC
    MPIR_Process.network_attr.u.tree.node_levels = NULL;
    MPIR_Process.network_attr.network_endpoint = NULL;
    MPIR_Process.netloc_topology = NULL;
    MPIR_Process.network_attr.type = MPIR_NETLOC_NETWORK_TYPE__INVALID;
    if (strlen(MPIR_CVAR_NETLOC_NODE_FILE)) {
        mpi_errno =
            netloc_parse_topology(&MPIR_Process.netloc_topology, MPIR_CVAR_NETLOC_NODE_FILE);
        if (mpi_errno == NETLOC_SUCCESS) {
            MPIR_Netloc_parse_topology(MPIR_Process.netloc_topology, &MPIR_Process.network_attr);
        }
    }
#endif
    /* For any code in the device that wants to check for runtime
     * decisions on the value of isThreaded, set a provisional
     * value here. We could let the MPID_Init routine override this */
#if defined MPICH_IS_THREADED
    MPIR_ThreadInfo.isThreaded = required == MPI_THREAD_MULTIPLE;
#endif /* MPICH_IS_THREADED */

#if defined(MPICH_IS_THREADED)
    mpi_errno = thread_cs_init();
    cs_initialized = true;
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
#endif

    /* FIXME: Move to os-dependent interface? */
#ifdef HAVE_WINDOWS_H
    /* prevent the process from bringing up an error message window if mpich
     * asserts */
    _CrtSetReportMode(_CRT_ASSERT, _CRTDBG_MODE_FILE);
    _CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR);
    _CrtSetReportHook2(_CRT_RPTHOOK_INSTALL, assert_hook);
#ifdef _WIN64
    {
        /* FIXME: (Windows) This severly degrades performance but fixes alignment
         * issues with the datatype code. */
        /* Prevent misaligned faults on Win64 machines */
        UINT mode, old_mode;

        old_mode = SetErrorMode(SEM_NOALIGNMENTFAULTEXCEPT);
        mode = old_mode | SEM_NOALIGNMENTFAULTEXCEPT;
        SetErrorMode(mode);
    }
#endif
#endif

    /* We need this inorder to implement IS_THREAD_MAIN */
#if (MPICH_THREAD_LEVEL >= MPI_THREAD_SERIALIZED) && defined(MPICH_IS_THREADED)
    {
        MPID_Thread_self(&MPIR_ThreadInfo.master_thread);
    }
#endif

#ifdef HAVE_ERROR_CHECKING
    /* Because the PARAM system has not been initialized, temporarily
     * uncondtionally enable error checks.  Once the PARAM system is
     * initialized, this may be reset */
    MPIR_Process.do_error_checks = 1;
#else
    MPIR_Process.do_error_checks = 0;
#endif

    /* Initialize necessary subsystems and setup the predefined attribute
     * values.  Subsystems may change these values. */
    MPIR_Process.attrs.appnum = -1;
    MPIR_Process.attrs.host = MPI_PROC_NULL;
    MPIR_Process.attrs.io = MPI_PROC_NULL;
    MPIR_Process.attrs.lastusedcode = MPI_ERR_LASTCODE;
    MPIR_Process.attrs.universe = MPIR_UNIVERSE_SIZE_NOT_SET;
    MPIR_Process.attrs.wtime_is_global = 0;

    /* Set the functions used to duplicate attributes.  These are
     * when the first corresponding keyval is created */
    MPIR_Process.attr_dup = 0;
    MPIR_Process.attr_free = 0;

#ifdef HAVE_CXX_BINDING
    /* Set the functions used to call functions in the C++ binding
     * for reductions and attribute operations.  These are null
     * until a C++ operation is defined.  This allows the C code
     * that implements these operations to not invoke a C++ code
     * directly, which may force the inclusion of symbols known only
     * to the C++ compiler (e.g., under more non-GNU compilers, including
     * Solaris and IRIX). */
    MPIR_Process.cxx_call_op_fn = 0;

#endif

#ifdef HAVE_F08_BINDING
    MPIR_C_MPI_UNWEIGHTED = MPI_UNWEIGHTED;
    MPIR_C_MPI_WEIGHTS_EMPTY = MPI_WEIGHTS_EMPTY;
#endif

    /* This allows the device to select an alternative function for
     * dimsCreate */
    MPIR_Process.dimsCreate = 0;

    /* "Allocate" from the reserved space for builtin communicators and
     * (partially) initialize predefined communicators.  comm_parent is
     * intially NULL and will be allocated by the device if the process group
     * was started using one of the MPI_Comm_spawn functions. */
    MPIR_Process.comm_world = MPIR_Comm_builtin + 0;
    MPII_Comm_init(MPIR_Process.comm_world);
    MPIR_Process.comm_world->handle = MPI_COMM_WORLD;
    MPIR_Process.comm_world->context_id = 0 << MPIR_CONTEXT_PREFIX_SHIFT;
    MPIR_Process.comm_world->recvcontext_id = 0 << MPIR_CONTEXT_PREFIX_SHIFT;
    MPIR_Process.comm_world->comm_kind = MPIR_COMM_KIND__INTRACOMM;
    /* This initialization of the comm name could be done only when
     * comm_get_name is called */
    MPL_strncpy(MPIR_Process.comm_world->name, "MPI_COMM_WORLD", MPI_MAX_OBJECT_NAME);

    MPIR_Process.comm_self = MPIR_Comm_builtin + 1;
    MPII_Comm_init(MPIR_Process.comm_self);
    MPIR_Process.comm_self->handle = MPI_COMM_SELF;
    MPIR_Process.comm_self->context_id = 1 << MPIR_CONTEXT_PREFIX_SHIFT;
    MPIR_Process.comm_self->recvcontext_id = 1 << MPIR_CONTEXT_PREFIX_SHIFT;
    MPIR_Process.comm_self->comm_kind = MPIR_COMM_KIND__INTRACOMM;
    MPL_strncpy(MPIR_Process.comm_self->name, "MPI_COMM_SELF", MPI_MAX_OBJECT_NAME);

#ifdef MPID_NEEDS_ICOMM_WORLD
    MPIR_Process.icomm_world = MPIR_Comm_builtin + 2;
    MPII_Comm_init(MPIR_Process.icomm_world);
    MPIR_Process.icomm_world->handle = MPIR_ICOMM_WORLD;
    MPIR_Process.icomm_world->context_id = 2 << MPIR_CONTEXT_PREFIX_SHIFT;
    MPIR_Process.icomm_world->recvcontext_id = 2 << MPIR_CONTEXT_PREFIX_SHIFT;
    MPIR_Process.icomm_world->comm_kind = MPIR_COMM_KIND__INTRACOMM;
    MPL_strncpy(MPIR_Process.icomm_world->name, "MPI_ICOMM_WORLD", MPI_MAX_OBJECT_NAME);

    /* Note that these communicators are not ready for use - MPID_Init
     * will setup self and world, and icomm_world if it desires it. */
#endif

    MPIR_Process.comm_parent = NULL;

    /* Setup the initial communicator list in case we have
     * enabled the debugger message-queue interface */
    MPII_COMML_REMEMBER(MPIR_Process.comm_world);
    MPII_COMML_REMEMBER(MPIR_Process.comm_self);

    /* MPIU_Timer_pre_init(); */

    /* Wait for debugger to attach if requested. */
    if (MPIR_CVAR_DEBUG_HOLD) {
        volatile int hold = 1;
        while (hold)
#ifdef HAVE_USLEEP
            usleep(100);
#endif
        ;
    }
#if defined(HAVE_ERROR_CHECKING) && (HAVE_ERROR_CHECKING == MPID_ERROR_LEVEL_RUNTIME)
    MPIR_Process.do_error_checks = MPIR_CVAR_ERROR_CHECKING;
#endif

    /* define MPI as initialized so that we can use MPI functions within
     * MPID_Init if necessary */
    OPA_store_int(&MPIR_Process.mpich_state, MPICH_MPI_STATE__IN_INIT);

    /* We can't acquire any critical sections until this point.  Any
     * earlier the basic data structures haven't been initialized */
    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    exit_init_cs_on_failure = 1;

    /* create MPI_INFO_NULL object */
    /* FIXME: Currently this info object is empty, we need to add data to this
     * as defined by the standard. */
    info_ptr = MPIR_Info_builtin + 1;
    info_ptr->handle = MPI_INFO_ENV;
    MPIR_Object_set_ref(info_ptr, 1);
    info_ptr->next = NULL;
    info_ptr->key = NULL;
    info_ptr->value = NULL;

#ifdef USE_MEMORY_TRACING
    MPL_trinit();
#endif

    /* Set the number of tag bits. The device may override this value. */
    MPIR_Process.tag_bits = MPIR_TAG_BITS_DEFAULT;

    /* Create complete request to return in the event of immediately complete
     * operations. Use a SEND request to cover all possible use-cases. */
    MPIR_Process.lw_req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
    MPIR_ERR_CHKANDSTMT(MPIR_Process.lw_req == NULL, mpi_errno, MPIX_ERR_NOREQ, goto fn_fail,
                        "**nomemreq");
    MPIR_cc_set(&MPIR_Process.lw_req->cc, 0);

    mpi_errno = MPID_Init(argc, argv, required, &thread_provided, &has_args, &has_env);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    /* Initialize collectives infrastructure */
    mpi_errno = MPII_Coll_init();
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    /* Set tag_ub as function of tag_bits set by the device */
    MPIR_Process.attrs.tag_ub = MPIR_TAG_USABLE_BITS;

    /* Assert: tag_ub should be a power of 2 minus 1 */
    MPIR_Assert(((unsigned) MPIR_Process.
                 attrs.tag_ub & ((unsigned) MPIR_Process.attrs.tag_ub + 1)) == 0);

    /* Assert: tag_ub is at least the minimum asked for in the MPI spec */
    MPIR_Assert(MPIR_Process.attrs.tag_ub >= 32767);

    /* Capture the level of thread support provided */
    MPIR_ThreadInfo.thread_provided = thread_provided;
    if (provided)
        *provided = thread_provided;
#if defined MPICH_IS_THREADED
    MPIR_ThreadInfo.isThreaded = (thread_provided == MPI_THREAD_MULTIPLE);
#endif /* MPICH_IS_THREADED */

    /* FIXME: Define these in the interface.  Does Timer init belong here? */
    MPII_Timer_init(MPIR_Process.comm_world->rank, MPIR_Process.comm_world->local_size);
#ifdef USE_MEMORY_TRACING
#ifdef MPICH_IS_THREADED
    MPL_trconfig(MPIR_Process.comm_world->rank, MPIR_ThreadInfo.isThreaded);
#else
    MPL_trconfig(MPIR_Process.comm_world->rank, 0);
#endif
    /* Indicate that we are near the end of the init step; memory
     * allocated already will have an id of zero; this helps
     * separate memory leaks in the initialization code from
     * leaks in the "active" code */
#endif
#ifdef MPL_USE_DBG_LOGGING
    /* FIXME: This is a hack to handle the common case of two worlds.
     * If the parent comm is not NULL, we always give the world number
     * as "1" (false). */
#ifdef MPICH_IS_THREADED
    MPL_dbg_init(argc, argv, has_args, has_env,
                 MPIR_Process.comm_parent != NULL, MPIR_Process.comm_world->rank,
                 MPIR_ThreadInfo.isThreaded);
#else
    MPL_dbg_init(argc, argv, has_args, has_env,
                 MPIR_Process.comm_parent != NULL, MPIR_Process.comm_world->rank, 0);
#endif

    MPIR_DBG_INIT = MPL_dbg_class_alloc("INIT", "init");
    MPIR_DBG_PT2PT = MPL_dbg_class_alloc("PT2PT", "pt2pt");
    MPIR_DBG_THREAD = MPL_dbg_class_alloc("THREAD", "thread");
    MPIR_DBG_DATATYPE = MPL_dbg_class_alloc("DATATYPE", "datatype");
    MPIR_DBG_HANDLE = MPL_dbg_class_alloc("HANDLE", "handle");
    MPIR_DBG_COMM = MPL_dbg_class_alloc("COMM", "comm");
    MPIR_DBG_BSEND = MPL_dbg_class_alloc("BSEND", "bsend");
    MPIR_DBG_ERRHAND = MPL_dbg_class_alloc("ERRHAND", "errhand");
    MPIR_DBG_OTHER = MPL_dbg_class_alloc("OTHER", "other");
    MPIR_DBG_REQUEST = MPL_dbg_class_alloc("REQUEST", "request");
    MPIR_DBG_COLL = MPL_dbg_class_alloc("COLL", "coll");

    MPIR_DBG_ASSERT = MPL_dbg_class_alloc("ASSERT", "assert");
    MPIR_DBG_STRING = MPL_dbg_class_alloc("STRING", "string");
#endif

    /* Initialize the C versions of the Fortran link-time constants.
     *
     * We now initialize the Fortran symbols from within the Fortran
     * interface in the routine that first needs the symbols.
     * This fixes a problem with symbols added by a Fortran compiler that
     * are not part of the C runtime environment (the Portland group
     * compilers would do this)
     */
#if defined(HAVE_FORTRAN_BINDING) && defined(HAVE_MPI_F_INIT_WORKS_WITH_C)
    mpirinitf_();
#endif

    /* FIXME: Does this need to come before the call to MPID_InitComplete?
     * For some debugger support, MPII_Wait_for_debugger may want to use
     * MPI communication routines to collect information for the debugger */
#ifdef HAVE_DEBUGGER_SUPPORT
    MPII_Wait_for_debugger();
#endif

    /* Let the device know that the rest of the init process is completed */
    if (mpi_errno == MPI_SUCCESS)
        mpi_errno = MPID_InitCompleted();

    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    /* Make fields of MPIR_Process global visible and set mpich_state
     * atomically so that MPI_Initialized() etc. are thread safe */
    OPA_write_barrier();
    OPA_store_int(&MPIR_Process.mpich_state, MPICH_MPI_STATE__POST_INIT);
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    /* signal to error handling routines that core services are unavailable */
    OPA_store_int(&MPIR_Process.mpich_state, MPICH_MPI_STATE__PRE_INIT);

    if (exit_init_cs_on_failure) {
        MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    }
#if defined(MPICH_IS_THREADED)
    if (cs_initialized) {
        MPIR_Thread_CS_Finalize();
    }
#endif
    return mpi_errno;
    /* --END ERROR HANDLING-- */
}
Пример #4
0
/* MPIDI_CH3_iStartMsgv - start sending the data described by an IOV on a VC.
 *
 * Parameters:
 *   vc       - virtual connection to send on
 *   iov      - IOV array; iov[0] is treated as the packet header and its
 *              length is overwritten with sizeof(MPIDI_CH3_Pkt_t)
 *   n_iov    - number of entries in iov (asserted <= MPL_IOV_LIMIT)
 *   sreq_ptr - out; receives NULL when everything was written immediately,
 *              otherwise the request created for the queued or failed send
 *
 * Behavior by connection state:
 *   CONNECTED    - write immediately if the send queue is empty and queue
 *                  whatever was not written; otherwise queue everything
 *   CONNECTING / UNCONNECTED / any other non-failed state
 *                - queue the data (UNCONNECTED also posts a connect)
 *   FAILED       - return a completed request carrying the error
 *
 * Returns MPI_SUCCESS or an error code.  *sreq_ptr is written on every path,
 * including the error paths.
 */
int MPIDI_CH3_iStartMsgv(MPIDI_VC_t * vc, MPL_IOV * iov, int n_iov, MPIR_Request ** sreq_ptr)
{
    MPIR_Request *sreq = NULL;
    MPIDI_CH3I_VC *vcch = &vc->ch;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_ISTARTMSGV);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3_ISTARTMSGV);

    MPIR_Assert(n_iov <= MPL_IOV_LIMIT);

    /* The SOCK channel uses a fixed length header, the size of which is the
     * maximum of all possible packet headers */
    iov[0].MPL_IOV_LEN = sizeof(MPIDI_CH3_Pkt_t);
    MPL_DBG_STMT(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                 MPIDI_DBG_Print_packet((MPIDI_CH3_Pkt_t *) iov[0].MPL_IOV_BUF));

    if (vcch->state == MPIDI_CH3I_VC_STATE_CONNECTED) { /* MT */
        /* Connection already formed.  If send queue is empty attempt to send
         * data, queuing any unsent data. */
        if (MPIDI_CH3I_SendQ_empty(vcch)) {     /* MT */
            int rc;
            size_t nb;

            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, "send queue empty, attempting to write");
            MPL_DBG_PKT(vcch->conn, (MPIDI_CH3_Pkt_t *) iov[0].MPL_IOV_BUF, "isend");

            /* MT - need some signalling to lock down our right to use the
             * channel, thus insuring that the progress engine does
             * not also try to write */
            rc = MPIDI_CH3I_Sock_writev(vcch->sock, iov, n_iov, &nb);
            if (rc == MPI_SUCCESS) {
                int offset = 0;

                MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                              "wrote %ld bytes", (unsigned long) nb);

                /* Walk the IOV, consuming the byte count that was written,
                 * to find the first entry that was not sent completely. */
                while (offset < n_iov) {
                    /* Compare the byte counts directly.  Narrowing the entry
                     * length to int first would misclassify entries of 2 GiB
                     * or more. */
                    if (nb >= iov[offset].MPL_IOV_LEN) {
                        nb -= iov[offset].MPL_IOV_LEN;
                        offset++;
                    } else {
                        MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                                    "partial write, request enqueued at head");
                        sreq = create_request(iov, n_iov, offset, nb);
                        if (sreq == NULL) {
                            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
                        }
                        MPIDI_CH3I_SendQ_enqueue_head(vcch, sreq);
                        MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CHANNEL, VERBOSE,
                                        (MPL_DBG_FDEST, "posting writev, vc=0x%p, sreq=0x%08x", vc,
                                         sreq->handle));
                        vcch->conn->send_active = sreq;
                        mpi_errno =
                            MPIDI_CH3I_Sock_post_writev(vcch->conn->sock, sreq->dev.iov + offset,
                                                        sreq->dev.iov_count - offset, NULL);
                        /* --BEGIN ERROR HANDLING-- */
                        if (mpi_errno != MPI_SUCCESS) {
                            mpi_errno =
                                MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, __func__, __LINE__,
                                                     MPI_ERR_OTHER, "**ch3|sock|postwrite",
                                                     "ch3|sock|postwrite %p %p %p", sreq,
                                                     vcch->conn, vc);
                        }
                        /* --END ERROR HANDLING-- */
                        break;
                    }
                }

                if (offset == n_iov) {
                    MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, "entire write complete");
                }
            }
            /* --BEGIN ERROR HANDLING-- */
            else {
                /* The write failed: hand back an already-completed request
                 * that carries the error in its status. */
                MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL, TYPICAL,
                              "ERROR - MPIDI_CH3I_Sock_writev failed, rc=%d", rc);
                sreq = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
                if (sreq == NULL) {
                    MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
                }
                MPIR_cc_set(&(sreq->cc), 0);
                sreq->status.MPI_ERROR = MPIR_Err_create_code(rc,
                                                              MPIR_ERR_RECOVERABLE, __func__,
                                                              __LINE__, MPI_ERR_INTERN,
                                                              "**ch3|sock|writefailed",
                                                              "**ch3|sock|writefailed %d", rc);
                /* Make sure that the caller sees this error */
                mpi_errno = sreq->status.MPI_ERROR;
            }
            /* --END ERROR HANDLING-- */
        } else {
            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL, VERBOSE, "send in progress, request enqueued");
            sreq = create_request(iov, n_iov, 0, 0);
            if (sreq == NULL) {
                MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
            }
            MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
        }
    } else if (vcch->state == MPIDI_CH3I_VC_STATE_CONNECTING) {
        MPL_DBG_VCUSE(vc, "connecting.  enqueuing request");

        /* queue the data so it can be sent after the connection is formed */
        sreq = create_request(iov, n_iov, 0, 0);
        if (sreq == NULL) {
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
    } else if (vcch->state == MPIDI_CH3I_VC_STATE_UNCONNECTED) {
        MPL_DBG_VCUSE(vc, "unconnected.  posting connect and enqueuing request");

        /* queue the data so it can be sent after the connection is formed */
        sreq = create_request(iov, n_iov, 0, 0);
        if (sreq == NULL) {
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);

        /* Form a new connection */
        MPIDI_CH3I_VC_post_connect(vc);
    } else if (vcch->state != MPIDI_CH3I_VC_STATE_FAILED) {
        /* Unable to send data at the moment, so queue it for later */
        MPL_DBG_VCUSE(vc, "forming connection, request enqueued");
        sreq = create_request(iov, n_iov, 0, 0);
        if (sreq == NULL) {
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
    }
    /* --BEGIN ERROR HANDLING-- */
    else {
        /* Connection failed, so allocate a request and return an error. */
        MPL_DBG_VCUSE(vc, "ERROR - connection failed");
        sreq = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
        if (sreq == NULL) {
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIR_cc_set(&(sreq->cc), 0);
        sreq->status.MPI_ERROR = MPIR_Err_create_code(MPI_SUCCESS,
                                                      MPIR_ERR_RECOVERABLE, __func__, __LINE__,
                                                      MPI_ERR_INTERN, "**ch3|sock|connectionfailed",
                                                      0);
        /* Make sure that the caller sees this error */
        mpi_errno = sreq->status.MPI_ERROR;
    }
    /* --END ERROR HANDLING-- */

    /* fn_fail doubles as the normal exit: success falls through to it, so
     * *sreq_ptr is always written. */
  fn_fail:
    *sreq_ptr = sreq;
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3_ISTARTMSGV);
    return mpi_errno;
}
Пример #5
0
/* MPID_Cancel_send - attempt to cancel a send request.
 *
 * sreq must be a request of kind MPIR_REQUEST_KIND__SEND (asserted).
 *
 * The visible logic is:
 *   - return at once if a cancel is already pending on the request, or if
 *     the request has no communicator (see the FIXME below);
 *   - for a send-to-self message, look for the matching entry in the
 *     unexpected receive queue; if found, drop it and complete sreq with the
 *     cancel bit set, otherwise clear the cancel bit;
 *   - if the VC provides a cancel_send override (ENABLE_COMM_OVERRIDES
 *     builds), delegate to it;
 *   - otherwise send a MPIDI_CH3_PKT_CANCEL_SEND_REQ packet to the receiver.
 *
 * Returns MPI_SUCCESS or an error code.
 */
int MPID_Cancel_send(MPIR_Request * sreq)
{
    MPIDI_VC_t * vc;
    int proto;
    int flag;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_CANCEL_SEND);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_CANCEL_SEND);
    
    MPIR_Assert(sreq->kind == MPIR_REQUEST_KIND__SEND);

    MPIDI_Request_cancel_pending(sreq, &flag);
    if (flag)
    {
	goto fn_exit;
    }

    /*
     * FIXME: user requests returned by MPI_Ibsend() have a NULL comm pointer
     * and no pointer to the underlying communication
     * request.  For now, we simply fail to cancel the request.  In the future,
     * we should add a new request kind to indicate that
     * the request is a BSEND.  Then we can properly cancel the request, much 
     * in the way we do persistent requests.
     */
    if (sreq->comm == NULL)
    {
	goto fn_exit;
    }

    MPIDI_Comm_get_vc_set_active(sreq->comm, sreq->dev.match.parts.rank, &vc);

    proto = MPIDI_Request_get_msg_type(sreq);

    if (proto == MPIDI_REQUEST_SELF_MSG)
    {
	MPIR_Request * rreq;
	
	MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER,VERBOSE,
		     "attempting to cancel message sent to self");
	
	MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_MSGQ_MUTEX);
	rreq = MPIDI_CH3U_Recvq_FDU(sreq->handle, &sreq->dev.match);
	MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_MSGQ_MUTEX);
	if (rreq)
	{
	    MPIR_Assert(rreq->dev.partner_request == sreq);
	    
	    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
             "send-to-self cancellation successful, sreq=0x%08x, rreq=0x%08x",
						sreq->handle, rreq->handle));

            /* Pull the message out of the unexpected queue since it's
             * being cancelled.  The below request release drops one
             * reference.  We explicitly drop a second reference,
             * because the receive request will never be visible to
             * the user. */
            MPIR_Request_free(rreq);
            MPIR_Request_free(rreq);

	    MPIR_STATUS_SET_CANCEL_BIT(sreq->status, TRUE);
            mpi_errno = MPID_Request_complete(sreq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
	}
	else
	{
	    MPIR_STATUS_SET_CANCEL_BIT(sreq->status, FALSE);
	    /* rreq is NULL on this branch, so it must not be dereferenced;
	     * log 0 in place of its handle. */
	    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
               "send-to-self cancellation failed, sreq=0x%08x, rreq=0x%08x",
						sreq->handle, 0));
	}
	
	goto fn_exit;
    }

    /* If the message went over a netmod and it provides a cancel_send
       function, call it here. */
#ifdef ENABLE_COMM_OVERRIDES
    if (vc->comm_ops && vc->comm_ops->cancel_send)
    {
        mpi_errno = vc->comm_ops->cancel_send(vc, sreq);
        goto fn_exit;
    }
#endif

    /* Check to see if the send is still in the send queue.  If so, remove it, 
       mark the request and cancelled and complete, and
       release the device's reference to the request object.  
    */
    /* NOTE(review): 'cancelled' is unconditionally FALSE in both branches
       below, so the "if (cancelled)" bodies never run as written; they look
       like placeholders for a send-queue removal that is not implemented
       here. */
    {
	int cancelled;
	
	if (proto == MPIDI_REQUEST_RNDV_MSG)
	{
	    MPIR_Request * rts_sreq;
	    /* The cancellation of the RTS request needs to be atomic through 
	       the destruction of the RTS request to avoid
               conflict with release of the RTS request if the CTS is received
	       (see handling of a rendezvous CTS packet in
               MPIDI_CH3U_Handle_recv_pkt()).  
	       MPID_Request_fetch_and_clear_rts_sreq() is used to guarantee 
	       that atomicity. */
	    MPIDI_Request_fetch_and_clear_rts_sreq(sreq, &rts_sreq);
	    if (rts_sreq != NULL) 
	    {
		cancelled = FALSE;
		
		/* since we attempted to cancel a RTS request, then we are 
		   responsible for releasing that request */
		MPIR_Request_free(rts_sreq);

		/* --BEGIN ERROR HANDLING-- */
		if (mpi_errno != MPI_SUCCESS)
		{
		    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__, MPI_ERR_OTHER,
						     "**ch3|cancelrndv", 0);
		    goto fn_exit;
		}
		/* --END ERROR HANDLING-- */
		
		if (cancelled)
		{
		    MPIR_STATUS_SET_CANCEL_BIT(sreq->status, TRUE);
		    /* no other thread should be waiting on sreq, so it is 
		       safe to reset ref_count and cc */
                    MPIR_cc_set(&sreq->cc, 0);
                    /* FIXME should be a decr and assert, not a set */
		    MPIR_Object_set_ref(sreq, 1);
		    goto fn_exit;
		}
	    }
	}
	else
	{
	    cancelled = FALSE;
	    if (cancelled)
	    {
		MPIR_STATUS_SET_CANCEL_BIT(sreq->status, TRUE);
		/* no other thread should be waiting on sreq, so it is safe to 
		   reset ref_count and cc */
                MPIR_cc_set(&sreq->cc, 0);
                /* FIXME should be a decr and assert, not a set */
		MPIR_Object_set_ref(sreq, 1);
		goto fn_exit;
	    }
	}
    }

    /* Part or all of the message has already been sent, so we need to send a 
       cancellation request to the receiver in an attempt
       to catch the message before it is matched. */
    {
	int was_incomplete;
	MPIDI_CH3_Pkt_t upkt;
	MPIDI_CH3_Pkt_cancel_send_req_t * const csr_pkt = &upkt.cancel_send_req;
	MPIR_Request * csr_sreq;
	
	MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
              "sending cancel request to %d for 0x%08x", 
	      sreq->dev.match.parts.rank, sreq->handle));
	
	/* The completion counter and reference count are incremented to keep 
	   the request around long enough to receive a
	   response regardless of what the user does (free the request before 
	   waiting, etc.). */
	MPIDI_CH3U_Request_increment_cc(sreq, &was_incomplete);
	if (!was_incomplete)
	{
	    /* The reference count is incremented only if the request was 
	       complete before the increment. */
	    MPIR_Request_add_ref( sreq );
	}

	MPIDI_Pkt_init(csr_pkt, MPIDI_CH3_PKT_CANCEL_SEND_REQ);
	csr_pkt->match.parts.rank = sreq->comm->rank;
	csr_pkt->match.parts.tag = sreq->dev.match.parts.tag;
	csr_pkt->match.parts.context_id = sreq->dev.match.parts.context_id;
	csr_pkt->sender_req_id = sreq->handle;
	
	MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
	mpi_errno = MPIDI_CH3_iStartMsg(vc, csr_pkt, sizeof(*csr_pkt), &csr_sreq);
	MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
	if (mpi_errno != MPI_SUCCESS) {
	    MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**ch3|cancelreq");
	}
	/* The request for the cancel packet itself is of no further interest
	   here; release it if one was returned. */
	if (csr_sreq != NULL)
	{
	    MPIR_Request_free(csr_sreq);
	}
    }
    
    /* FIXME: if send cancellation packets are allowed to arrive out-of-order 
       with respect to send packets, then we need to
       timestamp send and cancel packets to insure that a cancellation request 
       does not bypass the send packet to be cancelled
       and erroneously cancel a previously sent message with the same request 
       handle. */
    /* FIXME: A timestamp is more than is necessary; a message sequence number
       should be adequate. */
 fn_fail:
 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_CANCEL_SEND);
    return mpi_errno;
}
Пример #6
0
/* MPID_Request_set_completed - force a request into the completed state.
 *
 * Sets the request's completion counter to zero and then signals the
 * progress engine through MPIDI_Progress_signal().
 */
void MPID_Request_set_completed(MPID_Request *req)
{
    /* The counter is zeroed before the signal is raised, so the request
     * already reads as complete by the time the signal goes out. */
    MPIR_cc_set(&(req->cc), 0);

    MPIDI_Progress_signal();
}
Пример #7
0
/* MPIDI_CH3_EagerSyncNoncontigSend - Eagerly send data in synchronous mode.
   Despite the name, both layouts are handled: when dt_contig is nonzero the
   packet header and the user data are sent as a two-element IOV, otherwise
   a segment is set up on the request and the VC's sendNoncontig_fn is used.

   Some implementations may choose to use Rendezvous sends (see ch3u_rndv.c)
   for all Synchronous sends (MPI_Issend and MPI_Ssend).  An eager
   synchronous send eliminates one of the handshake messages, but
   most application codes should not be using synchronous sends in
   performance-critical operations.

   Parameters:
     sreq_p         - in/out; *sreq_p is the send request to use.  It is set
                      to NULL whenever an error is returned.
     buf            - start of the user buffer
     count          - passed to MPIDU_Segment_init (noncontiguous path only)
     datatype       - passed to MPIDU_Segment_init (noncontiguous path only)
     data_sz        - stored in the packet's data_sz field; also the length
                      of the data IOV entry / the segment size
     dt_contig      - nonzero selects the contiguous (IOV) path
     dt_true_lb     - byte offset added to buf in the contiguous path
     rank           - rank in comm used to look up the virtual connection
     tag            - tag placed in the packet's match fields
     comm           - communicator; supplies the sender rank and context id
     context_offset - added to comm->context_id for the match context id

   Returns MPI_SUCCESS, or an MPI error code on failure.
*/
int MPIDI_CH3_EagerSyncNoncontigSend( MPIR_Request **sreq_p,
				      const void * buf, int count, 
				      MPI_Datatype datatype, intptr_t data_sz,
				      int dt_contig, MPI_Aint dt_true_lb,
				      int rank, 
				      int tag, MPIR_Comm * comm,
				      int context_offset )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3_Pkt_t upkt;
    MPIDI_CH3_Pkt_eager_sync_send_t * const es_pkt = &upkt.eager_sync_send;
    MPIDI_VC_t * vc;
    MPIR_Request *sreq = *sreq_p;

    /* MT FIXME what are the two operations we are waiting for?  the send and
     * the sync response? */
    MPIR_cc_set(&sreq->cc, 2);
    sreq->dev.OnDataAvail = 0;
    sreq->dev.OnFinal = 0;

    /* Fill in the eager-sync packet header: match information, the handle of
       the sending request, and the payload size. */
    MPIDI_Pkt_init(es_pkt, MPIDI_CH3_PKT_EAGER_SYNC_SEND);
    es_pkt->match.parts.rank = comm->rank;
    es_pkt->match.parts.tag = tag;
    es_pkt->match.parts.context_id = comm->context_id + context_offset;
    es_pkt->sender_req_id = sreq->handle;
    es_pkt->data_sz = data_sz;

    MPIDI_Comm_get_vc_set_active(comm, rank, &vc);

    /* The same sequence number is recorded in the packet and in the
       request. */
    MPIDI_VC_FAI_send_seqnum(vc, seqnum);
    MPIDI_Pkt_set_seqnum(es_pkt, seqnum);
    MPIDI_Request_set_seqnum(sreq, seqnum);

    MPL_DBG_MSGPKT(vc,tag,es_pkt->match.parts.context_id,rank,data_sz,"EagerSync");

    if (dt_contig)
    {
        MPL_IOV iov[2];
        MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,VERBOSE,(MPL_DBG_FDEST,
                                            "sending contiguous sync eager message, data_sz=%" PRIdPTR,
                                            data_sz));

        /* iov[0] is the packet header, iov[1] the user data itself. */
        iov[0].MPL_IOV_BUF = (MPL_IOV_BUF_CAST)es_pkt;
        iov[0].MPL_IOV_LEN = sizeof(*es_pkt);
        iov[1].MPL_IOV_BUF = (MPL_IOV_BUF_CAST) ((char *)buf + dt_true_lb);
        iov[1].MPL_IOV_LEN = data_sz;

        MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
        mpi_errno = MPIDI_CH3_iSendv(vc, sreq, iov, 2);
        MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno != MPI_SUCCESS)
        {
            /* Make sure to destroy the request before setting the pointer to
             * NULL, otherwise we lose the handle on the request */
            MPIR_Request_free(sreq);
            *sreq_p = NULL;
            MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ch3|eagermsg");
        }
        /* --END ERROR HANDLING-- */
    }
    else
    {
        int segment_alloc_failed;

        MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER,VERBOSE,
                       "sending non-contiguous sync eager message, data_sz=%" PRIdPTR,
                       data_sz);

        sreq->dev.segment_ptr = MPIDU_Segment_alloc( );
        /* Latch the result so the request is not inspected again after it
           has been released below. */
        segment_alloc_failed = (sreq->dev.segment_ptr == NULL);
        /* --BEGIN ERROR HANDLING-- */
        if (segment_alloc_failed)
        {
            /* fn_fail clears *sreq_p, which is the caller's only handle on
             * the request.  Release it first, exactly as the contiguous
             * branch does, so that it is not leaked. */
            MPIR_Request_free(sreq);
        }
        MPIR_ERR_CHKANDJUMP1(segment_alloc_failed, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "MPIDU_Segment_alloc");
        /* --END ERROR HANDLING-- */

        MPIDU_Segment_init(buf, count, datatype, sreq->dev.segment_ptr, 0);
        sreq->dev.segment_first = 0;
        sreq->dev.segment_size = data_sz;

        MPID_THREAD_CS_ENTER(POBJ, vc->pobj_mutex);
        mpi_errno = vc->sendNoncontig_fn(vc, sreq, es_pkt, sizeof(MPIDI_CH3_Pkt_eager_sync_send_t));
        MPID_THREAD_CS_EXIT(POBJ, vc->pobj_mutex);
        /* NOTE(review): the request is not released here when
         * sendNoncontig_fn fails; presumably the callee disposes of it on
         * error -- confirm against the sendNoncontig_fn implementations. */
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }

 fn_exit:
    return mpi_errno;
 fn_fail:
    /* Every failure path reports "no request" to the caller. */
    *sreq_p = NULL;
    goto fn_exit;
}
Пример #8
0
/* MPIDI_CH3_iStartMsg - start sending a packet header on a virtual
 * connection, writing it immediately when possible and otherwise queuing a
 * request for it.
 *
 *   vc       - virtual connection to send on; vc->ch holds the channel state
 *   hdr      - packet header to send
 *   hdr_sz   - caller's header size; asserted to be no larger than
 *              sizeof(MPIDI_CH3_Pkt_t) and then replaced by that fixed size
 *   sreq_ptr - out: NULL when the whole header was written immediately (or
 *              when a request could not be allocated); otherwise the request
 *              that was queued, or a completed request carrying the error.
 *
 * What happens depends on the channel state:
 *   CONNECTED, empty send queue - try to write now; a partial write is
 *       queued at the head of the send queue and a write is posted for it;
 *       a failed write yields a completed request holding the error.
 *   CONNECTED, non-empty queue  - the header is queued behind pending sends.
 *   CONNECTING                  - the header is queued.
 *   UNCONNECTED                 - the header is queued and a connect is
 *                                 posted.
 *   any other non-FAILED state  - the header is queued.
 *   FAILED                      - a completed request holding a
 *                                 "connection failed" error is returned.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_CH3_iStartMsg(MPIDI_VC_t * vc, void * hdr, intptr_t hdr_sz,
                        MPIR_Request ** sreq_ptr)
{
    MPIR_Request * sreq = NULL;
    MPIDI_CH3I_VC *vcch = &vc->ch;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_ISTARTMSG);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3_ISTARTMSG);

    MPIR_Assert( hdr_sz <= sizeof(MPIDI_CH3_Pkt_t));

    /* The SOCK channel uses a fixed length header, the size of which is the
       maximum of all possible packet headers */
    /* From here on hdr_sz is the fixed packet size, so that many bytes are
       read starting at hdr regardless of the size the caller passed in. */
    hdr_sz = sizeof(MPIDI_CH3_Pkt_t);
    MPL_DBG_STMT(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                 MPIDI_DBG_Print_packet((MPIDI_CH3_Pkt_t*)hdr));

    if (vcch->state == MPIDI_CH3I_VC_STATE_CONNECTED) /* MT */
    {
        /* Connection already formed.  If send queue is empty attempt to send
           data, queuing any unsent data. */
        if (MPIDI_CH3I_SendQ_empty(vcch)) /* MT */
        {
            size_t nb;
            int rc;

            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                        "send queue empty, attempting to write");

            MPL_DBG_PKT(vcch->conn,hdr,"istartmsg");
            /* MT: need some signalling to lock down our right to use the
               channel, thus ensuring that the progress engine does
                   not also try to write */
            rc = MPIDI_CH3I_Sock_write(vcch->sock, hdr, hdr_sz, &nb);
            if (rc == MPI_SUCCESS)
            {
                /* NOTE(review): nb is a size_t but the debug formats below
                   use %ld and PRIdPTR; confirm how the MPL_DBG_MSG_D macro
                   consumes its argument. */
                MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                              "wrote %ld bytes", (unsigned long) nb);

                if (nb == hdr_sz)
                {
                    MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                                  "entire write complete, %" PRIdPTR " bytes", nb);
                    /* done.  get us out of here as quickly as possible. */
                    /* sreq stays NULL, so the caller gets no request. */
                }
                else
                {
                    MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                                  "partial write of %" PRIdPTR " bytes, request enqueued at head", nb);
                    /* create_request is defined elsewhere in this file; it
                       is given the header, its size and the number of bytes
                       already written (presumably so that only the unsent
                       remainder ends up in sreq->dev.iov[0] -- confirm). */
                    sreq = create_request(hdr, hdr_sz, nb);
                    if (!sreq) {
                        MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomem");
                    }

                    /* The request goes to the head of the queue and is
                       recorded as the connection's active send before the
                       write for iov[0] is posted. */
                    MPIDI_CH3I_SendQ_enqueue_head(vcch, sreq);
                    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                                    (MPL_DBG_FDEST,"posting write, vc=0x%p, sreq=0x%08x", vc, sreq->handle));
                    vcch->conn->send_active = sreq;
                    mpi_errno = MPIDI_CH3I_Sock_post_write(vcch->conn->sock, sreq->dev.iov[0].MPL_IOV_BUF,
                                                           sreq->dev.iov[0].MPL_IOV_LEN, sreq->dev.iov[0].MPL_IOV_LEN, NULL);
                    /* --BEGIN ERROR HANDLING-- */
                    if (mpi_errno != MPI_SUCCESS)
                    {
                        /* The request is still on the send queue and still
                           returned through *sreq_ptr on this path. */
                        mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER,
                                                         "**ch3|sock|postwrite", "ch3|sock|postwrite %p %p %p",
                                                         sreq, vcch->conn, vc);
                        goto fn_fail;
                    }
                    /* --END ERROR HANDLING-- */
                }
            }
            /* --BEGIN ERROR HANDLING-- */
            else
            {
                /* The immediate write failed: hand back a send request whose
                   completion counter is already zero and whose status holds
                   the error, and return the same error code. */
                MPL_DBG_MSG_D(MPIDI_CH3_DBG_CHANNEL,TYPICAL,
                              "ERROR - MPIDI_CH3I_Sock_write failed, rc=%d", rc);
                sreq = MPIR_Request_create(MPIR_REQUEST_KIND__UNDEFINED);
                if (!sreq) {
                    MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomem");
                }
                sreq->kind = MPIR_REQUEST_KIND__SEND;
                MPIR_cc_set(&(sreq->cc), 0);
                sreq->status.MPI_ERROR = MPIR_Err_create_code( rc,
                                         MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
                                         MPI_ERR_INTERN, "**ch3|sock|writefailed",
                                         "**ch3|sock|writefailed %d", rc );
                /* Make sure that the caller sees this error */
                mpi_errno = sreq->status.MPI_ERROR;
            }
            /* --END ERROR HANDLING-- */
        }
        else
        {
            /* Other sends are already queued, so this header is appended to
               the send queue with nothing written yet (offset 0). */
            MPL_DBG_MSG(MPIDI_CH3_DBG_CHANNEL,VERBOSE,
                        "send in progress, request enqueued");
            sreq = create_request(hdr, hdr_sz, 0);
            if (!sreq) {
                MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomem");
            }
            MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
        }
    }
    else if (vcch->state == MPIDI_CH3I_VC_STATE_CONNECTING) /* MT */
    {
        MPL_DBG_VCUSE(vc,
                      "connecteding. enqueuing request");

        /* queue the data so it can be sent after the connection is formed */
        sreq = create_request(hdr, hdr_sz, 0);
        if (!sreq) {
            MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomem");
        }
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
    }
    else if (vcch->state == MPIDI_CH3I_VC_STATE_UNCONNECTED) /* MT */
    {
        MPL_DBG_VCUSE(vc,
                      "unconnected.  posting connect and enqueuing request");

        /* queue the data so it can be sent after the connection is formed */
        sreq = create_request(hdr, hdr_sz, 0);
        if (!sreq) {
            MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomem");
        }
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);

        /* Form a new connection */
        /* The request is queued before the connect is posted; the return
           value of MPIDI_CH3I_VC_post_connect is not examined here. */
        MPIDI_CH3I_VC_post_connect(vc);
    }
    else if (vcch->state != MPIDI_CH3I_VC_STATE_FAILED)
    {
        /* Catch-all for every remaining state other than FAILED. */
        /* Unable to send data at the moment, so queue it for later */
        MPL_DBG_VCUSE(vc,"forming connection, request enqueued");
        sreq = create_request(hdr, hdr_sz, 0);
        if (!sreq) {
            MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomem");
        }
        MPIDI_CH3I_SendQ_enqueue(vcch, sreq);
    }
    /* --BEGIN ERROR HANDLING-- */
    else
    {
        /* Connection failed, so allocate a request and return an error. */
        MPL_DBG_VCUSE(vc,"ERROR - connection failed");
        sreq = MPIR_Request_create(MPIR_REQUEST_KIND__UNDEFINED);
        if (!sreq) {
            MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomem");
        }
        sreq->kind = MPIR_REQUEST_KIND__SEND;
        /* Completion counter of zero: the request is returned already
           complete, with the error stored in its status. */
        MPIR_cc_set(&sreq->cc, 0);

        sreq->status.MPI_ERROR = MPIR_Err_create_code( MPI_SUCCESS,
                                 MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
                                 MPI_ERR_INTERN, "**ch3|sock|connectionfailed",0 );
        /* Make sure that the caller sees this error */
        mpi_errno = sreq->status.MPI_ERROR;
    }
    /* --END ERROR HANDLING-- */

    /* Despite its name this label is also the normal exit: success paths
       fall through to it, and *sreq_ptr is always assigned (possibly NULL). */
fn_fail:
    *sreq_ptr = sreq;
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3_ISTARTMSG);
    return mpi_errno;
}