Ejemplo n.º 1
0
int MPID_nem_mxm_vc_terminate(MPIDI_VC_t * vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_nem_mxm_vc_area *vc_area = VC_BASE(vc);

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MXM_VC_TERMINATE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MXM_VC_TERMINATE);

    if (vc->state != MPIDI_VC_STATE_CLOSED) {
        /* VC is terminated as a result of a fault.  Complete
         * outstanding sends with an error and terminate connection
         * immediately. */
        MPIR_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
    }
    else {
        while (vc_area->pending_sends > 0)
            MPID_nem_mxm_poll(FALSE);
    }

    mpi_errno = MPIDI_CH3U_Handle_connection(vc, MPIDI_VC_EVENT_TERMINATED);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MXM_VC_TERMINATE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
Ejemplo n.º 2
0
int MPIR_MINLOC_check_dtype( MPI_Datatype type )
{
    int mpi_errno = MPI_SUCCESS;
    
    switch (type) {
    /* first the C types */
    case MPI_2INT: 
    case MPI_FLOAT_INT: 
    case MPI_LONG_INT: 
    case MPI_SHORT_INT: 
    case MPI_DOUBLE_INT: 
#if defined(HAVE_LONG_DOUBLE)
    case MPI_LONG_DOUBLE_INT: 
#endif
   /* now the Fortran types */
#ifdef HAVE_FORTRAN_BINDING
#ifndef HAVE_NO_FORTRAN_MPI_TYPES_IN_C
    case MPI_2INTEGER: 
    case MPI_2REAL: 
    case MPI_2DOUBLE_PRECISION: 
#endif
#endif
        break;

    default: MPIR_ERR_SET1(mpi_errno, MPI_ERR_OP, "**opundefined", "**opundefined %s", "MPI_MINLOC");
    }
    
    return mpi_errno;
}
Ejemplo n.º 3
0
int vc_terminate(MPIDI_VC_t *vc)
{
    int mpi_errno = MPI_SUCCESS;
    int req_errno = MPI_SUCCESS;
    MPID_nem_ptl_vc_area *const vc_ptl = VC_PTL(vc);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_VC_TERMINATE);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_VC_TERMINATE);

     if (vc->state != MPIDI_VC_STATE_CLOSED) {
        /* VC is terminated as a result of a fault.  Complete
           outstanding sends with an error and terminate
           connection immediately. */
        MPIR_ERR_SET1(req_errno, MPIX_ERR_PROC_FAILED, "**comm_fail", "**comm_fail %d", vc->pg_rank);
        mpi_errno = MPID_nem_ptl_vc_terminated(vc);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
     } else if (vc_ptl->num_queued_sends == 0) {
         mpi_errno = MPID_nem_ptl_vc_terminated(vc);
         if (mpi_errno) MPIR_ERR_POP(mpi_errno);
     } else {
         /* the send_queued function will call vc_terminated if vc->state is
            CLOSED and the last queued send has been sent*/
     }
    
 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_VC_TERMINATE);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Ejemplo n.º 4
0
int MPID_NS_Lookup(MPID_NS_Handle handle, const MPIR_Info * info_ptr,
                   const char service_name[], char port[])
{
    FILE *fp;
    char filename[MAXPATHLEN];
    int mpi_errno = MPI_SUCCESS;

    /* Determine file and directory name.  The file name is from
     * the service name */
    MPL_strncpy(filename, handle->dirname, MAXPATHLEN);
    MPL_strnapp(filename, service_name, MAXPATHLEN);

    fp = fopen(filename, "r");
    if (!fp) {
        /* --BEGIN ERROR HANDLING-- */
        port[0] = 0;
        MPIR_ERR_SET1(mpi_errno, MPI_ERR_NAME,
                      "**namepubnotpub", "**namepubnotpub %s", service_name);
        /* --END ERROR HANDLING-- */
    } else {
        /* The first line is the name, the second is the
         * process that published. We just read the name */
        if (!fgets(port, MPI_MAX_PORT_NAME, fp)) {
            /* --BEGIN ERROR HANDLING-- */
            port[0] = 0;
            MPIR_ERR_SET1(mpi_errno, MPI_ERR_NAME,
                          "**namepubnotfound", "**namepubnotfound %s", service_name);
            /* --END ERROR HANDLING-- */
        } else {
            char *nl;
            /* Remove the newline, if any.  We use fgets instead of fscanf
             * to allow port names to contain blanks */
            nl = strchr(port, '\n');
            if (nl)
                *nl = 0;
            /* printf("Read %s from %s\n", port, filename); */
        }
        fclose(fp);
    }
    return mpi_errno;
}
Ejemplo n.º 5
0
void MPIR_MINLOC( 
	void *invec, 
	void *inoutvec, 
	int *Len, 
	MPI_Datatype *type )
{
    int mpi_errno = MPI_SUCCESS;
    int i, len = *Len;
    
#ifdef HAVE_FORTRAN_BINDING
#ifndef HAVE_NO_FORTRAN_MPI_TYPES_IN_C
    int flen = len * 2; /* used for Fortran types */
#endif
#endif

    switch (*type) {
    /* first the C types */
    case MPI_2INT:       MPIR_MINLOC_C_CASE(MPIR_2int_loctype);        
    case MPI_FLOAT_INT:  MPIR_MINLOC_C_CASE(MPIR_floatint_loctype);
    case MPI_LONG_INT:   MPIR_MINLOC_C_CASE(MPIR_longint_loctype);
    case MPI_SHORT_INT:  MPIR_MINLOC_C_CASE(MPIR_shortint_loctype);
    case MPI_DOUBLE_INT: MPIR_MINLOC_C_CASE(MPIR_doubleint_loctype);
#if defined(HAVE_LONG_DOUBLE)
    case MPI_LONG_DOUBLE_INT: MPIR_MINLOC_C_CASE(MPIR_longdoubleint_loctype);
#endif

    /* now the Fortran types */
#ifdef HAVE_FORTRAN_BINDING
#ifndef HAVE_NO_FORTRAN_MPI_TYPES_IN_C
    case MPI_2INTEGER:          MPIR_MINLOC_F_CASE(MPI_Fint);
    case MPI_2REAL:             MPIR_MINLOC_F_CASE(MPIR_FC_REAL_CTYPE);
    case MPI_2DOUBLE_PRECISION: MPIR_MINLOC_F_CASE(MPIR_FC_DOUBLE_CTYPE);
#endif
#endif
	/* --BEGIN ERROR HANDLING-- */
    default: {
        MPIR_ERR_SET1(mpi_errno, MPI_ERR_OP, "**opundefined","**opundefined %s", "MPI_MINLOC" );
        {
            MPIR_Per_thread_t *per_thread = NULL;
            int err = 0;

            MPID_THREADPRIV_KEY_GET_ADDR(MPIR_ThreadInfo.isThreaded, MPIR_Per_thread_key,
                                         MPIR_Per_thread, per_thread, &err);
            MPIR_Assert(err == 0);
            per_thread->op_errno = mpi_errno;
        }
        break;
    }
	/* --END ERROR HANDLING-- */
    }

}
Ejemplo n.º 6
0
void MPIR_MINLOC( 
	void *invec, 
	void *inoutvec, 
	int *Len, 
	MPI_Datatype *type )
{
    int mpi_errno = MPI_SUCCESS;
    int i, len = *Len;
    
#ifdef HAVE_FORTRAN_BINDING
#ifndef HAVE_NO_FORTRAN_MPI_TYPES_IN_C
    int flen = len * 2; /* used for Fortran types */
#endif
#endif

    switch (*type) {
    /* first the C types */
    case MPI_2INT:       MPIR_MINLOC_C_CASE(MPIR_2int_loctype);        
    case MPI_FLOAT_INT:  MPIR_MINLOC_C_CASE(MPIR_floatint_loctype);
    case MPI_LONG_INT:   MPIR_MINLOC_C_CASE(MPIR_longint_loctype);
    case MPI_SHORT_INT:  MPIR_MINLOC_C_CASE(MPIR_shortint_loctype);
    case MPI_DOUBLE_INT: MPIR_MINLOC_C_CASE(MPIR_doubleint_loctype);
#if defined(HAVE_LONG_DOUBLE)
    case MPI_LONG_DOUBLE_INT: MPIR_MINLOC_C_CASE(MPIR_longdoubleint_loctype);
#endif

    /* now the Fortran types */
#ifdef HAVE_FORTRAN_BINDING
#ifndef HAVE_NO_FORTRAN_MPI_TYPES_IN_C
    case MPI_2INTEGER:          MPIR_MINLOC_F_CASE(MPI_Fint);
    case MPI_2REAL:             MPIR_MINLOC_F_CASE(MPIR_FC_REAL_CTYPE);
    case MPI_2DOUBLE_PRECISION: MPIR_MINLOC_F_CASE(MPIR_FC_DOUBLE_CTYPE);
#endif
#endif
	/* --BEGIN ERROR HANDLING-- */
    default: {
	MPID_THREADPRIV_DECL;
	MPID_THREADPRIV_GET;
        MPIR_ERR_SET1(mpi_errno, MPI_ERR_OP, "**opundefined","**opundefined %s", "MPI_MINLOC" );
        MPID_THREADPRIV_FIELD(op_errno) = mpi_errno;
        break;
    }
	/* --END ERROR HANDLING-- */
    }

}
Ejemplo n.º 7
0
int MPID_nem_tcp_vc_terminate(MPIDI_VC_t *vc)
{
    int mpi_errno = MPI_SUCCESS;
    int req_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_TCP_VC_TERMINATE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_TCP_VC_TERMINATE);

    if (vc->state != MPIDI_VC_STATE_CLOSED) {
        /* VC is terminated as a result of a fault.  Complete
           outstanding sends with an error and terminate
           connection immediately. */
        MPIR_ERR_SET1(req_errno, MPIX_ERR_PROC_FAILED, "**comm_fail", "**comm_fail %d", vc->pg_rank);
        mpi_errno = MPID_nem_tcp_error_out_send_queue(vc, req_errno);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        mpi_errno = MPID_nem_tcp_vc_terminated(vc);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    } else {
        MPID_nem_tcp_vc_area *vc_tcp = VC_TCP(vc);
        /* VC is terminated as a result of the close protocol.
           Wait for sends to complete, then terminate. */

        if (MPIDI_CH3I_Sendq_empty(vc_tcp->send_queue)) {
            /* The sendq is empty, so we can immediately terminate
               the connection. */
            mpi_errno = MPID_nem_tcp_vc_terminated(vc);
            if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        }
        /* else: just return.  We'll call vc_terminated() from the
           commrdy_handler once the sendq is empty. */
    }

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_TCP_VC_TERMINATE);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Ejemplo n.º 8
0
/*@ 
   MPIDI_PG_Finalize - Finalize the process groups, including freeing all
   process group structures
  @*/
int MPIDI_PG_Finalize(void)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_PG_t *pg, *pgNext;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_PG_FINALIZE);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_PG_FINALIZE);

    /* Print the state of the process groups */
    if (verbose) {
	MPIU_PG_Printall( stdout );
    }

    /* FIXME - straighten out the use of PMI_Finalize - no use after 
       PG_Finalize */
    if (pg_world->connData) {
#ifdef USE_PMI2_API
        mpi_errno = PMI2_Finalize();
        if (mpi_errno) MPIR_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**ch3|pmi_finalize");
#else
	int rc;
	rc = PMI_Finalize();
	if (rc) {
	    MPIR_ERR_SET1(mpi_errno,MPI_ERR_OTHER, 
			  "**ch3|pmi_finalize", 
			  "**ch3|pmi_finalize %d", rc);
	}
#endif
    }

    /* Free the storage associated with the process groups */
    pg = MPIDI_PG_list;
    while (pg) {
	pgNext = pg->next;
	
	/* In finalize, we free all process group information, even if
	   the ref count is not zero.  This can happen if the user
	   fails to use MPI_Comm_disconnect on communicators that
	   were created with the dynamic process routines.*/
        /* XXX DJG FIXME-MT should we be checking this? */
	if (MPIR_Object_get_ref(pg) == 0 || 1) {
	    if (pg == MPIDI_Process.my_pg)
		MPIDI_Process.my_pg = NULL;

	    MPIR_Object_set_ref(pg, 0); /* satisfy assertions in PG_Destroy */
	    MPIDI_PG_Destroy( pg );
	}
	pg     = pgNext;
    }

    /* If COMM_WORLD is still around (it normally should be), 
       try to free it here.  The reason that we need to free it at this 
       point is that comm_world (and comm_self) still exist, and 
       hence the usual process to free the related VC structures will
       not be invoked. */
    if (MPIDI_Process.my_pg) {
	MPIDI_PG_Destroy(MPIDI_Process.my_pg);
    } 
    MPIDI_Process.my_pg = NULL;

    /* ifdefing out this check because the list will not be NULL in 
       Ch3_finalize because
       one additional reference is retained in MPIDI_Process.my_pg. 
       That reference is released
       only after ch3_finalize returns. If I release it before ch3_finalize, 
       the ssm channel crashes. */
#if 0
    if (MPIDI_PG_list != NULL)
    { 
	
	/* --BEGIN ERROR HANDLING-- */
	mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_INTERN,
        "**dev|pg_finalize|list_not_empty", NULL); 
	/* --END ERROR HANDLING-- */
    }
#endif

    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_PG_FINALIZE);
    return mpi_errno;
}
Ejemplo n.º 9
0
int MPIDI_PG_Destroy(MPIDI_PG_t * pg)
{
    MPIDI_PG_t * pg_prev;
    MPIDI_PG_t * pg_cur;
    int i;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_PG_DESTROY);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_PG_DESTROY);

    MPIR_Assert(MPIR_Object_get_ref(pg) == 0);

    pg_prev = NULL;
    pg_cur = MPIDI_PG_list;
    while(pg_cur != NULL)
    {
	if (pg_cur == pg)
	{
	    if (MPIDI_PG_iterator_next == pg)
	    { 
		MPIDI_PG_iterator_next = MPIDI_PG_iterator_next->next;
	    }

            if (pg_prev == NULL)
                MPIDI_PG_list = pg->next; 
            else
                pg_prev->next = pg->next;

            MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_DISCONNECT, VERBOSE, (MPL_DBG_FDEST, "destroying pg=%p pg->id=%s", pg, (char *)pg->id));

            for (i = 0; i < pg->size; ++i) {
                /* FIXME it would be good if we could make this assertion.
                   Unfortunately, either:
                   1) We're not being disciplined and some caller of this
                      function doesn't bother to manage all the refcounts
                      because he thinks he knows better.  Annoying, but not
                      strictly a bug.
		      (wdg - actually, that is a bug - managing the ref
		      counts IS required and missing one is a bug.)
                   2) There is a real bug lurking out there somewhere and we
                      just haven't hit it in the tests yet.  */
                /*MPIR_Assert(MPIR_Object_get_ref(pg->vct[i]) == 0);*/

                MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_DISCONNECT, VERBOSE, (MPL_DBG_FDEST, "about to free pg->vct=%p which contains vc=%p", pg->vct, &pg->vct[i]));

                /* This used to be handled in MPIDI_VCRT_Release, but that was
                   not the right place to do this.  The VC should only be freed
                   when the PG that it belongs to is freed, not just when the
                   VC's refcount drops to zero. [goodell@ 2008-06-13] */
		/* In that case, the fact that the VC is in the PG should
		   increment the ref count - reflecting the fact that the
		   use in the PG constitutes a reference-count-incrementing
		   use.  Alternately, if the PG is able to recreate a VC, 
		   and can thus free unused (or idle) VCs, it should be allowed
		   to do so.  [wdg 2008-08-31] */
                mpi_errno = MPIDI_CH3_VC_Destroy(&(pg->vct[i]));
                if (mpi_errno) { MPIR_ERR_POP(mpi_errno); }
            }

	    MPIDI_PG_Destroy_fn(pg);
	    MPL_free(pg->vct);
	    if (pg->connData) {
		if (pg->freeConnInfo) {
		    (*pg->freeConnInfo)( pg );
		}
		else {
		    MPL_free(pg->connData);
		}
	    }
	    mpi_errno = MPIDI_CH3_PG_Destroy(pg);
	    MPL_free(pg);

	    goto fn_exit;
	}

	pg_prev = pg_cur;
	pg_cur = pg_cur->next;
    }

    /* PG not found if we got here */
    MPIR_ERR_SET1(mpi_errno,MPI_ERR_OTHER,
		  "**dev|pg_not_found", "**dev|pg_not_found %p", pg);

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_PG_DESTROY);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
Ejemplo n.º 10
0
/*@
   MPI_Win_lock - Begin an RMA access epoch at the target process.

Input Parameters:
+ lock_type - Indicates whether other processes may access the target 
   window at the same time (if 'MPI_LOCK_SHARED') or not ('MPI_LOCK_EXCLUSIVE')
. rank - rank of locked window (nonnegative integer) 
. assert - Used to optimize this call; zero may be used as a default.
  See notes. (integer) 
- win - window object (handle) 

   Notes:

   The name of this routine is misleading.  In particular, this
   routine need not block, except when the target process is the calling 
   process.  

   Implementations may restrict the use of RMA communication that is 
   synchronized
   by lock calls to windows in memory allocated by 'MPI_Alloc_mem'. Locks can 
   be used portably only in such memory. 

   The 'assert' argument is used to indicate special conditions for the
   fence that an implementation may use to optimize the 'MPI_Win_lock' 
   operation.  The value zero is always correct.  Other assertion values
   may be or''ed together.  Assertions that are valid for 'MPI_Win_lock' are\:

. MPI_MODE_NOCHECK - no other process holds, or will attempt to acquire a 
  conflicting lock, while the caller holds the window lock. This is useful 
  when mutual exclusion is achieved by other means, but the coherence 
  operations that may be attached to the lock and unlock calls are still 
  required. 

.N ThreadSafe

.N Fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_RANK
.N MPI_ERR_WIN
.N MPI_ERR_OTHER
@*/
int MPI_Win_lock(int lock_type, int rank, int assert, MPI_Win win)
{
    static const char FCNAME[] = "MPI_Win_lock";
    int mpi_errno = MPI_SUCCESS;
    MPIR_Win *win_ptr = NULL;
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_WIN_LOCK);

    MPIR_ERRTEST_INITIALIZED_ORDIE();
    
    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPI_WIN_LOCK);

    /* Validate parameters, especially handles needing to be converted */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
	    MPIR_ERRTEST_WIN(win, mpi_errno);
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif
    
    /* Convert MPI object handles to object pointers */
    MPIR_Win_get_ptr( win, win_ptr );

    /* Validate parameters and objects (post conversion) */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            MPIR_Comm *comm_ptr;

            /* Validate win_ptr */
            MPIR_Win_valid_ptr( win_ptr, mpi_errno );
            /* If win_ptr is not valid, it will be reset to null */
            if (mpi_errno) goto fn_fail;
	    
	    if (assert != 0 && assert != MPI_MODE_NOCHECK) {
		MPIR_ERR_SET1(mpi_errno,MPI_ERR_ARG,
			      "**lockassertval", 
			      "**lockassertval %d", assert );
		if (mpi_errno) goto fn_fail;
	    }
            if (lock_type != MPI_LOCK_SHARED && 
		lock_type != MPI_LOCK_EXCLUSIVE) {
		MPIR_ERR_SET(mpi_errno,MPI_ERR_OTHER, "**locktype" );
                if (mpi_errno) goto fn_fail;
	    }

	    comm_ptr = win_ptr->comm_ptr;
            MPIR_ERRTEST_SEND_RANK(comm_ptr, rank, mpi_errno);

            /* TODO: Test if window is unlocked */

            /* TODO: Validate that window is not in active mode */
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ...  */
    
    mpi_errno = MPID_Win_lock(lock_type, rank, assert, win_ptr);
    if (mpi_errno != MPI_SUCCESS) goto fn_fail;

    /* ... end of body of routine ... */

  fn_exit:
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPI_WIN_LOCK);
    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
#   ifdef HAVE_ERROR_CHECKING
    {
	mpi_errno = MPIR_Err_create_code(
	    mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**mpi_win_lock", 
	    "**mpi_win_lock %d %d %A %W", lock_type, rank, assert, win);
    }
#   endif
    mpi_errno = MPIR_Err_return_win( win_ptr, FCNAME, mpi_errno );
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Ejemplo n.º 11
0
int MPIDI_CH3U_Check_for_failed_procs(void)
{
    int mpi_errno = MPI_SUCCESS;
    int pmi_errno;
    int len;
    char *kvsname;
    MPIR_Group *prev_failed_group, *new_failed_group;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);

    /* FIXME: Currently this only handles failed processes in
       comm_world.  We need to fix hydra to include the pgid along
       with the rank, then we need to create the failed group from
       something bigger than comm_world. */
    mpi_errno = MPIDI_PG_GetConnKVSname(&kvsname);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);
#ifdef USE_PMI2_API
    {
        int vallen = 0;
        pmi_errno = PMI2_KVS_Get(kvsname, PMI2_ID_NULL, "PMI_dead_processes", MPIDI_failed_procs_string, PMI2_MAX_VALLEN, &vallen);
        MPIR_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get");
    }
#else
    pmi_errno = PMI_KVS_Get_value_length_max(&len);
    MPIR_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get_value_length_max");
    pmi_errno = PMI_KVS_Get(kvsname, "PMI_dead_processes", MPIDI_failed_procs_string, len);
    MPIR_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get");
#endif

    if (*MPIDI_failed_procs_string == '\0') {
        /* there are no failed processes */
        MPIDI_Failed_procs_group = MPIR_Group_empty;
        goto fn_exit;
    }

    MPL_DBG_MSG_S(MPIDI_CH3_DBG_OTHER, TYPICAL, "Received proc fail notification: %s", MPIDI_failed_procs_string);

    /* save reference to previous group so we can identify new failures */
    prev_failed_group = MPIDI_Failed_procs_group;

    /* Parse the list of failed processes */
    MPIDI_CH3U_Get_failed_group(-2, &MPIDI_Failed_procs_group);

    /* get group of newly failed processes */
    mpi_errno = MPIR_Group_difference_impl(MPIDI_Failed_procs_group, prev_failed_group, &new_failed_group);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    if (new_failed_group != MPIR_Group_empty) {
        mpi_errno = MPIDI_CH3I_Comm_handle_failed_procs(new_failed_group);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        mpi_errno = terminate_failed_VCs(new_failed_group);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Group_release(new_failed_group);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }

    /* free prev group */
    if (prev_failed_group != MPIR_Group_empty) {
        mpi_errno = MPIR_Group_release(prev_failed_group);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);
    return mpi_errno;

 fn_oom: /* out-of-memory handler for utarray operations */
    MPIR_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "utarray");
 fn_fail:
    goto fn_exit;
}
Ejemplo n.º 12
0
int MPIDI_CH3U_Get_failed_group(int last_rank, MPIR_Group **failed_group)
{
    char *c;
    int i, mpi_errno = MPI_SUCCESS, rank;
    UT_array *failed_procs = NULL;
    MPIR_Group *world_group;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_GET_FAILED_GROUP);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_GET_FAILED_GROUP);

    MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER, VERBOSE, "Getting failed group with %d as last acknowledged\n", last_rank);

    if (-1 == last_rank) {
        MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER, VERBOSE, "No failure acknowledged");
        *failed_group = MPIR_Group_empty;
        goto fn_exit;
    }

    if (*MPIDI_failed_procs_string == '\0') {
        MPL_DBG_MSG(MPIDI_CH3_DBG_OTHER, VERBOSE, "Found no failed ranks");
        *failed_group = MPIR_Group_empty;
        goto fn_exit;
    }

    utarray_new(failed_procs, &ut_int_icd);

    /* parse list of failed processes.  This is a comma separated list
       of ranks or ranges of ranks (e.g., "1, 3-5, 11") */
    i = 0;
    c = MPIDI_failed_procs_string;
    while(1) {
        parse_rank(&rank);
        ++i;
        MPL_DBG_MSG_D(MPIDI_CH3_DBG_OTHER, VERBOSE, "Found failed rank: %d", rank);
        utarray_push_back(failed_procs, &rank);
        MPIDI_last_known_failed = rank;
        MPIR_ERR_CHKINTERNAL(*c != ',' && *c != '\0', mpi_errno, "error parsing failed process list");
        if (*c == '\0' || last_rank == rank)
            break;
        ++c; /* skip ',' */
    }

    /* Create group of failed processes for comm_world.  Failed groups for other
       communicators can be created from this one using group_intersection. */
    mpi_errno = MPIR_Comm_group_impl(MPIR_Process.comm_world, &world_group);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    mpi_errno = MPIR_Group_incl_impl(world_group, i, ut_int_array(failed_procs), failed_group);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    mpi_errno = MPIR_Group_release(world_group);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_GET_FAILED_GROUP);
    if (failed_procs)
        utarray_free(failed_procs);
    return mpi_errno;
fn_oom:
    MPIR_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "utarray");
fn_fail:
    goto fn_exit;
}