Example #1
int MPIDI_CH3_Abort(int exit_code, char *error_msg)
{
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_ABORT);
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_ABORT);

    /* print backtrace */
    if (show_backtrace) print_backtrace();
    
    PMI_Abort(exit_code, error_msg);

    /* if abort returns for some reason, exit here */

    MPIU_Error_printf("%s", error_msg);
    fflush(stderr);

    exit(exit_code);
#if defined(__SUNPRO_C) || defined(__SUNPRO_CC)
#pragma error_messages(off, E_STATEMENT_NOT_REACHED)
#endif /* defined(__SUNPRO_C) || defined(__SUNPRO_CC) */
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_ABORT);
    return MPI_ERR_INTERN;
#if defined(__SUNPRO_C) || defined(__SUNPRO_CC)
#pragma error_messages(default, E_STATEMENT_NOT_REACHED)
#endif /* defined(__SUNPRO_C) || defined(__SUNPRO_CC) */
}
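
For reference, the PMI-1 entry point that this and most of the examples below call is declared in the pmi.h shipped with MPICH and Slurm roughly as follows. This is a sketch for orientation only; const qualifiers and attributes vary between implementations, so check your local header.

/* PMI-1 (sketch, as commonly declared in pmi.h):
 * aborts all processes in the current job and passes error_msg
 * to the process manager; it is not expected to return. */
int PMI_Abort(int exit_code, const char error_msg[]);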
Example #2
int main(int argc, char **argv, char **envp)
{
    int pmi_rank = -1;
    int pmi_process_group_size = -1;
    int rc = EXIT_SUCCESS;
    char *err = NULL;
    PMI_BOOL pmi_initialized = PMI_FALSE;
    int i;
    double pi;
    int spawned;

    if (1 < argc) {
        rc = strtol(argv[1], NULL, 10);
    } else {
        rc = 3;
    }

    /* sanity */
    if (PMI_SUCCESS != PMI_Initialized(&pmi_initialized) ||
        PMI_TRUE == pmi_initialized) {
        fprintf(stderr, "=== ERROR: PMI sanity failure\n");
        return EXIT_FAILURE;
    }
    if (PMI_SUCCESS != PMI_Init(&spawned)) {
        err = "PMI_Init failure!";
        goto done;
    }
    if (PMI_SUCCESS != PMI_Get_size(&pmi_process_group_size)) {
        err = "PMI_Get_size failure!";
        goto done;
    }
    if (PMI_SUCCESS != PMI_Get_rank(&pmi_rank)) {
        err = "PMI_Get_rank failure!";
        goto done;
    }
 
    i = 0;
    while (1) {
        i++;
        pi = i / 3.14159256;
        if (i > 10000) i = 0;
        if ((pmi_rank == 3 || 
             (pmi_process_group_size <= 3 && pmi_rank == 0))
            && i == 9995) {
            asprintf(&err, "RANK%d CALLED ABORT", pmi_rank);
            fprintf(stderr, "%s\n", err);
            fflush(stderr);
            PMI_Abort(rc, err);
        }
    }

 done:
    if (NULL != err) {
        fprintf(stderr, "=== ERROR [rank:%d] %s\n", pmi_rank, err);
        rc = EXIT_FAILURE;
    }
    return rc;
}
Example #3
/* gasneti_bootstrapAbort
 */
void gasneti_bootstrapAbort_pmi(int exitcode) {
#if USE_PMI2_API
    PMI2_Abort(1, "GASNet abnormal exit");
#else
    PMI_Abort(exitcode, "GASNet abnormal exit");
#endif
    gasneti_fatalerror("PMI_Abort failed.");
    /* NOT REACHED */
}
Example #4
void mca_common_pmi_abort(int status, char *msg)
{
#if WANT_PMI2_SUPPORT
    if( mca_common_pmi_version == 2){
        PMI2_Abort(status, msg);
    }
    else
#endif
    {
        PMI_Abort(status, msg);
    }
}
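
Example #4 dispatches on the PMI version detected at runtime. One detail worth noting: the first argument of PMI2_Abort is not an exit code but a flag that selects whether the whole job or only the calling process is aborted, so reusing the caller's status there, as this wrapper does, relies on that status being nonzero. A sketch of the PMI2 prototype as declared in pmi2.h (verify against your installation):

/* PMI2 (sketch from pmi2.h):
 * flag != 0 aborts every process in the job, flag == 0 aborts
 * only the calling process; msg is reported by the process manager. */
int PMI2_Abort(int flag, const char msg[]);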
Example #5
int MPID_Abort(MPIR_Comm * comm, int mpi_errno, int exit_code, const char *error_msg)
{
    char sys_str[MPI_MAX_ERROR_STRING + 5] = "";
    char comm_str[MPI_MAX_ERROR_STRING] = "";
    char world_str[MPI_MAX_ERROR_STRING] = "";
    char error_str[2 * MPI_MAX_ERROR_STRING + 128];
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_ABORT);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_ABORT);

    if (MPIR_Process.comm_world) {
        int rank = MPIR_Process.comm_world->rank;
        snprintf(world_str, sizeof(world_str), " on node %d", rank);
    }

    if (comm) {
        int rank = comm->rank;
        int context_id = comm->context_id;
        snprintf(comm_str, sizeof(comm_str), " (rank %d in comm %d)", rank, context_id);
    }

    if (!error_msg)
        error_msg = "Internal error";

    if (mpi_errno != MPI_SUCCESS) {
        char msg[MPI_MAX_ERROR_STRING] = "";
        MPIR_Err_get_string(mpi_errno, msg, MPI_MAX_ERROR_STRING, NULL);
        snprintf(sys_str, sizeof(sys_str), " (%s)", msg);
    }
    MPL_snprintf(error_str, sizeof(error_str), "Abort(%d)%s%s: %s%s\n",
                 exit_code, world_str, comm_str, error_msg, sys_str);
    MPL_error_printf("%s", error_str);

    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_ABORT);
    fflush(stderr);
    fflush(stdout);
    if (NULL == comm || (MPIR_Comm_size(comm) == 1 && comm->comm_kind == MPIR_COMM_KIND__INTRACOMM))
        MPL_exit(exit_code);

    if (comm != MPIR_Process.comm_world) {
        MPIDIG_comm_abort(comm, exit_code);
    } else {
#ifdef USE_PMIX_API
        PMIx_Abort(exit_code, error_msg, NULL, 0);
#elif defined(USE_PMI2_API)
        PMI2_Abort(TRUE, error_msg);
#else
        PMI_Abort(exit_code, error_msg);
#endif
    }
    return 0;
}
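
Example #5 also covers the PMIx path. Unlike PMI_Abort, the PMIx call takes an optional array of target processes; passing NULL and 0, as above, asks the runtime to abort every process in the caller's namespace. A sketch of the prototype from pmix.h (the PMIx standard is authoritative):

/* PMIx (sketch from pmix.h):
 * procs/nprocs select which processes to abort; NULL and 0
 * request that the caller's entire namespace be aborted. */
pmix_status_t PMIx_Abort(int status, const char msg[],
                         pmix_proc_t procs[], size_t nprocs);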
Example #6
void
ompi_rte_abort(int error_code, char *fmt, ...)
{
    char *msg;
    int ret;
    va_list ap;

    va_start(ap, fmt);

    ret = vasprintf(&msg, fmt, ap);
    if (-1 == ret) msg = "";

    va_end(ap);

    PMI_Abort(error_code, msg);
}
Example #7
RTE_PUBLIC void rte_pmi_abort(int error_code, char *exit_description, ...)
{
    int rc;

    va_list arglist;
    char* buffer = NULL;

    /* If there was a message, output it */
    va_start(arglist, exit_description);
    if( NULL != exit_description ) {
        vasprintf( &buffer, exit_description, arglist );
    }

    rc = PMI_Abort (error_code, buffer);
    if (PMI_SUCCESS == rc) {
        free( buffer );
        return;
    }

    /* we might want to put some warning here */

    free( buffer );
    return;
}
Example #8
static int s1_abort(int flag, const char msg[],
                    opal_list_t *procs)
{
    PMI_Abort(flag, msg);
    return OPAL_SUCCESS;
}
Example #9
int main(void)
{
  int rc;
  int rank, size;

  PMI_BOOL initialized;
  rc = PMI_Initialized(&initialized);
  if (rc!=PMI_SUCCESS)
    PMI_Abort(rc,"PMI_Initialized failed");

  if (initialized!=PMI_TRUE)
  {
    int spawned;
    rc = PMI_Init(&spawned);
    if (rc!=PMI_SUCCESS)
      PMI_Abort(rc,"PMI_Init failed");
  }

  rc = PMI_Get_rank(&rank);
  if (rc!=PMI_SUCCESS)
    PMI_Abort(rc,"PMI_Get_rank failed");

  rc = PMI_Get_size(&size);
  if (rc!=PMI_SUCCESS)
    PMI_Abort(rc,"PMI_Get_size failed");

  printf("rank %d of %d \n", rank, size);

  int rpn; /* rpn = ranks per node */
  rc = PMI_Get_clique_size(&rpn);
  if (rc!=PMI_SUCCESS)
    PMI_Abort(rc,"PMI_Get_clique_size failed");
  printf("rank %d clique size %d \n", rank, rpn);

  int * clique_ranks = malloc( rpn * sizeof(int) );
  if (clique_ranks==NULL)
    PMI_Abort(rpn,"malloc failed");

  rc = PMI_Get_clique_ranks(clique_ranks, rpn);
  if (rc!=PMI_SUCCESS)
    PMI_Abort(rc,"PMI_Get_clique_ranks failed");
  for(int i = 0; i<rpn; i++)
    printf("rank %d clique[%d] = %d \n", rank, i, clique_ranks[i]);

  int nid;
  rc = PMI_Get_nid(rank, &nid);
  if (rc!=PMI_SUCCESS)
    PMI_Abort(rc,"PMI_Get_nid failed");
  printf("rank %d PMI_Get_nid gives nid %d \n", rank, nid);

#if OLD
  rca_mesh_coord_t xyz;
  rca_get_meshcoord( (uint16_t) nid, &xyz);
  printf("rank %d rca_get_meshcoord returns (%2u,%2u,%2u)\n", 
         rank, xyz.mesh_x, xyz.mesh_y, xyz.mesh_z);
#else // UNTESTED
  pmi_mesh_coord_t xyz;
  PMI_Get_meshcoord((uint16_t) nid, &xyz);
  printf("rank %d PMI_Get_meshcoord returns (%2u,%2u,%2u)\n", 
         rank, xyz.mesh_x, xyz.mesh_y, xyz.mesh_z);
#endif

  fflush(stdout);
  return 0;
}
Example #10
int pmgr_init(int *argc_p, char ***argv_p, int *np_p, int *me_p, int *id_p)
{
    setvbuf(stdout, NULL, _IONBF, 0);
    char *value;

    struct timeval start, end;
    pmgr_gettimeofday(&start);

    pmgr_echo_debug = 0;

    pmgr_tree_init_null(&pmgr_tree_all);

    /* =======================================================
     * Until told otherwise, assume we are rank 0 of a 1-task MPI job
     * this enables serial launching, e.g., "./mpiHello" vs "mpirun -np 1 ./mpiHello"
     * TODO: may want to protect this usage via a compile flag and/or env var
     * ======================================================= */

    /* Take a stab at something unique for the id (timestamp.secs | pid)
     * TODO: !!NOTE!!
     * Using a pid in the jobid *ONLY* works for a single process job
     * Obviously, multiple tasks will have different pids */
    pmgr_id = 0x7FFFFFFF & ((start.tv_sec << 16) | (0x0000FFFF & getpid()));

    pmgr_me     = 0;
    pmgr_nprocs = 1;

    mpirun_hostname = NULL;
    mpirun_port = 1;

    /* =======================================================
     * Get information from environment, not from the argument list
     * ======================================================= */

    /* if MPIRUN_RANK is set, require RANK, NPROCS, ID, HOST, and PORT to all be set */
    /* this ensures that if one process aborts in a multitask job,
     * then something is there to abort the others, namely the mpirun process */
    if ((value = pmgr_getenv("MPIRUN_RANK", ENV_OPTIONAL)) != NULL) {
        /* MPI rank of current process */
        pmgr_me = atoi(pmgr_getenv("MPIRUN_RANK", ENV_REQUIRED));

        /* number of MPI processes in job */
        pmgr_nprocs = atoi(pmgr_getenv("MPIRUN_NPROCS", ENV_REQUIRED));

        /* unique jobid of current application */
        pmgr_id = atoi(pmgr_getenv("MPIRUN_ID", ENV_REQUIRED));

        /* mpirun host IP string in dotted decimal notation */
        mpirun_hostname = strdup(pmgr_getenv("MPIRUN_HOST", ENV_REQUIRED));

        /* mpirun port number */
        mpirun_port = atoi(pmgr_getenv("MPIRUN_PORT", ENV_REQUIRED));
    }

    if ((value = pmgr_getenv("MPIRUN_OPEN_TIMEOUT", ENV_OPTIONAL))) {
        mpirun_open_timeout = atoi(value);
    }

    if ((value = pmgr_getenv("MPIRUN_CONNECT_TRIES", ENV_OPTIONAL))) {
        mpirun_connect_tries = atoi(value);
    }

    /* seconds */
    if ((value = pmgr_getenv("MPIRUN_CONNECT_TIMEOUT", ENV_OPTIONAL))) {
        mpirun_connect_timeout = atoi(value);
    }

    /* seconds */
    if ((value = pmgr_getenv("MPIRUN_CONNECT_BACKOFF", ENV_OPTIONAL))) {
        mpirun_connect_backoff = atoi(value);
    }

    /* enable/disable randomized option in backoff */
    if ((value = pmgr_getenv("MPIRUN_CONNECT_RANDOM", ENV_OPTIONAL))) {
        mpirun_connect_random = atoi(value);
    }

    /* whether to connect tree from parent to children (down) or children to parent (up) */
    if ((value = pmgr_getenv("MPIRUN_CONNECT_DOWN", ENV_OPTIONAL))) {
        mpirun_connect_down = atoi(value);
    }

    /* MPIRUN_USE_TREES={0,1} disables/enables tree algorithms */
    if ((value = pmgr_getenv("MPIRUN_USE_TREES", ENV_OPTIONAL))) {
        mpirun_use_trees = atoi(value);
    }

    /* use pmi instead of socket connections to mpirun */
    if ((value = pmgr_getenv("MPIRUN_PMI_ENABLE", ENV_OPTIONAL))) {
#ifdef HAVE_PMI
        mpirun_pmi_enable = atoi(value);
#else /* ifdef HAVE_PMI */
        /* PMI was not compiled in, warn user that we're ignoring this value */
        if (pmgr_me == 0) {
            pmgr_error("Not built with PMI support, ignoring MPIRUN_USE_PMI @ %s:%d",
                __FILE__, __LINE__
            );
        }
#endif /* ifdef HAVE_PMI */
    }

    /* whether to use /dev/shm to start jobs */
    if ((value = pmgr_getenv("MPIRUN_SHM_ENABLE", ENV_OPTIONAL))) {
        mpirun_shm_enable = atoi(value);
    }

    /* minimum number of tasks to switch to /dev/shm */
    if ((value = pmgr_getenv("MPIRUN_SHM_THRESHOLD", ENV_OPTIONAL))) {
        mpirun_shm_threshold = atoi(value);
    }

    /* whether to authenticate connections */
    if ((value = pmgr_getenv("MPIRUN_AUTHENTICATE_ENABLE", ENV_OPTIONAL))) {
        mpirun_authenticate_enable = atoi(value);
    }

    /* time to wait for a reply when authenticating a new connection (millisecs) */
    if ((value = pmgr_getenv("MPIRUN_AUTHENTICATE_TIMEOUT", ENV_OPTIONAL))) {
        mpirun_authenticate_timeout = atoi(value);
    }

    /* total time to attempt to connect to a host before aborting (seconds) */
    if ((value = pmgr_getenv("MPIRUN_PORT_SCAN_TIMEOUT", ENV_OPTIONAL))) {
        mpirun_port_scan_timeout = atoi(value);
    }

    /* time to wait on connect call before giving up (millisecs) */
    if ((value = pmgr_getenv("MPIRUN_PORT_SCAN_CONNECT_TIMEOUT", ENV_OPTIONAL))) {
        mpirun_port_scan_connect_timeout = atoi(value);
    }

    /* number of times to attempt connect call to given IP:port */
    if ((value = pmgr_getenv("MPIRUN_PORT_SCAN_CONNECT_ATTEMPTS", ENV_OPTIONAL))) {
        mpirun_port_scan_connect_attempts = atoi(value);
    }

    /* time to wait between making consecutive connect attempts to a given IP:port (millisecs) */
    if ((value = pmgr_getenv("MPIRUN_PORT_SCAN_CONNECT_SLEEP", ENV_OPTIONAL))) {
        mpirun_port_scan_connect_sleep = atoi(value);
    }

    /* initialize PMI library if we're using it, and get rank, ranks, and jobid from PMI */
    if (mpirun_pmi_enable) {
#ifdef HAVE_PMI
        /* initialize the PMI library */
        int spawned = 0;
        if (PMI_Init(&spawned) != PMI_SUCCESS) {
            pmgr_error("Failed to initialize PMI library @ file %s:%d",
                __FILE__, __LINE__
            );
            PMI_Abort(1, "Failed to initialize PMI library");
        }
        if (spawned) {
            pmgr_error("Spawned processes not supported @ file %s:%d",
                __FILE__, __LINE__
            );
            PMI_Abort(1, "Spawned processes not supported");
        }

        /* get my rank */
        if (PMI_Get_rank(&pmgr_me) != PMI_SUCCESS) {
            pmgr_error("Getting rank @ file %s:%d",
                __FILE__, __LINE__
            );
            PMI_Abort(1, "Failed to get rank from PMI");
        }

        /* get the number of ranks in this job */
        if (PMI_Get_size(&pmgr_nprocs) != PMI_SUCCESS) {
            pmgr_error("Getting number of ranks in job @ file %s:%d",
                __FILE__, __LINE__
            );
            PMI_Abort(1, "Failed to get number of ranks in job");
        }

        /* get jobid */
        if (PMI_Get_appnum(&pmgr_id) != PMI_SUCCESS) {
            pmgr_error("Getting job id @ file %s:%d",
                __FILE__, __LINE__
            );
            PMI_Abort(1, "Failed to get job id from PMI");
        }
#endif /* ifdef HAVE_PMI */
    }

    /* =======================================================
     * Check that we have valid values
     * ======================================================= */

    /* MPIRUN_CLIENT_DEBUG={0,1} disables/enables debug statements */
    /* this comes *after* MPIRUN_RANK and MPIRUN_NPROCS since those are used to print debug messages */
    if ((value = pmgr_getenv("MPIRUN_CLIENT_DEBUG", ENV_OPTIONAL)) != NULL) {
        pmgr_echo_debug = atoi(value);
        int print_rank = 0;
        if (pmgr_echo_debug > 0) {
            if        (pmgr_echo_debug <= 1*PMGR_DEBUG_LEVELS) {
                print_rank = (pmgr_me == 0); /* just rank 0 prints */
            } else if (pmgr_echo_debug <= 2*PMGR_DEBUG_LEVELS) {
                print_rank = (pmgr_me == 0 || pmgr_me == pmgr_nprocs-1); /* just rank 0 and rank N-1 print */
            } else {
                print_rank = 1; /* all ranks print */
            }
            if (print_rank) {
                pmgr_echo_debug = 1 + (pmgr_echo_debug-1) % PMGR_DEBUG_LEVELS;
            } else {
                pmgr_echo_debug = 0;
            }
        }
    }

    /* check that we have a valid number of processes */
    if (pmgr_nprocs <= 0) {
        pmgr_error("Invalid NPROCS %s @ file %s:%d",
            pmgr_nprocs, __FILE__, __LINE__
        );
        exit(1);
    }

    /* check that our rank is valid */
    if (pmgr_me < 0 || pmgr_me >= pmgr_nprocs) {
        pmgr_error("Invalid RANK %s @ file %s:%d",
            pmgr_me, __FILE__, __LINE__
        );
        exit(1);
    }

    /* check that we have a valid jobid */
    if (pmgr_id == 0) {
        pmgr_error("Invalid JOBID %s @ file %s:%d",
            pmgr_id, __FILE__, __LINE__
        );
        exit(1);
    }

    /* set parameters */
    *np_p = pmgr_nprocs;
    *me_p = pmgr_me;
    *id_p = pmgr_id;

    pmgr_gettimeofday(&end);
    pmgr_debug(2, "Exiting pmgr_init(), took %f seconds for %d procs", pmgr_getsecs(&end,&start), pmgr_nprocs);
    return PMGR_SUCCESS;
}
Example #11
/*
 * Call into the process spawner, using the same port we were given
 * at startup time, to tell it to abort the entire job.
 */
int pmgr_abort(int code, const char *fmt, ...)
{
    int s;
    struct sockaddr_in sin;
    struct hostent* he;
    va_list ap;
    char buf [256];
    int len;

    /* if the tree is open, send out abort messages to parent and children */
    pmgr_abort_trees();

    /* build our error message */
    va_start(ap, fmt);
    vprint_msg(buf, sizeof(buf), fmt, ap);
    va_end(ap);

    /* check whether we have an mpirun process, and check whether we can connect back to it */
    if (mpirun_hostname != NULL && !mpirun_pmi_enable && !(mpirun_shm_enable && pmgr_nprocs >= mpirun_shm_threshold)) {
        he = gethostbyname(mpirun_hostname);
        if (!he) {
            pmgr_error("pmgr_abort: Hostname lookup of mpirun failed (gethostbyname(%s) %s h_errno=%d) @ file %s:%d",
                mpirun_hostname, hstrerror(h_errno), h_errno, __FILE__, __LINE__
            );
            return -1;
        }

        s = socket(AF_INET, SOCK_STREAM, 0);
        if (s < 0) {
            pmgr_error("pmgr_abort: Failed to create socket (socket() %m errno=%d) @ file %s:%d",
                errno, __FILE__, __LINE__
            );
            return -1;
        }

        memset(&sin, 0, sizeof(sin));
        sin.sin_family = he->h_addrtype;
        memcpy(&sin.sin_addr, he->h_addr_list[0], sizeof(sin.sin_addr));
        sin.sin_port = htons(mpirun_port);
        if (connect(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) {
            pmgr_error("pmgr_abort: Connect to mpirun failed (connect() %m errno=%d) @ file %s:%d",
                errno, __FILE__, __LINE__
            );
            return -1;
        }

        /* write an abort code (may be destination rank), our rank to mpirun */
        pmgr_write_fd(s, &code, sizeof(code));
        pmgr_write_fd(s, &pmgr_me, sizeof(pmgr_me));

        /* now length of error string, and error string itself to mpirun */
        len = strlen(buf) + 1;
        pmgr_write_fd(s, &len, sizeof(len));
        pmgr_write_fd(s, buf, len);

        close(s);
    } else { /* check that (mpirun_hostname != NULL) */
        /* TODO: want to echo this message here?  Want to do this for every user abort? */
        pmgr_error("Called pmgr_abort() Code: %d, Msg: %s", code, buf);
    }

    if (mpirun_pmi_enable) {
#ifdef HAVE_PMI
        PMI_Abort(code, buf);
#endif /* ifdef HAVE_PMI */
    }

    return PMGR_SUCCESS;
}
Example #12
static int s1_abort(int flag, const char msg[])
{
    PMI_Abort(flag, msg);
    return OPAL_SUCCESS;
}
Example #13
int
ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs)
{
    PMI_Abort(1, "");
    return OMPI_SUCCESS;
}