int MPIDI_CH3_Abort(int exit_code, char *error_msg)
{
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_ABORT);
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_ABORT);

    /* print backtrace */
    if (show_backtrace)
        print_backtrace();

    PMI_Abort(exit_code, error_msg);

    /* if abort returns for some reason, exit here */
    MPIU_Error_printf("%s", error_msg);
    fflush(stderr);
    exit(exit_code);

#if defined(__SUNPRO_C) || defined(__SUNPRO_CC)
#pragma error_messages(off, E_STATEMENT_NOT_REACHED)
#endif /* defined(__SUNPRO_C) || defined(__SUNPRO_CC) */
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_ABORT);
    return MPI_ERR_INTERN;
#if defined(__SUNPRO_C) || defined(__SUNPRO_CC)
#pragma error_messages(default, E_STATEMENT_NOT_REACHED)
#endif /* defined(__SUNPRO_C) || defined(__SUNPRO_CC) */
}
int main(int argc, char **argv, char **envp)
{
    int pmi_rank = -1;
    int pmi_process_group_size = -1;
    int rc = EXIT_SUCCESS;
    char *err = NULL;
    PMI_BOOL pmi_initialized = PMI_FALSE;
    int i;
    double pi;
    int spawned;

    if (1 < argc) {
        rc = strtol(argv[1], NULL, 10);
    } else {
        rc = 3;
    }

    /* sanity: PMI must not already be initialized at this point */
    if (PMI_SUCCESS != PMI_Initialized(&pmi_initialized) ||
        PMI_TRUE == pmi_initialized) {
        fprintf(stderr, "=== ERROR: PMI sanity failure\n");
        return EXIT_FAILURE;
    }

    if (PMI_SUCCESS != PMI_Init(&spawned)) {
        err = "PMI_Init failure!";
        goto done;
    }
    if (PMI_SUCCESS != PMI_Get_size(&pmi_process_group_size)) {
        err = "PMI_Get_size failure!";
        goto done;
    }
    if (PMI_SUCCESS != PMI_Get_rank(&pmi_rank)) {
        err = "PMI_Get_rank failure!";
        goto done;
    }

    /* spin until the designated rank calls PMI_Abort */
    i = 0;
    while (1) {
        i++;
        pi = i / 3.14159256;
        if (i > 10000)
            i = 0;
        if ((pmi_rank == 3 ||
             (pmi_process_group_size <= 3 && pmi_rank == 0)) && i == 9995) {
            asprintf(&err, "RANK%d CALLED ABORT", pmi_rank);
            fprintf(stderr, "%s\n", err);
            fflush(stderr);
            PMI_Abort(rc, err);
        }
    }

done:
    if (NULL != err) {
        fprintf(stderr, "=== ERROR [rank:%d] %s\n", pmi_rank, err);
        rc = EXIT_FAILURE;
    }
    return rc;
}
/* gasneti_bootstrapAbort */
void gasneti_bootstrapAbort_pmi(int exitcode)
{
#if USE_PMI2_API
    PMI2_Abort(1, "GASNet abnormal exit");
#else
    PMI_Abort(exitcode, "GASNet abnormal exit");
#endif

    gasneti_fatalerror("PMI_Abort failed.");
    /* NOT REACHED */
}
void mca_common_pmi_abort(int status, char *msg)
{
#if WANT_PMI2_SUPPORT
    if (mca_common_pmi_version == 2) {
        PMI2_Abort(status, msg);
    } else
#endif
    {
        PMI_Abort(status, msg);
    }
}
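/*
 * A minimal sketch of the same selection pattern as the two wrappers above:
 * pick PMI2_Abort or PMI_Abort, then exit locally in case the abort call
 * returns. The HAVE_PMI2 macro and the my_pmi_abort name are assumptions for
 * illustration only; they are not taken from either project quoted here.
 */
#include <stdio.h>
#include <stdlib.h>
#ifdef HAVE_PMI2
#include <pmi2.h>
#else
#include <pmi.h>
#endif

static void my_pmi_abort(int status, const char *msg)
{
#ifdef HAVE_PMI2
    /* PMI-2: the first argument is a flag requesting that all processes abort */
    PMI2_Abort(1, msg);
#else
    /* PMI-1: the first argument is the exit code reported to the launcher */
    PMI_Abort(status, msg);
#endif
    /* the abort call normally does not return; fall back to a local exit if it does */
    fprintf(stderr, "%s\n", msg);
    exit(status);
}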
int MPID_Abort(MPIR_Comm * comm, int mpi_errno, int exit_code, const char *error_msg)
{
    char sys_str[MPI_MAX_ERROR_STRING + 5] = "";
    char comm_str[MPI_MAX_ERROR_STRING] = "";
    char world_str[MPI_MAX_ERROR_STRING] = "";
    char error_str[2 * MPI_MAX_ERROR_STRING + 128];
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_ABORT);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_ABORT);

    if (MPIR_Process.comm_world) {
        int rank = MPIR_Process.comm_world->rank;
        snprintf(world_str, sizeof(world_str), " on node %d", rank);
    }
    if (comm) {
        int rank = comm->rank;
        int context_id = comm->context_id;
        snprintf(comm_str, sizeof(comm_str), " (rank %d in comm %d)", rank, context_id);
    }
    if (!error_msg)
        error_msg = "Internal error";
    if (mpi_errno != MPI_SUCCESS) {
        char msg[MPI_MAX_ERROR_STRING] = "";
        MPIR_Err_get_string(mpi_errno, msg, MPI_MAX_ERROR_STRING, NULL);
        snprintf(sys_str, sizeof(msg), " (%s)", msg);
    }

    MPL_snprintf(error_str, sizeof(error_str), "Abort(%d)%s%s: %s%s\n",
                 exit_code, world_str, comm_str, error_msg, sys_str);
    MPL_error_printf("%s", error_str);

    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_ABORT);
    fflush(stderr);
    fflush(stdout);

    if (NULL == comm ||
        (MPIR_Comm_size(comm) == 1 && comm->comm_kind == MPIR_COMM_KIND__INTRACOMM))
        MPL_exit(exit_code);

    if (comm != MPIR_Process.comm_world) {
        MPIDIG_comm_abort(comm, exit_code);
    } else {
#ifdef USE_PMIX_API
        PMIx_Abort(exit_code, error_msg, NULL, 0);
#elif defined(USE_PMI2_API)
        PMI2_Abort(TRUE, error_msg);
#else
        PMI_Abort(exit_code, error_msg);
#endif
    }
    return 0;
}
void ompi_rte_abort(int error_code, char *fmt, ...)
{
    char *msg;
    int ret;
    va_list ap;

    va_start(ap, fmt);
    ret = vasprintf(&msg, fmt, ap);
    if (-1 == ret)
        msg = "";
    va_end(ap);

    PMI_Abort(error_code, msg);
}
RTE_PUBLIC void rte_pmi_abort(int error_code, char *exit_description, ...)
{
    int rc;
    va_list arglist;
    char *buffer = NULL;

    /* If there was a message, format it */
    va_start(arglist, exit_description);
    if (NULL != exit_description) {
        vasprintf(&buffer, exit_description, arglist);
    }
    va_end(arglist);

    rc = PMI_Abort(error_code, buffer);
    if (PMI_SUCCESS == rc) {
        free(buffer);
        return;
    }

    /* we might want to put some warning here */
    free(buffer);
    return;
}
static int s1_abort(int flag, const char msg[], opal_list_t *procs)
{
    PMI_Abort(flag, msg);
    return OPAL_SUCCESS;
}
int main(void)
{
    int rc;
    int rank, size;

    PMI_BOOL initialized;
    rc = PMI_Initialized(&initialized);
    if (rc != PMI_SUCCESS)
        PMI_Abort(rc, "PMI_Initialized failed");

    if (initialized != PMI_TRUE) {
        int spawned;
        rc = PMI_Init(&spawned);
        if (rc != PMI_SUCCESS)
            PMI_Abort(rc, "PMI_Init failed");
    }

    rc = PMI_Get_rank(&rank);
    if (rc != PMI_SUCCESS)
        PMI_Abort(rc, "PMI_Get_rank failed");

    rc = PMI_Get_size(&size);
    if (rc != PMI_SUCCESS)
        PMI_Abort(rc, "PMI_Get_size failed");

    printf("rank %d of %d \n", rank, size);

    int rpn; /* rpn = ranks per node */
    rc = PMI_Get_clique_size(&rpn);
    if (rc != PMI_SUCCESS)
        PMI_Abort(rc, "PMI_Get_clique_size failed");
    printf("rank %d clique size %d \n", rank, rpn);

    int *clique_ranks = malloc(rpn * sizeof(int));
    if (clique_ranks == NULL)
        PMI_Abort(rpn, "malloc failed");

    rc = PMI_Get_clique_ranks(clique_ranks, rpn);
    if (rc != PMI_SUCCESS)
        PMI_Abort(rc, "PMI_Get_clique_ranks failed");
    for (int i = 0; i < rpn; i++)
        printf("rank %d clique[%d] = %d \n", rank, i, clique_ranks[i]);

    int nid;
    rc = PMI_Get_nid(rank, &nid);
    if (rc != PMI_SUCCESS)
        PMI_Abort(rc, "PMI_Get_nid failed");
    printf("rank %d PMI_Get_nid gives nid %d \n", rank, nid);

#if OLD
    rca_mesh_coord_t xyz;
    rca_get_meshcoord((uint16_t) nid, &xyz);
    printf("rank %d rca_get_meshcoord returns (%2u,%2u,%2u)\n",
           rank, xyz.mesh_x, xyz.mesh_y, xyz.mesh_z);
#else // UNTESTED
    pmi_mesh_coord_t xyz;
    PMI_Get_meshcoord((uint16_t) nid, &xyz);
    printf("rank %d PMI_Get_meshcoord returns (%2u,%2u,%2u)\n",
           rank, xyz.mesh_x, xyz.mesh_y, xyz.mesh_z);
#endif

    fflush(stdout);
    return 0;
}
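/*
 * The test above repeats the same check-and-abort step after every PMI call.
 * A minimal sketch of that pattern as a macro, assuming a hypothetical
 * PMI_CHECK name (not part of any PMI header); the message argument must be
 * a string literal so it can be concatenated with " failed".
 */
#include <pmi.h>

#define PMI_CHECK(call, what)               \
    do {                                    \
        int rc_ = (call);                   \
        if (rc_ != PMI_SUCCESS) {           \
            PMI_Abort(rc_, what " failed"); \
        }                                   \
    } while (0)

/* usage: PMI_CHECK(PMI_Get_rank(&rank), "PMI_Get_rank"); */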
int pmgr_init(int *argc_p, char ***argv_p, int *np_p, int *me_p, int *id_p)
{
    setvbuf(stdout, NULL, _IONBF, 0);
    char *value;
    struct timeval start, end;
    pmgr_gettimeofday(&start);

    pmgr_echo_debug = 0;

    pmgr_tree_init_null(&pmgr_tree_all);

    /* =======================================================
     * Until told otherwise, assume we are rank 0 of a 1-task MPI job
     * this enables serial launching, e.g., "./mpiHello" vs "mpirun -np 1 ./mpiHello"
     * TODO: may want to protect this usage via a compile flag and/or env var
     * ======================================================= */

    /* Take a stab at something unique for the id (timestamp.secs | pid)
     * TODO: !!NOTE!!
     * Using a pid in the jobid *ONLY* works for a single process job
     * Obviously, multiple tasks will have different pids */
    pmgr_id = 0x7FFFFFFF & ((start.tv_sec << 16) | (0x0000FFFF & getpid()));

    pmgr_me     = 0;
    pmgr_nprocs = 1;

    mpirun_hostname = NULL;
    mpirun_port = 1;

    /* =======================================================
     * Get information from environment, not from the argument list
     * ======================================================= */

    /* if MPIRUN_RANK is set, require RANK, NPROCS, ID, HOST, and PORT to all be set */
    /* this ensures that if one process aborts in a multitask job,
     * then something is there to abort the others, namely the mpirun process */
    if ((value = pmgr_getenv("MPIRUN_RANK", ENV_OPTIONAL)) != NULL) {
        /* MPI rank of current process */
        pmgr_me = atoi(pmgr_getenv("MPIRUN_RANK", ENV_REQUIRED));

        /* number of MPI processes in job */
        pmgr_nprocs = atoi(pmgr_getenv("MPIRUN_NPROCS", ENV_REQUIRED));

        /* unique jobid of current application */
        pmgr_id = atoi(pmgr_getenv("MPIRUN_ID", ENV_REQUIRED));

        /* mpirun host IP string in dotted decimal notation */
        mpirun_hostname = strdup(pmgr_getenv("MPIRUN_HOST", ENV_REQUIRED));

        /* mpirun port number */
        mpirun_port = atoi(pmgr_getenv("MPIRUN_PORT", ENV_REQUIRED));
    }

    if ((value = pmgr_getenv("MPIRUN_OPEN_TIMEOUT", ENV_OPTIONAL))) {
        mpirun_open_timeout = atoi(value);
    }

    if ((value = pmgr_getenv("MPIRUN_CONNECT_TRIES", ENV_OPTIONAL))) {
        mpirun_connect_tries = atoi(value);
    }

    /* seconds */
    if ((value = pmgr_getenv("MPIRUN_CONNECT_TIMEOUT", ENV_OPTIONAL))) {
        mpirun_connect_timeout = atoi(value);
    }

    /* seconds */
    if ((value = pmgr_getenv("MPIRUN_CONNECT_BACKOFF", ENV_OPTIONAL))) {
        mpirun_connect_backoff = atoi(value);
    }

    /* enable/disable randomized option in backoff */
    if ((value = pmgr_getenv("MPIRUN_CONNECT_RANDOM", ENV_OPTIONAL))) {
        mpirun_connect_random = atoi(value);
    }

    /* whether to connect tree from parent to children (down) or children to parent (up) */
    if ((value = pmgr_getenv("MPIRUN_CONNECT_DOWN", ENV_OPTIONAL))) {
        mpirun_connect_down = atoi(value);
    }

    /* MPIRUN_USE_TREES={0,1} disables/enables tree algorithms */
    if ((value = pmgr_getenv("MPIRUN_USE_TREES", ENV_OPTIONAL))) {
        mpirun_use_trees = atoi(value);
    }

    /* use pmi instead of socket connections to mpirun */
    if ((value = pmgr_getenv("MPIRUN_PMI_ENABLE", ENV_OPTIONAL))) {
#ifdef HAVE_PMI
        mpirun_pmi_enable = atoi(value);
#else /* ifdef HAVE_PMI */
        /* PMI was not compiled in, warn user that we're ignoring this value */
        if (pmgr_me == 0) {
            pmgr_error("Not built with PMI support, ignoring MPIRUN_PMI_ENABLE @ %s:%d",
                __FILE__, __LINE__
            );
        }
#endif /* ifdef HAVE_PMI */
    }

    /* whether to use /dev/shm to start jobs */
    if ((value = pmgr_getenv("MPIRUN_SHM_ENABLE", ENV_OPTIONAL))) {
        mpirun_shm_enable = atoi(value);
    }

    /* minimum number of tasks to switch to /dev/shm */
    if ((value = pmgr_getenv("MPIRUN_SHM_THRESHOLD", ENV_OPTIONAL))) {
        mpirun_shm_threshold = atoi(value);
    }

    /* whether to authenticate connections */
    if ((value = pmgr_getenv("MPIRUN_AUTHENTICATE_ENABLE", ENV_OPTIONAL))) {
        mpirun_authenticate_enable = atoi(value);
    }

    /* time to wait for a reply when authenticating a new connection (millisecs) */
    if ((value = pmgr_getenv("MPIRUN_AUTHENTICATE_TIMEOUT", ENV_OPTIONAL))) {
        mpirun_authenticate_timeout = atoi(value);
    }

    /* total time to attempt to connect to a host before aborting (seconds) */
    if ((value = pmgr_getenv("MPIRUN_PORT_SCAN_TIMEOUT", ENV_OPTIONAL))) {
        mpirun_port_scan_timeout = atoi(value);
    }

    /* time to wait on connect call before giving up (millisecs) */
    if ((value = pmgr_getenv("MPIRUN_PORT_SCAN_CONNECT_TIMEOUT", ENV_OPTIONAL))) {
        mpirun_port_scan_connect_timeout = atoi(value);
    }

    /* number of times to attempt connect call to given IP:port */
    if ((value = pmgr_getenv("MPIRUN_PORT_SCAN_CONNECT_ATTEMPTS", ENV_OPTIONAL))) {
        mpirun_port_scan_connect_attempts = atoi(value);
    }

    /* time to wait between making consecutive connect attempts to a given IP:port (millisecs) */
    if ((value = pmgr_getenv("MPIRUN_PORT_SCAN_CONNECT_SLEEP", ENV_OPTIONAL))) {
        mpirun_port_scan_connect_sleep = atoi(value);
    }

    /* initialize PMI library if we're using it, and get rank, ranks, and jobid from PMI */
    if (mpirun_pmi_enable) {
#ifdef HAVE_PMI
        /* initialize the PMI library */
        int spawned = 0;
        if (PMI_Init(&spawned) != PMI_SUCCESS) {
            pmgr_error("Failed to initialize PMI library @ file %s:%d",
                __FILE__, __LINE__
            );
            PMI_Abort(1, "Failed to initialize PMI library");
        }
        if (spawned) {
            pmgr_error("Spawned processes not supported @ file %s:%d",
                __FILE__, __LINE__
            );
            PMI_Abort(1, "Spawned processes not supported");
        }

        /* get my rank */
        if (PMI_Get_rank(&pmgr_me) != PMI_SUCCESS) {
            pmgr_error("Getting rank @ file %s:%d",
                __FILE__, __LINE__
            );
            PMI_Abort(1, "Failed to get rank from PMI");
        }

        /* get the number of ranks in this job */
        if (PMI_Get_size(&pmgr_nprocs) != PMI_SUCCESS) {
            pmgr_error("Getting number of ranks in job @ file %s:%d",
                __FILE__, __LINE__
            );
            PMI_Abort(1, "Failed to get number of ranks in job");
        }

        /* get jobid */
        if (PMI_Get_appnum(&pmgr_id) != PMI_SUCCESS) {
            pmgr_error("Getting job id @ file %s:%d",
                __FILE__, __LINE__
            );
            PMI_Abort(1, "Failed to get job id from PMI");
        }
#endif /* ifdef HAVE_PMI */
    }

    /* =======================================================
     * Check that we have valid values
     * ======================================================= */

    /* MPIRUN_CLIENT_DEBUG={0,1} disables/enables debug statements */
    /* this comes *after* MPIRUN_RANK and MPIRUN_NPROCS since those are used to print debug messages */
    if ((value = pmgr_getenv("MPIRUN_CLIENT_DEBUG", ENV_OPTIONAL)) != NULL) {
        pmgr_echo_debug = atoi(value);
        int print_rank = 0;
        if (pmgr_echo_debug > 0) {
            if (pmgr_echo_debug <= 1*PMGR_DEBUG_LEVELS) {
                print_rank = (pmgr_me == 0); /* just rank 0 prints */
            } else if (pmgr_echo_debug <= 2*PMGR_DEBUG_LEVELS) {
                print_rank = (pmgr_me == 0 || pmgr_me == pmgr_nprocs-1); /* just rank 0 and rank N-1 print */
            } else {
                print_rank = 1; /* all ranks print */
            }
            if (print_rank) {
                pmgr_echo_debug = 1 + (pmgr_echo_debug-1) % PMGR_DEBUG_LEVELS;
            } else {
                pmgr_echo_debug = 0;
            }
        }
    }

    /* check that we have a valid number of processes */
    if (pmgr_nprocs <= 0) {
        pmgr_error("Invalid NPROCS %d @ file %s:%d",
            pmgr_nprocs, __FILE__, __LINE__
        );
        exit(1);
    }

    /* check that our rank is valid */
    if (pmgr_me < 0 || pmgr_me >= pmgr_nprocs) {
        pmgr_error("Invalid RANK %d @ file %s:%d",
            pmgr_me, __FILE__, __LINE__
        );
        exit(1);
    }

    /* check that we have a valid jobid */
    if (pmgr_id == 0) {
        pmgr_error("Invalid JOBID %d @ file %s:%d",
            pmgr_id, __FILE__, __LINE__
        );
        exit(1);
    }

    /* set parameters */
    *np_p = pmgr_nprocs;
    *me_p = pmgr_me;
    *id_p = pmgr_id;

    pmgr_gettimeofday(&end);
    pmgr_debug(2, "Exiting pmgr_init(), took %f seconds for %d procs",
        pmgr_getsecs(&end, &start), pmgr_nprocs);
    return PMGR_SUCCESS;
}
/*
 * Call into the process spawner, using the same port we were given
 * at startup time, to tell it to abort the entire job.
 */
int pmgr_abort(int code, const char *fmt, ...)
{
    int s;
    struct sockaddr_in sin;
    struct hostent *he;
    va_list ap;
    char buf[256];
    int len;

    /* if the tree is open, send out abort messages to parent and children */
    pmgr_abort_trees();

    /* build our error message */
    va_start(ap, fmt);
    vprint_msg(buf, sizeof(buf), fmt, ap);
    va_end(ap);

    /* check whether we have an mpirun process, and check whether we can connect back to it */
    if (mpirun_hostname != NULL &&
        !mpirun_pmi_enable &&
        !(mpirun_shm_enable && pmgr_nprocs >= mpirun_shm_threshold))
    {
        he = gethostbyname(mpirun_hostname);
        if (!he) {
            pmgr_error("pmgr_abort: Hostname lookup of mpirun failed (gethostbyname(%s) %s h_errno=%d) @ file %s:%d",
                mpirun_hostname, hstrerror(h_errno), h_errno, __FILE__, __LINE__
            );
            return -1;
        }

        s = socket(AF_INET, SOCK_STREAM, 0);
        if (s < 0) {
            pmgr_error("pmgr_abort: Failed to create socket (socket() %m errno=%d) @ file %s:%d",
                errno, __FILE__, __LINE__
            );
            return -1;
        }

        memset(&sin, 0, sizeof(sin));
        sin.sin_family = he->h_addrtype;
        memcpy(&sin.sin_addr, he->h_addr_list[0], sizeof(sin.sin_addr));
        sin.sin_port = htons(mpirun_port);
        if (connect(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) {
            pmgr_error("pmgr_abort: Connect to mpirun failed (connect() %m errno=%d) @ file %s:%d",
                errno, __FILE__, __LINE__
            );
            return -1;
        }

        /* write an abort code (may be destination rank), our rank to mpirun */
        pmgr_write_fd(s, &code, sizeof(code));
        pmgr_write_fd(s, &pmgr_me, sizeof(pmgr_me));

        /* now length of error string, and error string itself to mpirun */
        len = strlen(buf) + 1;
        pmgr_write_fd(s, &len, sizeof(len));
        pmgr_write_fd(s, buf, len);

        close(s);
    } else { /* check that (mpirun_hostname != NULL) */
        /* TODO: want to echo this message here? Want to do this for every user abort? */
        pmgr_error("Called pmgr_abort() Code: %d, Msg: %s", code, buf);
    }

    if (mpirun_pmi_enable) {
#ifdef HAVE_PMI
        PMI_Abort(code, buf);
#endif /* ifdef HAVE_PMI */
    }

    return PMGR_SUCCESS;
}
static int s1_abort(int flag, const char msg[])
{
    PMI_Abort(flag, msg);
    return OPAL_SUCCESS;
}
int ompi_rte_abort_peers(ompi_process_name_t *procs, size_t nprocs)
{
    PMI_Abort(1, "");
    return OMPI_SUCCESS;
}