/* Report an abort through the debugger interface: publish MESG as the
   abort string, mark the debug state as aborting, and then call the
   breakpoint routine.  */
static void
__upc_notify_debugger_of_abort (const char *mesg)
{
  /* Both variables are written before the breakpoint call so that they
     are already in place when MPIR_Breakpoint () runs.  */
  MPIR_debug_abort_string = mesg;
  MPIR_debug_state = MPIR_DEBUG_ABORTING;

  MPIR_Breakpoint ();
}
/*
 * Set MPIR_debug_state to the "spawned" value (1), call
 * MPIR_Breakpoint(), and then close tv_socket.
 */
void tv_complete(void)
{
    /* Locally defined value stored into MPIR_debug_state. */
    const int spawned_state = 1;

    MPIR_debug_state = spawned_state;
    MPIR_Breakpoint();

    close(tv_socket);
}
/*
 * Call this routine to signal to the debugger that the application is
 * aborting.  The abort message pointer and the aborting state are always
 * recorded.  When MPIU_BREAKPOINT_NEEDED is defined and there is an abort
 * message, the MPIR_Breakpoint routine is also called (which allows a
 * tool such as a debugger to gain control).
 *
 * msg - abort message; may be a null pointer, in which case the
 *       breakpoint routine is not called.
 */
void MPIR_DebuggerSetAborting( const char *msg )
{
    /* The cast drops the const qualifier for the assignment. */
    MPIR_debug_abort_string = (char *)msg;
    MPIR_debug_state        = MPIR_DEBUG_ABORTING;

#ifdef MPIU_BREAKPOINT_NEEDED
    if (msg) {
        MPIR_Breakpoint();
    }
#endif
}
/*
 * When MPIR_being_debugged is set, switch MPIR_debug_state to the
 * aborting state and call MPIR_Breakpoint().  Otherwise do nothing.
 *
 * arg - not used.
 * Returns NULL in every case.
 */
void *
abort_debugger ( void* arg )
{
    if ( !MPIR_being_debugged )
        return NULL;

    MPIR_debug_state = MPIR_DEBUG_ABORTING;
    MPIR_Breakpoint();

    return NULL;
}
/*
 * Tell the debugger that we are about to abort.
 *
 * reason - text stored as the abort string; a NULL or empty string is
 *          replaced by "Unknown".
 */
void ompi_debugger_notify_abort(char *reason)
{
    /* Non-zero only for a non-NULL string with at least one character. */
    const int have_reason = (NULL != reason) && ('\0' != reason[0]);

    MPIR_debug_state = MPIR_DEBUG_ABORTING;
    MPIR_debug_abort_string = have_reason ? reason : "Unknown";

    /* State and message are in place; now tell the debugger */
    MPIR_Breakpoint();
}
/* Run UPC thread THREAD_ID in the calling process.  Performs the
   per-thread initialization, handles the debugger hand-shake, joins the
   runtime barrier, calls GUPCR_MAIN with ARGC/ARGV and finally passes
   its result to __upc_exit.  */
static void
__upc_run_this_thread (upc_info_p u, int argc, char *argv[],
                       unsigned int thread_id)
{
  int status;

  MYTHREAD = thread_id;

  /* Per-thread initialization comes first.  */
  __upc_per_thread_init (u);

  if (THREADS == 1)
    {
      /* Special case: a lone thread runs without a child process
         having been created for it.  */
      MPIR_being_debugged = 0;

      /* Let the debugger pick up runtime info.  */
      MPIR_Breakpoint ();

      /* Only unlink the temporary file once the breakpoint has been
         hit, so that the debugger had a chance to open the mmap global
         memory file and can access UPC shared memory.  */
      if (unlink (u->mmap_file_name) < 0)
        {
          perror ("cannot unlink global shared memory file");
          abort ();
        }
    }
  else if (MPIR_being_debugged)
    {
      /* Spin (yielding the CPU) until the partial attach flag is set.  */
      while (!u->partial_attach_start)
        __upc_yield_cpu ();

      /* Spin (yielding the CPU) until the debugger has acquired us.  */
      while (!MPIR_debug_gate)
        __upc_yield_cpu ();
    }

#if GUPCR_HAVE_GUM_DEBUG
  if (__upc_gum_debug)
    {
      __upc_gum_init (THREADS, thread_id);
    }
#endif

  __upc_barrier (GUPCR_RUNTIME_BARRIER_ID);
  __upc_pupc_init (&argc, &argv);

  status = GUPCR_MAIN (argc, argv);

  /* Bracket the collective exit event with its start and end calls.  */
  p_startx (GASP_UPC_COLLECTIVE_EXIT, status);
  p_endx (GASP_UPC_COLLECTIVE_EXIT, status);

  __upc_exit (status);
}
void * setup_debugger ( void* arg ) { int i; char *tmphn; MPIR_proctable_size = myopt.pcount; if ( MPIR_being_debugged ) { MPIR_debug_state = MPIR_DEBUG_SPAWNED; MPIR_proctable = (MPIR_PROCDESC*) malloc (MPIR_proctable_size * sizeof(MPIR_PROCDESC)); tmphn = (char*) malloc (21); for (i = 0; i < MPIR_proctable_size; i++ ) { if ( i/10 == 0 ) sprintf ( tmphn, "virtualmachine00000%d", i); else if ( i/100 == 0 ) sprintf ( tmphn, "virtualmachine0000%d", i); else if ( i/1000 == 0 ) sprintf ( tmphn, "virtualmachine000%d", i); else if ( i/10000 == 0 ) sprintf ( tmphn, "virtualmachine00%d", i); else if ( i/100000 == 0 ) sprintf ( tmphn, "virtualmachine0%d", i); else sprintf ( tmphn, "virtualmachine%d", i); MPIR_proctable[i].host_name = strdup (tmphn); MPIR_proctable[i].executable_name = strdup("appX"); MPIR_proctable[i].pid = i; } totalview_jobid = strdup("387266"); MPIR_Breakpoint(); } return NULL; }
/*
 * Launch (or extend) the tasks of a job step.
 *
 * The launch parameters are filled in from the global "opt" structure
 * and from "job"; the caller's callbacks are copied and partly
 * overridden.  On the first call (no task_state yet) the step is started
 * with slurm_step_launch(); on later calls it is extended with
 * slurm_step_launch_add().  Once the step reports a successful start the
 * MPIR executable names are set, MPIR_debug_state becomes
 * MPIR_DEBUG_SPAWNED, and either the proctable is dumped
 * (opt.debugger_test) or MPIR_Breakpoint() is called.
 *
 * job            - job being launched
 * cio_fds        - I/O file descriptors copied into the launch params
 * global_rc      - remembered in local_global_rc on the first launch
 * step_callbacks - caller-supplied callbacks (copied, not modified)
 *
 * Returns 0, or the errno value observed when the launch call failed.
 */
extern int launch_p_step_launch(
	srun_job_t *job, slurm_step_io_fds_t *cio_fds, uint32_t *global_rc,
	slurm_step_launch_callbacks_t *step_callbacks)
{
	slurm_step_launch_params_t launch_params;
	slurm_step_launch_callbacks_t callbacks;
	int rc = 0;
	bool first_launch = false;

	slurm_step_launch_params_t_init(&launch_params);
	memcpy(&callbacks, step_callbacks, sizeof(callbacks));

	if (task_state) {
		/* Not the first launch: only resize the task state. */
		task_state_alter(task_state, job->ntasks);
	} else {
		task_state = task_state_create(job->ntasks);
		local_srun_job = job;
		local_global_rc = global_rc;
		first_launch = true;
	}

	/* Identity, command line and working directory. */
	launch_params.gid = opt.gid;
	launch_params.alias_list = job->alias_list;
	launch_params.argc = opt.argc;
	launch_params.argv = opt.argv;
	launch_params.multi_prog = opt.multi_prog ? true : false;
	launch_params.cwd = opt.cwd;
	launch_params.slurmd_debug = opt.slurmd_debug;

	/* Standard I/O handling. */
	launch_params.buffered_stdio = !opt.unbuffered;
	launch_params.labelio = opt.labelio ? true : false;
	launch_params.remote_output_filename =
		fname_remote_string(job->ofname);
	launch_params.remote_input_filename =
		fname_remote_string(job->ifname);
	launch_params.remote_error_filename =
		fname_remote_string(job->efname);

	/* Task prolog/epilog and binding. */
	launch_params.task_prolog = opt.task_prolog;
	launch_params.task_epilog = opt.task_epilog;
	launch_params.cpu_bind = opt.cpu_bind;
	launch_params.cpu_bind_type = opt.cpu_bind_type;
	launch_params.mem_bind = opt.mem_bind;
	launch_params.mem_bind_type = opt.mem_bind_type;
	launch_params.open_mode = opt.open_mode;

	/* A negative accounting frequency leaves the initialized value. */
	if (opt.acctg_freq >= 0)
		launch_params.acctg_freq = opt.acctg_freq;
	launch_params.pty = opt.pty;

	/* One CPU per task unless the option was explicitly set. */
	launch_params.cpus_per_task = opt.cpus_set ? opt.cpus_per_task : 1;

	launch_params.cpu_freq = opt.cpu_freq;
	launch_params.task_dist = opt.distribution;
	launch_params.ckpt_dir = opt.ckpt_dir;
	launch_params.restart_dir = opt.restart_dir;
	launch_params.preserve_env = opt.preserve_env;
	launch_params.spank_job_env = opt.spank_job_env;
	launch_params.spank_job_env_size = opt.spank_job_env_size;
	launch_params.user_managed_io = opt.user_managed_io;

	memcpy(&launch_params.local_fds, cio_fds, sizeof(slurm_step_io_fds_t));

	if (MPIR_being_debugged) {
		launch_params.parallel_debug = true;
		pmi_server_max_threads(1);
	} else {
		launch_params.parallel_debug = false;
	}

	/* The task-start callback is always replaced by the local one. */
	callbacks.task_start = _task_start;

	/*
	 * If poe is using this code with multi-prog it always returns 1
	 * for each task, which could be confusing since no real error
	 * happened.  The local task-finish callback is therefore installed
	 * only when multi-prog is off, or when the step-signal callback is
	 * absent or is launch_g_fwd_signal.
	 */
	if (!launch_params.multi_prog ||
	    !callbacks.step_signal ||
	    (callbacks.step_signal == launch_g_fwd_signal)) {
		callbacks.task_finish = _task_finish;
	}

	mpir_init(job->ctx_params.task_count);

	update_job_state(job, SRUN_JOB_LAUNCHING);
	launch_start_time = time(NULL);

	if (first_launch) {
		if (slurm_step_launch(job->step_ctx, &launch_params,
				      &callbacks) != SLURM_SUCCESS) {
			rc = errno;
			*local_global_rc = errno;
			error("Application launch failed: %m");
			slurm_step_launch_abort(job->step_ctx);
			slurm_step_launch_wait_finish(job->step_ctx);
			goto cleanup;
		}
	} else {
		if (slurm_step_launch_add(job->step_ctx, &launch_params,
					  job->nodelist, job->fir_nodeid)
		    != SLURM_SUCCESS) {
			rc = errno;
			*local_global_rc = errno;
			error("Application launch add failed: %m");
			slurm_step_launch_abort(job->step_ctx);
			slurm_step_launch_wait_finish(job->step_ctx);
			goto cleanup;
		}
	}

	update_job_state(job, SRUN_JOB_STARTING);

	if (slurm_step_launch_wait_start(job->step_ctx) != SLURM_SUCCESS) {
		info("Job step %u.%u aborted before step completely launched.",
		     job->jobid, job->stepid);
		goto cleanup;
	}

	update_job_state(job, SRUN_JOB_RUNNING);

	/* MPIR structures are only set up for a correctly launched step. */
	if (opt.multi_prog)
		mpir_set_multi_name(job->ctx_params.task_count,
				    launch_params.argv[0]);
	else
		mpir_set_executable_names(launch_params.argv[0]);

	MPIR_debug_state = MPIR_DEBUG_SPAWNED;
	if (opt.debugger_test)
		mpir_dump_proctable();
	else
		MPIR_Breakpoint(job);

cleanup:
	return rc;
}
/**
 * Initialization of data structures for running under a debugger
 * using the MPICH/TotalView parallel debugger interface. This stage
 * of initialization must occur after spawn
 *
 * NOTE: We -always- perform this step to ensure that any debugger
 * that attaches to us post-launch of the application can get a
 * completed proctable
 *
 * The table is zero-initialized, so an entry whose proc or app context
 * could not be found holds NULL names and a zero pid instead of
 * indeterminate values. If the allocation fails, MPIR_proctable_size is
 * reset to zero so that no size is advertised for a missing table.
 *
 * @param jdata The job whose processes are recorded in MPIR_proctable.
 */
void orte_debugger_init_after_spawn(orte_job_t *jdata)
{
    orte_proc_t *proc;
    orte_app_context_t *appctx;
    orte_vpid_t i, j;
    opal_buffer_t buf;
    orte_process_name_t rank0;
    int rc;

    if (MPIR_proctable) {
        /* already initialized */
        return;
    }

    /* fill in the proc table for the application processes */

    if (orte_debug_flag) {
        opal_output(0, "Info: Setting up debugger process table for applications\n");
    }

    MPIR_debug_state = 1;

    /* set the total number of processes in the job */
    MPIR_proctable_size = jdata->num_procs;

    /* allocate MPIR_proctable, zero-filled so skipped slots are defined */
    MPIR_proctable = (struct MPIR_PROCDESC *) calloc(MPIR_proctable_size,
                                                     sizeof(struct MPIR_PROCDESC));
    if (MPIR_proctable == NULL) {
        opal_output(0, "Error: Out of memory\n");
        MPIR_proctable_size = 0;
        return;
    }

    /* initialize MPIR_proctable */
    for (j=0; j < jdata->num_procs; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
            continue;
        }
        /* store this data in the location whose index
         * corresponds to the proc's rank
         */
        i = proc->name.vpid;
        if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
            continue;
        }

        MPIR_proctable[i].host_name = strdup(proc->node->name);
        /* an app that starts with the path separator is used as-is,
         * otherwise it is joined onto the app's cwd
         */
        if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
            MPIR_proctable[i].executable_name =
                opal_os_path( false, appctx->app, NULL );
        } else {
            MPIR_proctable[i].executable_name =
                opal_os_path( false, appctx->cwd, appctx->app, NULL );
        }
        MPIR_proctable[i].pid = proc->pid;
    }

    if (orte_debug_flag) {
        dump();
    }

    /* if we are being launched under a debugger, then we must wait
     * for it to be ready to go and do some things to start the job
     */
    if (MPIR_being_debugged) {
        /* wait for all procs to have reported their contact info - this
         * ensures that (a) they are all into mpi_init, and (b) the system
         * has the contact info to successfully send a message to rank=0
         */
        ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs);

        (void) MPIR_Breakpoint();

        /* send a message to rank=0 to release it */
        OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */
        rank0.jobid = jdata->jobid;
        rank0.vpid = 0;
        if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
            opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
        }
        OBJ_DESTRUCT(&buf);
    }
}
static void check_debugger(int fd, short event, void *arg) { struct timeval now; opal_event_t *tmp = (opal_event_t*)arg; orte_job_t *jdata; orte_app_context_t *app; char cwd[OPAL_PATH_MAX]; int rc; int32_t ljob; if (MPIR_being_debugged) { if (orte_debug_flag) { opal_output(0, "%s Launching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), MPIR_executable_path); } /* a debugger has attached! All the MPIR_Proctable * data is already available, so we only need to * check to see if we should spawn any daemons */ if ('\0' != MPIR_executable_path[0]) { /* this will be launched just like a regular job, * so we do not use the global orte_debugger_daemon * as this is reserved for co-location upon startup */ jdata = OBJ_NEW(orte_job_t); /* create a jobid for these daemons - this is done solely * to avoid confusing the rest of the system's bookkeeping */ orte_plm_base_create_jobid(jdata); /* flag the job as being debugger daemons */ jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON; /* unless directed, we do not forward output */ if (!MPIR_forward_output) { jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT; } /* set the mapping policy to "pernode" so we only get * one debugger daemon on each node */ jdata->map = OBJ_NEW(orte_job_map_t); jdata->map->npernode = 1; /* add it to the global job pool */ ljob = ORTE_LOCAL_JOBID(jdata->jobid); opal_pointer_array_set_item(orte_job_data, ljob, jdata); /* create an app_context for the debugger daemon */ app = OBJ_NEW(orte_app_context_t); app->app = strdup((char*)MPIR_executable_path); if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) { orte_show_help("help-orterun.txt", "orterun:init-failure", true, "get the cwd", rc); OBJ_RELEASE(jdata); goto RELEASE; } app->cwd = strdup(cwd); app->user_specified_cwd = false; opal_argv_append_nosize(&app->argv, app->app); build_debugger_args(app); opal_pointer_array_add(jdata->apps, &app->super); jdata->num_apps = 1; /* now go ahead and spawn this job */ if (ORTE_SUCCESS != (rc = 
orte_plm.spawn(jdata))) { ORTE_ERROR_LOG(rc); } } RELEASE: /* notify the debugger that all is ready */ MPIR_Breakpoint(); } else { /* reissue the timer to wake us up again */ now.tv_sec = orte_debugger_check_rate; now.tv_usec = 0; opal_evtimer_add(tmp, &now); } }
/**
 * Initialization of data structures for running under a debugger
 * using the MPICH/TotalView parallel debugger interface. This stage
 * of initialization must occur after stage2 of spawn and is invoked
 * via a callback.
 *
 * The function returns early, without calling MPIR_Breakpoint(), when
 * the table already exists, when the resource map cannot be obtained,
 * or when the table cannot be allocated. The two error returns replace
 * fall-through paths that went on to use an unset or already released
 * map and a NULL table.
 *
 * @param jobid  The jobid returned by spawn.
 */
void orte_totalview_init_after_spawn(orte_jobid_t jobid)
{
    orte_job_map_t *map;
    opal_list_item_t *item, *item2;
    orte_mapped_node_t *node;
    orte_mapped_proc_t *proc;
    orte_app_context_t *appctx;
    orte_std_cntr_t i;
    int rc;

    if (MPIR_proctable) {
        /* already initialized */
        return;
    }

    if (0) { /* debugging daemons <<-- needs work */

        if (orte_debug_flag) {
            opal_output(0, "Info: Setting up debugger process table for daemons\n");
        }

    } else {

        /*
         * Debugging applications or not being debugged.
         *
         * Either way, fill in the proc table for the application
         * processes in case someone attaches later.
         */

        if (orte_debug_flag) {
            opal_output(0, "Info: Setting up debugger process table for applications\n");
        }

        MPIR_debug_state = 1;

        /* Get the resource map for this job */

        rc = orte_rmaps.get_job_map(&map, jobid);
        if (ORTE_SUCCESS != rc) {
            opal_output(0, "Error: Can't get resource map\n");
            ORTE_ERROR_LOG(rc);
            /* "map" was not provided - nothing below can work */
            return;
        }

        /* find the total number of processes in the job */

        for (i=0; i < map->num_apps; i++) {
            MPIR_proctable_size += map->apps[i]->num_procs;
        }

        /* allocate MPIR_proctable */

        MPIR_proctable = (struct MPIR_PROCDESC *) malloc(sizeof(struct MPIR_PROCDESC) *
                                                         MPIR_proctable_size);
        if (MPIR_proctable == NULL) {
            opal_output(0, "Error: Out of memory\n");
            /* undo the size accumulated above so that it neither
             * describes a missing table nor is counted twice if this
             * routine runs again
             */
            MPIR_proctable_size = 0;
            OBJ_RELEASE(map);
            return;
        }

        /* initialize MPIR_proctable */

        for (item =  opal_list_get_first(&map->nodes);
             item != opal_list_get_end(&map->nodes);
             item =  opal_list_get_next(item)) {
            node = (orte_mapped_node_t*)item;

            for (item2 =  opal_list_get_first(&node->procs);
                 item2 != opal_list_get_end(&node->procs);
                 item2 =  opal_list_get_next(item2)) {
                proc = (orte_mapped_proc_t*)item2;
                appctx = map->apps[proc->app_idx];
                /* store this data in the location whose index
                 * corresponds to the proc's rank
                 */
                i = proc->rank;

                MPIR_proctable[i].host_name = strdup(node->nodename);
                /* an app that starts with the path separator is used
                 * as-is, otherwise it is joined onto the app's cwd
                 */
                if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
                    MPIR_proctable[i].executable_name =
                        opal_os_path( false, appctx->app, NULL );
                } else {
                    MPIR_proctable[i].executable_name =
                        opal_os_path( false, appctx->cwd, appctx->app, NULL );
                }
                MPIR_proctable[i].pid = proc->pid;
            }
        }

        OBJ_RELEASE(map);
    }

    if (orte_debug_flag) {
        dump();
    }

    (void) MPIR_Breakpoint();
}
/*
 * complete_spawn - Tell the debugger that all the information is ready
 * to be consumed: set the debug state to MPIR_DEBUG_SPAWNED and call
 * MPIR_Breakpoint().
 *
 * Returns the object built by Py_BuildValue("") (same as None).
 */
PyObject *complete_spawn (void)
{
    PyObject *result;

    MPIR_debug_state = MPIR_DEBUG_SPAWNED;
    MPIR_Breakpoint();

    result = Py_BuildValue("");  /* same as None */
    return result;
} /* complete_spawn */
static void attach_debugger(int fd, short event, void *arg) { orte_app_context_t *app; unsigned char fifo_cmd; int rc; int32_t ljob; orte_job_t *jdata; /* read the file descriptor to clear that event, if necessary */ if (fifo_active) { opal_event_del(&attach); fifo_active = false; rc = read(attach_fd, &fifo_cmd, sizeof(fifo_cmd)); if (!rc) { /* reopen device to clear hangup */ open_fifo(); return; } if (1 != fifo_cmd) { /* ignore the cmd */ goto RELEASE; } } if (!MPIR_being_debugged && !orte_debugger_base.test_attach) { /* false alarm */ goto RELEASE; } opal_output_verbose(1, orte_debugger_base.output, "%s Attaching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_debugger_base.test_daemon) ? MPIR_executable_path : orte_debugger_base.test_daemon); /* a debugger has attached! All the MPIR_Proctable * data is already available, so we only need to * check to see if we should spawn any daemons */ if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_base.test_daemon) { /* can only have one debugger */ if (NULL != orte_debugger_daemon) { opal_output(0, "-------------------------------------------\n" "Only one debugger can be used on a job.\n" "-------------------------------------------\n"); goto RELEASE; } opal_output_verbose(2, orte_debugger_base.output, "%s Spawning debugger daemons %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_debugger_base.test_daemon) ? 
MPIR_executable_path : orte_debugger_base.test_daemon); /* this will be launched just like a regular job, * so we do not use the global orte_debugger_daemon * as this is reserved for co-location upon startup */ jdata = OBJ_NEW(orte_job_t); /* create a jobid for these daemons - this is done solely * to avoid confusing the rest of the system's bookkeeping */ orte_plm_base_create_jobid(jdata); /* flag the job as being debugger daemons */ jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON; /* unless directed, we do not forward output */ if (!MPIR_forward_output) { jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT; } /* add it to the global job pool */ ljob = ORTE_LOCAL_JOBID(jdata->jobid); opal_pointer_array_set_item(orte_job_data, ljob, jdata); /* create an app_context for the debugger daemon */ app = OBJ_NEW(orte_app_context_t); if (NULL != orte_debugger_base.test_daemon) { app->app = strdup(orte_debugger_base.test_daemon); } else { app->app = strdup((char*)MPIR_executable_path); } jdata->state = ORTE_JOB_STATE_INIT; opal_argv_append_nosize(&app->argv, app->app); build_debugger_args(app); opal_pointer_array_add(jdata->apps, app); jdata->num_apps = 1; /* setup the mapping policy to bynode so we get one * daemon on each node */ jdata->map = OBJ_NEW(orte_job_map_t); jdata->map->policy = ORTE_MAPPING_BYNODE; jdata->map->npernode = 1; /* now go ahead and spawn this job */ if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) { ORTE_ERROR_LOG(rc); } } RELEASE: /* reset the read or timer event */ if (0 == orte_debugger_mpirx_check_rate) { fifo_active = true; opal_event_add(&attach, 0); } else if (!MPIR_being_debugged) { ORTE_TIMER_EVENT(orte_debugger_mpirx_check_rate, 0, attach_debugger); } /* notify the debugger that all is ready */ MPIR_Breakpoint(); }
/* * If MPICH is built with the --enable-debugger option, MPI_Init and * MPI_Init_thread will call MPIR_WaitForDebugger. This ensures both that * the debugger can gather information on the MPI job before the MPI_Init * returns to the user and that the necessary symbols for providing * information such as message queues is available. * * In addition, the environment variable MPIEXEC_DEBUG, if set, will cause * all MPI processes to wait in this routine until the variable * MPIR_debug_gate is set to 1. */ void MPIR_WaitForDebugger( void ) { #ifdef MPIU_PROCTABLE_NEEDED int rank = MPIR_Process.comm_world->rank; #if defined(FINEGRAIN_MPI) int size = MPIR_Process.comm_world->num_osprocs; #else int size = MPIR_Process.comm_world->local_size; #endif int i, maxsize; /* FIXME: In MPICH, the executables may not have the information on the other processes; this is part of the Process Manager Interface (PMI). We need another way to provide this information to a debugger */ /* The process manager probably has all of this data - the MPI2 debugger interface API provides (at least originally) a way to access this. 
*/ /* Also, to avoid scaling problems, we only populate the first 64 entries (default) */ maxsize = MPIR_CVAR_PROCTABLE_SIZE; if (maxsize > size) maxsize = size; if (rank == 0) { char hostname[MPI_MAX_PROCESSOR_NAME+1]; int hostlen; int val; MPIR_proctable = (MPIR_PROCDESC *)MPIU_Malloc( size * sizeof(MPIR_PROCDESC) ); for (i=0; i<size; i++) { /* Initialize the proctable */ MPIR_proctable[i].host_name = 0; MPIR_proctable[i].executable_name = 0; MPIR_proctable[i].pid = -1; } PMPI_Get_processor_name( hostname, &hostlen ); MPIR_proctable[0].host_name = (char *)MPIU_Strdup( hostname ); MPIR_proctable[0].executable_name = 0; MPIR_proctable[0].pid = getpid(); for (i=1; i<maxsize; i++) { int msg[2]; PMPI_Recv( msg, 2, MPI_INT, i, 0, MPI_COMM_WORLD,MPI_STATUS_IGNORE); MPIR_proctable[i].pid = msg[1]; MPIR_proctable[i].host_name = (char *)MPIU_Malloc( msg[0] + 1 ); PMPI_Recv( MPIR_proctable[i].host_name, msg[0]+1, MPI_CHAR, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE ); MPIR_proctable[i].host_name[msg[0]] = 0; } MPIR_proctable_size = size; /* Debugging hook */ if (MPIR_CVAR_PROCTABLE_PRINT) { for (i=0; i<maxsize; i++) { printf( "PT[%d].pid = %d, .host_name = %s\n", i, MPIR_proctable[i].pid, MPIR_proctable[i].host_name ); } fflush( stdout ); } MPIR_Add_finalize( MPIR_FreeProctable, MPIR_proctable, 0 ); } else { char hostname[MPI_MAX_PROCESSOR_NAME+1]; int hostlen; int mypid = getpid(); int msg[2]; if (rank < maxsize) { PMPI_Get_processor_name( hostname, &hostlen ); msg[0] = hostlen; msg[1] = mypid; /* Deliver to the root process the proctable information */ PMPI_Ssend( msg, 2, MPI_INT, 0, 0, MPI_COMM_WORLD ); PMPI_Ssend( hostname, hostlen, MPI_CHAR, 0, 0, MPI_COMM_WORLD ); } } #endif /* MPIU_PROCTABLE_NEEDED */ /* Put the breakpoint after setting up the proctable */ MPIR_debug_state = MPIR_DEBUG_SPAWNED; #ifdef MPIU_BREAKPOINT_NEEDED (void)MPIR_Breakpoint(); #endif /* After we exit the MPIR_Breakpoint routine, the debugger may have set variables such as MPIR_being_debugged */ /* 
Initialize the sendq support */ SendqInit(); if (getenv("MPIEXEC_DEBUG")) { while (!MPIR_debug_gate) ; } }
static int __upc_monitor_threads (void) { upc_info_p u = __upc_info; pid_t pid; int wait_status; int exit_status; int thread_id; int global_exit_invoked; struct sigaction action; exit_status = -1; global_exit_invoked = 0; /* Install SIGTERM handler responsible for terminating the whole program. */ action.sa_handler = __upc_sigterm_handler; sigemptyset (&action.sa_mask); action.sa_flags = 0; sigaction (SIGTERM, &action, NULL); /* Wait for threads to finish. */ for (;;) { pid = waitpid (-1, &wait_status, WNOHANG); /* Check for errors. */ if (pid == -1) { /* Continue checking if interrupted (handling other signals). */ if (errno == EINTR) continue; /* Stop waiting if no more children. */ if (errno == ECHILD) break; /* Abort if invalid argument. */ if (errno == EINVAL) { perror ("waitpid"); abort (); } } /* Not a child exit? */ if (pid == 0) { /* Check for debugger attach. */ MPIR_Breakpoint (); /* Release the CPU for 100mS and continue checking. */ usleep (100000); continue; } /* Check for child process that exited. */ thread_id = __upc_get_thread_id (pid); if (!global_exit_invoked && WIFEXITED (wait_status)) { int child_exit = WEXITSTATUS (wait_status); if (child_exit & 0x80) { /* By convention, the result of a call to upc_global_exit has the high bit in the byte set. Terminate all the other threads in the program. */ int t; for (t = 0; t < THREADS; ++t) { int pid = u->thread_info[t].pid; if (pid <= 0) abort (); if (t != thread_id) (void) kill (pid, SIGKILL); } child_exit &= 0x7f; global_exit_invoked = 1; } else if ((exit_status != -1) && exit_status != child_exit) { fprintf (stderr, "conflicting exit status (%d) for" " thread %d\n", child_exit, thread_id); } exit_status = child_exit; } else if (WIFSIGNALED (wait_status)) { int child_sig = WTERMSIG (wait_status); /* Ignore SIGKILL signals. We use them to implement upc_global_exit(). 
*/ if (child_sig == SIGKILL && global_exit_invoked) continue; fprintf (stderr, "thread %d terminated with signal: '%s'\n", thread_id, __upc_strsignal (child_sig)); /* GASP note: We can't record a noncollective GASP exit event here, because the process has already died. */ /* We'll all go away now. */ if (killpg (getpid (), SIGTERM) == -1) { perror ("killpg"); exit (-1); } } } return exit_status; }
/* Implement UPC threads as processes.

   With a single thread the thread is run directly in this process and
   the call does not return.  Otherwise one child process is forked per
   thread; the parent records each child's pid, fills in the debugger
   process table when a debugger is attached, reports the spawn through
   MPIR_Breakpoint (), and unlinks the global shared memory file.

   Failure to allocate the debugger process table is reported and
   terminates the program, rather than leaving a null table to be
   written to later.  A child in which __upc_run_this_thread ()
   unexpectedly returns aborts, as the single-thread path does, instead
   of continuing the fork loop.  */
static void
__upc_run_threads (upc_info_p u, int argc, char *argv[])
{
  int thread_id;
  int flag;

  /* Set O_APPEND on stdout and stderr (see Berkeley UPC bug 2136).  */
  flag = fcntl (STDOUT_FILENO, F_GETFL, 0);
  if (flag >= 0)
    (void) fcntl (STDOUT_FILENO, F_SETFL, flag | O_APPEND);
  flag = fcntl (STDERR_FILENO, F_GETFL, 0);
  if (flag >= 0)
    (void) fcntl (STDERR_FILENO, F_SETFL, flag | O_APPEND);

  if (THREADS == 1)
    {
#if GUPCR_HAVE_OMP_CHECKS
      __upc_omp_master_id = pthread_self ();
#endif
      __upc_affinity_set (u, 0);
      __upc_run_this_thread (u, argc, argv, 0);
      /* Shouldn't get here.  */
      abort ();
    }

  /* In case a debugger is using the value; we don't want it to
     see two thread zeros.  */
  MYTHREAD = -1;

  /* Allocate space to tell the debugger about the processes we're
     creating.  */
  MPIR_proctable = malloc (THREADS * sizeof (*MPIR_proctable));
  if (MPIR_proctable == NULL)
    {
      perror ("cannot allocate debugger process table");
      exit (2);
    }

  /* Tell the debugger this process is a starter process.  */
  MPIR_i_am_starter ();

  for (thread_id = 0; thread_id < THREADS; ++thread_id)
    {
      pid_t pid = fork ();
      if (pid == 0)
        {
          /* child */
#if GUPCR_HAVE_OMP_CHECKS
          __upc_omp_master_id = pthread_self ();
#endif
          __upc_affinity_set (u, thread_id);
          __upc_run_this_thread (u, argc, argv, thread_id);
          /* Shouldn't get here; never fall back into the fork loop.  */
          abort ();
        }
      else if (pid > 0)
        {
          /* parent */
          u->thread_info[thread_id].pid = pid;
          if (MPIR_being_debugged)
            {
              MPIR_proctable[thread_id].host_name = u->host_name;
              MPIR_proctable[thread_id].executable_name = u->program_name;
              MPIR_proctable[thread_id].pid = pid;
            }
        }
      else
        {
          /* error */
          perror ("fork");
          exit (2);
        }
    }

  /* We're the main process, there are child processes and they're
     all started.  Let the debugger know about that.  */
  if (MPIR_being_debugged)
    {
      MPIR_proctable_size = THREADS;
      MPIR_debug_state = MPIR_DEBUG_SPAWNED;
      /* The debugger will have set a breakpoint there...  */
      MPIR_Breakpoint ();
      /* Release threads.  */
      u->partial_attach_start = 1;
    }

  if (unlink (u->mmap_file_name) < 0)
    {
      perror ("cannot unlink global shared memory file");
      abort ();
    }
}