Example #1
static void
__upc_notify_debugger_of_abort (const char *mesg)
{
  MPIR_debug_abort_string = mesg;
  MPIR_debug_state = MPIR_DEBUG_ABORTING;
  MPIR_Breakpoint ();
}
Example #2
void tv_complete(void)
{
    const int MPIR_DEBUG_SPAWNED = 1;

    MPIR_debug_state = MPIR_DEBUG_SPAWNED;
    MPIR_Breakpoint();
    close(tv_socket);
}
Example #3
/*
 * Call this routine to signal to the debugger that the application is aborting.
 * If there is an abort message, call the MPIR_Breakpoint routine (which
 * allows a tool such as a debugger to gain control).
 */
void MPIR_DebuggerSetAborting( const char *msg )
{
    MPIR_debug_abort_string = (char *)msg;
    MPIR_debug_state        = MPIR_DEBUG_ABORTING;
#ifdef MPIU_BREAKPOINT_NEEDED
    if (msg) 
	MPIR_Breakpoint();
#endif
}
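These abort examples all follow the same MPIR Process Acquisition Interface pattern: record an abort string, set the debug state, then call MPIR_Breakpoint() so an attached tool regains control. For reference, here is a minimal sketch of the host-side declarations the snippets assume (names and values per the MPIR interface; the volatile qualifier is a common convention rather than something every implementation uses):

/* Sketch of the MPIR symbols used by the abort snippets. */
#define MPIR_NULL           0
#define MPIR_DEBUG_SPAWNED  1
#define MPIR_DEBUG_ABORTING 2

volatile int  MPIR_debug_state        = MPIR_NULL;
char         *MPIR_debug_abort_string = NULL;

/* Deliberately empty: the debugger plants a breakpoint here and reads
   MPIR_debug_state / MPIR_debug_abort_string when it is hit.  Kept
   out-of-line so the compiler cannot elide the call. */
void MPIR_Breakpoint (void) { }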
Example #4
void *
abort_debugger ( void* arg )
{
    if ( MPIR_being_debugged )
    {
        MPIR_debug_state = MPIR_DEBUG_ABORTING;
        MPIR_Breakpoint();
    }

    return NULL;
}
Example #5
/*
 * Tell the debugger that we are about to abort
 */
void ompi_debugger_notify_abort(char *reason)
{
    MPIR_debug_state = MPIR_DEBUG_ABORTING;

    if (NULL != reason && strlen(reason) > 0) {
        MPIR_debug_abort_string = reason;
    } else {
        MPIR_debug_abort_string = "Unknown";
    }

    /* Now tell the debugger */
    MPIR_Breakpoint();
}
Example #6
static void
__upc_run_this_thread (upc_info_p u, int argc, char *argv[],
		       unsigned int thread_id)
{
  int status;
  MYTHREAD = thread_id;
  /* Perform per thread initialization.  */
  __upc_per_thread_init (u);
  if (THREADS == 1)
    {
      /* A single thread is handled as a special case.
         No child process is created to run the thread. */
      MPIR_being_debugged = 0;
      /* Give the debugger a chance to pick up runtime info.  */
      MPIR_Breakpoint ();
      /* It is safe to unlink the temporary file, after the breakpoint
         is hit.  This gives the debugger a chance to open the mmap
         global memory file so that it can access UPC shared memory.  */
      if (unlink (u->mmap_file_name) < 0)
	{
	  perror ("cannot unlink global shared memory file");
	  abort ();
	}
    }
  else if (MPIR_being_debugged)
    {
      /* Wait for partial attach flag.  */
      while (!u->partial_attach_start)
	__upc_yield_cpu ();
    
      /* Wait for the debugger to acquire us.  */
      while (!MPIR_debug_gate)
	__upc_yield_cpu ();
    }
#if GUPCR_HAVE_GUM_DEBUG
  if (__upc_gum_debug)
    {
      __upc_gum_init (THREADS, thread_id);
    }
#endif
  __upc_barrier (GUPCR_RUNTIME_BARRIER_ID);
  __upc_pupc_init (&argc, &argv);
  status = GUPCR_MAIN (argc, argv);
  p_startx (GASP_UPC_COLLECTIVE_EXIT, status);
  p_endx (GASP_UPC_COLLECTIVE_EXIT, status);
  __upc_exit (status);
}
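Example #6 shows a two-stage attach gate: a runtime-specific flag (u->partial_attach_start) released by the starter process, followed by the standard MPIR_debug_gate that the debugger itself sets once it has acquired the thread. A condensed sketch of that second stage, assuming the declarations above (the usleep interval is illustrative, standing in for __upc_yield_cpu):

#include <unistd.h>

volatile int MPIR_being_debugged = 0;  /* set by the launcher/debugger */
volatile int MPIR_debug_gate     = 0;  /* set to 1 by the debugger     */

static void wait_for_debugger_release (void)
{
  if (!MPIR_being_debugged)
    return;
  /* Spin until the debugger writes 1 into MPIR_debug_gate. */
  while (!MPIR_debug_gate)
    usleep (1000);  /* yield the CPU instead of spinning hard */
}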
Example #7
void *
setup_debugger ( void* arg )
{
    int i;
    char *tmphn;
    MPIR_proctable_size = myopt.pcount;

    if ( MPIR_being_debugged )
    {
        MPIR_debug_state = MPIR_DEBUG_SPAWNED;
        MPIR_proctable = (MPIR_PROCDESC*)
                         malloc (MPIR_proctable_size * sizeof(MPIR_PROCDESC));
        tmphn = (char*) malloc (21);

        for (i = 0;  i < MPIR_proctable_size; i++ )
        {
            /* Zero-pad the rank to six digits,
               e.g. i == 5 yields "virtualmachine000005".  */
            snprintf ( tmphn, 21, "virtualmachine%06d", i );

            MPIR_proctable[i].host_name = strdup (tmphn);
            MPIR_proctable[i].executable_name = strdup("appX");
            MPIR_proctable[i].pid = i;
        }
        free (tmphn);
        totalview_jobid = strdup("387266");
        MPIR_Breakpoint();
    }

    return NULL;
}
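Example #7 and the ORTE/SLURM snippets that follow all populate the same per-process descriptor table. The layout assumed throughout, per the MPIR interface:

typedef struct {
  char *host_name;        /* host the process is running on */
  char *executable_name;  /* (path to the) executable image */
  int   pid;              /* process id on that host        */
} MPIR_PROCDESC;

MPIR_PROCDESC *MPIR_proctable      = NULL;  /* one entry per rank */
int            MPIR_proctable_size = 0;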
Example #8
extern int launch_p_step_launch(
	srun_job_t *job, slurm_step_io_fds_t *cio_fds, uint32_t *global_rc,
	slurm_step_launch_callbacks_t *step_callbacks)
{
	slurm_step_launch_params_t launch_params;
	slurm_step_launch_callbacks_t callbacks;
	int rc = 0;
	bool first_launch = false;

	slurm_step_launch_params_t_init(&launch_params);
	memcpy(&callbacks, step_callbacks, sizeof(callbacks));

	if (!task_state) {
		task_state = task_state_create(job->ntasks);
		local_srun_job = job;
		local_global_rc = global_rc;
		first_launch = true;
	} else
		task_state_alter(task_state, job->ntasks);

	launch_params.gid = opt.gid;
	launch_params.alias_list = job->alias_list;
	launch_params.argc = opt.argc;
	launch_params.argv = opt.argv;
	launch_params.multi_prog = opt.multi_prog ? true : false;
	launch_params.cwd = opt.cwd;
	launch_params.slurmd_debug = opt.slurmd_debug;
	launch_params.buffered_stdio = !opt.unbuffered;
	launch_params.labelio = opt.labelio ? true : false;
	launch_params.remote_output_filename = fname_remote_string(job->ofname);
	launch_params.remote_input_filename = fname_remote_string(job->ifname);
	launch_params.remote_error_filename = fname_remote_string(job->efname);
	launch_params.task_prolog = opt.task_prolog;
	launch_params.task_epilog = opt.task_epilog;
	launch_params.cpu_bind = opt.cpu_bind;
	launch_params.cpu_bind_type = opt.cpu_bind_type;
	launch_params.mem_bind = opt.mem_bind;
	launch_params.mem_bind_type = opt.mem_bind_type;
	launch_params.open_mode = opt.open_mode;
	if (opt.acctg_freq >= 0)
		launch_params.acctg_freq = opt.acctg_freq;
	launch_params.pty = opt.pty;
	if (opt.cpus_set)
		launch_params.cpus_per_task	= opt.cpus_per_task;
	else
		launch_params.cpus_per_task	= 1;
	launch_params.cpu_freq          = opt.cpu_freq;
	launch_params.task_dist         = opt.distribution;
	launch_params.ckpt_dir		= opt.ckpt_dir;
	launch_params.restart_dir       = opt.restart_dir;
	launch_params.preserve_env      = opt.preserve_env;
	launch_params.spank_job_env     = opt.spank_job_env;
	launch_params.spank_job_env_size = opt.spank_job_env_size;
	launch_params.user_managed_io   = opt.user_managed_io;

	memcpy(&launch_params.local_fds, cio_fds, sizeof(slurm_step_io_fds_t));

	if (MPIR_being_debugged) {
		launch_params.parallel_debug = true;
		pmi_server_max_threads(1);
	} else {
		launch_params.parallel_debug = false;
	}
	/* Normally this isn't used, but if an outside process other
	   than srun (e.g. poe) is using this logic to launch tasks,
	   then we can use this to signal the step.
	*/
	callbacks.task_start = _task_start;
	/* If poe is using this code with multi-prog, it always returns
	   1 for each task, which could be confusing since no real
	   error happened.
	*/
	if (!launch_params.multi_prog
	    || (!callbacks.step_signal
		|| (callbacks.step_signal == launch_g_fwd_signal))) {
		callbacks.task_finish = _task_finish;
	}

	mpir_init(job->ctx_params.task_count);

	update_job_state(job, SRUN_JOB_LAUNCHING);
	launch_start_time = time(NULL);
	if (first_launch) {
		if (slurm_step_launch(job->step_ctx, &launch_params,
				      &callbacks) != SLURM_SUCCESS) {
			rc = errno;
			*local_global_rc = errno;
			error("Application launch failed: %m");
			slurm_step_launch_abort(job->step_ctx);
			slurm_step_launch_wait_finish(job->step_ctx);
			goto cleanup;
		}
	} else {
		if (slurm_step_launch_add(job->step_ctx, &launch_params,
					  job->nodelist, job->fir_nodeid)
		    != SLURM_SUCCESS) {
			rc = errno;
			*local_global_rc = errno;
			error("Application launch add failed: %m");
			slurm_step_launch_abort(job->step_ctx);
			slurm_step_launch_wait_finish(job->step_ctx);
			goto cleanup;
		}
	}

	update_job_state(job, SRUN_JOB_STARTING);
	if (slurm_step_launch_wait_start(job->step_ctx) == SLURM_SUCCESS) {
		update_job_state(job, SRUN_JOB_RUNNING);
		/* Only set up MPIR structures if the step launched
		 * correctly. */
		if (opt.multi_prog)
			mpir_set_multi_name(job->ctx_params.task_count,
					    launch_params.argv[0]);
		else
			mpir_set_executable_names(launch_params.argv[0]);
		MPIR_debug_state = MPIR_DEBUG_SPAWNED;
		if (opt.debugger_test)
			mpir_dump_proctable();
		else
			MPIR_Breakpoint(job);
	} else {
		info("Job step %u.%u aborted before step completely launched.",
		     job->jobid, job->stepid);
	}

cleanup:

	return rc;
}
Example #9
/**
 * Initialization of data structures for running under a debugger
 * using the MPICH/TotalView parallel debugger interface. This stage
 * of initialization must occur after spawn.
 *
 * NOTE: We -always- perform this step to ensure that any debugger
 * that attaches to us post-launch of the application can get a
 * completed proctable
 */
void orte_debugger_init_after_spawn(orte_job_t *jdata)
{
    orte_proc_t *proc;
    orte_app_context_t *appctx;
    orte_vpid_t i, j;
    opal_buffer_t buf;
    orte_process_name_t rank0;
    int rc;

    if (MPIR_proctable) {
        /* already initialized */
        return;
    }

    /* fill in the proc table for the application processes */
    
    if (orte_debug_flag) {
        opal_output(0, "Info: Setting up debugger process table for applications\n");
    }
    
    MPIR_debug_state = 1;   /* i.e. MPIR_DEBUG_SPAWNED */
    
    /* set the total number of processes in the job */
    MPIR_proctable_size = jdata->num_procs;
    
    /* allocate MPIR_proctable */
    MPIR_proctable = (struct MPIR_PROCDESC *) malloc(sizeof(struct MPIR_PROCDESC) *
                                                     MPIR_proctable_size);
    if (MPIR_proctable == NULL) {
        opal_output(0, "Error: Out of memory\n");
        return;
    }
    
    /* initialize MPIR_proctable */
    for (j=0; j < jdata->num_procs; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
            continue;
        }
        /* store this data in the location whose index
         * corresponds to the proc's rank
         */
        i = proc->name.vpid;
        if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
            continue;
        }
        
        MPIR_proctable[i].host_name = strdup(proc->node->name);
        if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) { 
            MPIR_proctable[i].executable_name = 
            opal_os_path( false, appctx->app, NULL ); 
        } else {
            MPIR_proctable[i].executable_name =
            opal_os_path( false, appctx->cwd, appctx->app, NULL ); 
        } 
        MPIR_proctable[i].pid = proc->pid;
    }

    if (orte_debug_flag) {
        dump();
    }

    /* if we are being launched under a debugger, then we must wait
     * for it to be ready to go and do some things to start the job
     */
    if (MPIR_being_debugged) {
        /* wait for all procs to have reported their contact info - this
         * ensures that (a) they are all into mpi_init, and (b) the system
         * has the contact info to successfully send a message to rank=0
         */
        ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs);
        
        (void) MPIR_Breakpoint();
        
        /* send a message to rank=0 to release it */
        OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */
        rank0.jobid = jdata->jobid;
        rank0.vpid = 0;
        if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
            opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc));
        }
        OBJ_DESTRUCT(&buf);
    }
}
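The dump() call above is ORTE-internal. A hedged, illustrative equivalent built only on the MPIR structures sketched after Example #7 (dump_proctable is a hypothetical helper, not the actual ORTE routine):

#include <stdio.h>

static void dump_proctable (void)
{
    int i;
    /* Print one line per rank from the published proctable. */
    for (i = 0; i < MPIR_proctable_size; i++) {
        fprintf (stderr, "  rank %d: host=%s exe=%s pid=%d\n", i,
                 MPIR_proctable[i].host_name,
                 MPIR_proctable[i].executable_name,
                 MPIR_proctable[i].pid);
    }
}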
Example #10
static void check_debugger(int fd, short event, void *arg)
{
    struct timeval now;
    opal_event_t *tmp = (opal_event_t*)arg;
    orte_job_t *jdata;
    orte_app_context_t *app;
    char cwd[OPAL_PATH_MAX];
    int rc;
    int32_t ljob;

    if (MPIR_being_debugged) {
        if (orte_debug_flag) {
            opal_output(0, "%s Launching debugger %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        MPIR_executable_path);
        }
        
        /* a debugger has attached! All the MPIR_Proctable
         * data is already available, so we only need to
         * check to see if we should spawn any daemons
         */
        if ('\0' != MPIR_executable_path[0]) {
            /* this will be launched just like a regular job,
             * so we do not use the global orte_debugger_daemon
             * as this is reserved for co-location upon startup
             */
            jdata = OBJ_NEW(orte_job_t);
            /* create a jobid for these daemons - this is done solely
             * to avoid confusing the rest of the system's bookkeeping
             */
            orte_plm_base_create_jobid(jdata);
            /* flag the job as being debugger daemons */
            jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
            /* unless directed, we do not forward output */
            if (!MPIR_forward_output) {
                jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
            }
            /* set the mapping policy to "pernode" so we only get
             * one debugger daemon on each node
             */
            jdata->map = OBJ_NEW(orte_job_map_t);
            jdata->map->npernode = 1;
            /* add it to the global job pool */
            ljob = ORTE_LOCAL_JOBID(jdata->jobid);
            opal_pointer_array_set_item(orte_job_data, ljob, jdata);
            /* create an app_context for the debugger daemon */
            app = OBJ_NEW(orte_app_context_t);
            app->app = strdup((char*)MPIR_executable_path);
            if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
                orte_show_help("help-orterun.txt", "orterun:init-failure",
                               true, "get the cwd", rc);
                OBJ_RELEASE(jdata);
                goto RELEASE;
            }
            app->cwd = strdup(cwd);
            app->user_specified_cwd = false;
            opal_argv_append_nosize(&app->argv, app->app);
            build_debugger_args(app);
            opal_pointer_array_add(jdata->apps, &app->super);
            jdata->num_apps = 1;
            /* now go ahead and spawn this job */
            if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        
    RELEASE:
        /* notify the debugger that all is ready */
        MPIR_Breakpoint();

    } else {
        /* reissue the timer to wake us up again */
        now.tv_sec = orte_debugger_check_rate;
        now.tv_usec = 0;
        opal_evtimer_add(tmp, &now);
    }
}
Example #11
/**
 * Initialization of data structures for running under a debugger
 * using the MPICH/TotalView parallel debugger interface. This stage
 * of initialization must occur after stage2 of spawn and is invoked
 * via a callback.
 * 
 * @param jobid  The jobid returned by spawn.
 */
void orte_totalview_init_after_spawn(orte_jobid_t jobid)
{
    orte_job_map_t *map;
    opal_list_item_t *item, *item2;
    orte_mapped_node_t *node;
    orte_mapped_proc_t *proc;
    orte_app_context_t *appctx;
    orte_std_cntr_t i;
    int rc;

    if (MPIR_proctable) {
        /* already initialized */
        return;
    }

    if (0) { /* debugging daemons <<-- needs work */

        if (orte_debug_flag) {
            opal_output(0, "Info: Setting up debugger process table for daemons\n");
        }

    } else {

        /*
         * Debugging applications or not being debugged.
         *
         * Either way, fill in the proc table for the application
         * processes in case someone attaches later.
         */

        if (orte_debug_flag) {
            opal_output(0, "Info: Setting up debugger process table for applications\n");
        }

        MPIR_debug_state = 1;   /* i.e. MPIR_DEBUG_SPAWNED */

        /* Get the resource map for this job */

        rc = orte_rmaps.get_job_map(&map, jobid);
        if (ORTE_SUCCESS != rc) {
            opal_output(0, "Error: Can't get resource map\n");
            ORTE_ERROR_LOG(rc);
        }

        /* find the total number of processes in the job */

        for (i=0; i < map->num_apps; i++) {
            MPIR_proctable_size += map->apps[i]->num_procs;
        }

        /* allocate MPIR_proctable */

        MPIR_proctable = (struct MPIR_PROCDESC *) malloc(sizeof(struct MPIR_PROCDESC) *
                                                         MPIR_proctable_size);
        if (MPIR_proctable == NULL) {
            opal_output(0, "Error: Out of memory\n");
            OBJ_RELEASE(map);
            return;
        }

        /* initialize MPIR_proctable */

        for (item =  opal_list_get_first(&map->nodes);
             item != opal_list_get_end(&map->nodes);
             item =  opal_list_get_next(item)) {
            node = (orte_mapped_node_t*)item;
            
            for (item2 = opal_list_get_first(&node->procs);
                 item2 != opal_list_get_end(&node->procs);
                 item2 = opal_list_get_next(item2)) {
                proc = (orte_mapped_proc_t*)item2;
                appctx = map->apps[proc->app_idx];
                /* store this data in the location whose index
                 * corresponds to the proc's rank
                 */
                i = proc->rank;
                
                MPIR_proctable[i].host_name = strdup(node->nodename);
                if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) {
                   MPIR_proctable[i].executable_name =
                     opal_os_path( false, appctx->app, NULL );
                } else {
                   MPIR_proctable[i].executable_name =
                     opal_os_path( false, appctx->cwd, appctx->app, NULL );
                }
                MPIR_proctable[i].pid = proc->pid;
            }
        }
        OBJ_RELEASE(map);
    }

    if (orte_debug_flag) {
        dump();
    }

    (void) MPIR_Breakpoint();
}
Example #12
/*
 * complete_spawn - Tell the debugger that all the information is ready to be consumed.
 */
PyObject *complete_spawn (void)
{
  MPIR_debug_state = MPIR_DEBUG_SPAWNED;
  MPIR_Breakpoint();
  return Py_BuildValue("");    /* same as None */
} /* complete_spawn */
Example #13
static void attach_debugger(int fd, short event, void *arg)
{
    orte_app_context_t *app;
    unsigned char fifo_cmd;
    int rc;
    int32_t ljob;
    orte_job_t *jdata;

    /* read the file descriptor to clear that event, if necessary */
    if (fifo_active) {
        opal_event_del(&attach);
        fifo_active = false;

        rc = read(attach_fd, &fifo_cmd, sizeof(fifo_cmd));
        if (rc <= 0) {
            /* EOF or error: reopen the fifo to clear the hangup */
            open_fifo();
            return;
        }
        if (1 != fifo_cmd) {
            /* ignore the cmd */
            goto RELEASE;
        }
    }

    if (!MPIR_being_debugged && !orte_debugger_base.test_attach) {
        /* false alarm */
        goto RELEASE;
    }

    opal_output_verbose(1, orte_debugger_base.output,
                        "%s Attaching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == orte_debugger_base.test_daemon) ? MPIR_executable_path : orte_debugger_base.test_daemon);

    /* a debugger has attached! All the MPIR_Proctable
     * data is already available, so we only need to
     * check to see if we should spawn any daemons
     */
    if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_base.test_daemon) {
        /* can only have one debugger */
        if (NULL != orte_debugger_daemon) {
            opal_output(0, "-------------------------------------------\n"
                        "Only one debugger can be used on a job.\n"
                        "-------------------------------------------\n");
            goto RELEASE;
        }
        opal_output_verbose(2, orte_debugger_base.output,
                            "%s Spawning debugger daemons %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            (NULL == orte_debugger_base.test_daemon) ?
                            MPIR_executable_path : orte_debugger_base.test_daemon);
        /* this will be launched just like a regular job,
         * so we do not use the global orte_debugger_daemon
         * as this is reserved for co-location upon startup
         */
        jdata = OBJ_NEW(orte_job_t);
        /* create a jobid for these daemons - this is done solely
         * to avoid confusing the rest of the system's bookkeeping
         */
        orte_plm_base_create_jobid(jdata);
        /* flag the job as being debugger daemons */
        jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
        /* unless directed, we do not forward output */
        if (!MPIR_forward_output) {
            jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
        }
        /* add it to the global job pool */
        ljob = ORTE_LOCAL_JOBID(jdata->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, jdata);
        /* create an app_context for the debugger daemon */
        app = OBJ_NEW(orte_app_context_t);
        if (NULL != orte_debugger_base.test_daemon) {
            app->app = strdup(orte_debugger_base.test_daemon);
        } else {
            app->app = strdup((char*)MPIR_executable_path);
        }

        jdata->state = ORTE_JOB_STATE_INIT;

        opal_argv_append_nosize(&app->argv, app->app);
        build_debugger_args(app);
        opal_pointer_array_add(jdata->apps, app);
        jdata->num_apps = 1;
        /* setup the mapping policy to bynode so we get one
         * daemon on each node
         */
        jdata->map = OBJ_NEW(orte_job_map_t);
        jdata->map->policy = ORTE_MAPPING_BYNODE;
        jdata->map->npernode = 1;
        /* now go ahead and spawn this job */
        if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
            ORTE_ERROR_LOG(rc);
        }
    }
        
 RELEASE:
    /* reset the read or timer event */
    if (0 == orte_debugger_mpirx_check_rate) {
        fifo_active = true;
        opal_event_add(&attach, 0);
    } else if (!MPIR_being_debugged) {
        ORTE_TIMER_EVENT(orte_debugger_mpirx_check_rate, 0, attach_debugger);
    }

    /* notify the debugger that all is ready */
    MPIR_Breakpoint();
}
Example #14
/*
 * If MPICH is built with the --enable-debugger option, MPI_Init and 
 * MPI_Init_thread will call MPIR_WaitForDebugger.  This ensures both that
 * the debugger can gather information on the MPI job before MPI_Init
 * returns to the user and that the necessary symbols for providing
 * information such as message queues are available.
 *
 * In addition, the environment variable MPIEXEC_DEBUG, if set, will cause
 * all MPI processes to wait in this routine until the variable 
 * MPIR_debug_gate is set to 1.
 */
void MPIR_WaitForDebugger( void )
{
#ifdef MPIU_PROCTABLE_NEEDED
    int rank = MPIR_Process.comm_world->rank;
#if defined(FINEGRAIN_MPI)
    int size = MPIR_Process.comm_world->num_osprocs;
#else
    int size = MPIR_Process.comm_world->local_size;
#endif
    int i, maxsize;

    /* FIXME: In MPICH, the executables may not have the information
       on the other processes; this is part of the Process Manager Interface
       (PMI).  We need another way to provide this information to 
       a debugger */
    /* The process manager probably has all of this data - the MPI2 
       debugger interface API provides (at least originally) a way 
       to access this. */
    /* Also, to avoid scaling problems, we only populate the first 64
       entries (default) */
    maxsize = MPIR_CVAR_PROCTABLE_SIZE;
    if (maxsize > size) maxsize = size;

    if (rank == 0) {
	char hostname[MPI_MAX_PROCESSOR_NAME+1];
	int  hostlen;
	int  val;

	MPIR_proctable    = (MPIR_PROCDESC *)MPIU_Malloc( 
					 size * sizeof(MPIR_PROCDESC) );
	for (i=0; i<size; i++) {
	    /* Initialize the proctable */
	    MPIR_proctable[i].host_name       = 0;
	    MPIR_proctable[i].executable_name = 0;
	    MPIR_proctable[i].pid             = -1;
	}

	PMPI_Get_processor_name( hostname, &hostlen );
	MPIR_proctable[0].host_name       = (char *)MPIU_Strdup( hostname );
	MPIR_proctable[0].executable_name = 0;
	MPIR_proctable[0].pid             = getpid();

	for (i=1; i<maxsize; i++) {
	    int msg[2];
	    PMPI_Recv( msg, 2, MPI_INT, i, 0, MPI_COMM_WORLD,MPI_STATUS_IGNORE);
	    MPIR_proctable[i].pid = msg[1];
	    MPIR_proctable[i].host_name = (char *)MPIU_Malloc( msg[0] + 1 );
	    PMPI_Recv( MPIR_proctable[i].host_name, msg[0]+1, MPI_CHAR, 
		       i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
	    MPIR_proctable[i].host_name[msg[0]] = 0;
	}

	MPIR_proctable_size               = size;
	/* Debugging hook */
	if (MPIR_CVAR_PROCTABLE_PRINT) {
	    for (i=0; i<maxsize; i++) {
		printf( "PT[%d].pid = %d, .host_name = %s\n", 
			i, MPIR_proctable[i].pid, MPIR_proctable[i].host_name );
	    }
	    fflush( stdout );
	}
	MPIR_Add_finalize( MPIR_FreeProctable, MPIR_proctable, 0 );
    }
    else {
	char hostname[MPI_MAX_PROCESSOR_NAME+1];
	int  hostlen;
	int  mypid = getpid();
	int  msg[2];
	if (rank < maxsize) {
	    PMPI_Get_processor_name( hostname, &hostlen );
	    msg[0] = hostlen;
	    msg[1] = mypid;
	    
	    /* Deliver to the root process the proctable information */
	    PMPI_Ssend( msg, 2, MPI_INT, 0, 0, MPI_COMM_WORLD );
	    PMPI_Ssend( hostname, hostlen, MPI_CHAR, 0, 0, MPI_COMM_WORLD );
	}
    }
#endif /* MPIU_PROCTABLE_NEEDED */

    /* Put the breakpoint after setting up the proctable */
    MPIR_debug_state    = MPIR_DEBUG_SPAWNED;
#ifdef MPIU_BREAKPOINT_NEEDED
    (void)MPIR_Breakpoint();
#endif
    /* After we exit the MPIR_Breakpoint routine, the debugger may have
       set variables such as MPIR_being_debugged */

    /* Initialize the sendq support */
    SendqInit();

    if (getenv("MPIEXEC_DEBUG")) {
	while (!MPIR_debug_gate) ;
    }
}
Example #15
static int
__upc_monitor_threads (void)
{
  upc_info_p u = __upc_info;
  pid_t pid;
  int wait_status;
  int exit_status;
  int thread_id;
  int global_exit_invoked;
  struct sigaction action;
  exit_status = -1;
  global_exit_invoked = 0;
  /* Install SIGTERM handler responsible for
     terminating the whole program.  */
  action.sa_handler = __upc_sigterm_handler;
  sigemptyset (&action.sa_mask);
  action.sa_flags = 0;
  sigaction (SIGTERM, &action, NULL);
  /* Wait for threads to finish.  */
  for (;;)
    {
      pid = waitpid (-1, &wait_status, WNOHANG);
      /* Check for errors.  */
      if (pid == -1)
	{
	  /* Continue checking if interrupted
	     (handling other signals).  */
	  if (errno == EINTR)
	    continue;
	  /* Stop waiting if no more children.  */
	  if (errno == ECHILD)
	    break;
	  /* Abort if invalid argument.  */
	  if (errno == EINVAL)
	    {
	      perror ("waitpid");
	      abort ();
	    }
	}
      /* Not a child exit?  */
      if (pid == 0)
	{
	  /* Check for debugger attach.  */
	  MPIR_Breakpoint ();
	  /* Release the CPU for 100 ms and continue checking.  */
	  usleep (100000);
	  continue;
	}
      /* Check for child process that exited.  */
      thread_id = __upc_get_thread_id (pid);
      if (!global_exit_invoked && WIFEXITED (wait_status))
	{
	  int child_exit = WEXITSTATUS (wait_status);
	  if (child_exit & 0x80)
	    {
	      /* By convention, the result of a call to upc_global_exit
	         has the high bit in the byte set.
	         Terminate all the other threads in the program. */
	      int t;
	      for (t = 0; t < THREADS; ++t)
		{
		  int tpid = u->thread_info[t].pid;
		  if (tpid <= 0)
		    abort ();
		  if (t != thread_id)
		    (void) kill (tpid, SIGKILL);
		}
	      child_exit &= 0x7f;
	      global_exit_invoked = 1;
	    }
	  else if ((exit_status != -1) && exit_status != child_exit)
	    {
	      fprintf (stderr, "conflicting exit status (%d) for"
		       " thread %d\n", child_exit, thread_id);
	    }
	  exit_status = child_exit;
	}
      else if (WIFSIGNALED (wait_status))
	{
	  int child_sig = WTERMSIG (wait_status);
	  /* Ignore SIGKILL signals.
	     We use them to implement upc_global_exit(). */
	  if (child_sig == SIGKILL && global_exit_invoked)
	    continue;
	  fprintf (stderr, "thread %d terminated with signal: '%s'\n",
		   thread_id, __upc_strsignal (child_sig));
          /*  GASP note: We can't record a noncollective GASP
              exit event here, because the process has already died.  */
	  /* We'll all go away now. */
	  if (killpg (getpid (), SIGTERM) == -1)
	    {
	      perror ("killpg");
	      exit (-1);
	    }
	}
    }
  return exit_status;
}
Example #16
/* Implement UPC threads as processes. */
static void
__upc_run_threads (upc_info_p u, int argc, char *argv[])
{
  int thread_id;
  int flag;

  /* Set O_APPEND on stdout and stderr (see Berkeley UPC bug 2136). */
  flag = fcntl (STDOUT_FILENO, F_GETFL, 0);
  if (flag >= 0)
    (void) fcntl (STDOUT_FILENO, F_SETFL, flag | O_APPEND);
  flag = fcntl (STDERR_FILENO, F_GETFL, 0);
  if (flag >= 0)
    (void) fcntl (STDERR_FILENO, F_SETFL, flag | O_APPEND);

  if (THREADS == 1)
    {
#if GUPCR_HAVE_OMP_CHECKS
      __upc_omp_master_id = pthread_self ();
#endif
      __upc_affinity_set (u, 0);
      __upc_run_this_thread (u, argc, argv, 0);
      /* Shouldn't get here.  */
      abort ();
    }

  /* In case a debugger is using the value:
     we don't want it to see two thread zeros.  */
  MYTHREAD = -1;
  /* Allocate space to tell the debugger about
     the processes we're creating.  */
  MPIR_proctable = malloc (THREADS * sizeof (*MPIR_proctable));
  /* Tell the debugger this process is a starter process.  */
  MPIR_i_am_starter ();
  for (thread_id = 0; thread_id < THREADS; ++thread_id)
    {
      pid_t pid = fork ();
      if (pid == 0)
	{
	  /* child */
#if GUPCR_HAVE_OMP_CHECKS
          __upc_omp_master_id = pthread_self ();
#endif
	  __upc_affinity_set (u, thread_id);
	  __upc_run_this_thread (u, argc, argv, thread_id);
	}
      else if (pid > 0)
	{
	  /* parent */
	  u->thread_info[thread_id].pid = pid;
	  if (MPIR_being_debugged)
	    {
	      MPIR_proctable[thread_id].host_name = u->host_name;
	      MPIR_proctable[thread_id].executable_name = u->program_name;
	      MPIR_proctable[thread_id].pid = pid;
	    }
	}
      else
	{
	  /* error */
	  perror ("fork");
	  exit (2);
	}
    }
  /* We're the main process; the child processes have all been started.
   * Let the debugger know about that.
   */
  if (MPIR_being_debugged)
    {
      MPIR_proctable_size = THREADS;
      MPIR_debug_state = MPIR_DEBUG_SPAWNED;
      /* The debugger will have set a breakpoint there... */
      MPIR_Breakpoint ();
      /* Release threads.  */
      u->partial_attach_start = 1;
    }
  if (unlink (u->mmap_file_name) < 0)
    {
      perror ("cannot unlink global shared memory file");
      abort ();
    }
}
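Putting the pieces together: a minimal, self-contained starter in the spirit of Example #16, which forks a few workers, publishes the proctable, marks the state as spawned, and hits the breakpoint. The MPIR symbols follow the interface sketched earlier; the worker count and the sleep payload are purely illustrative.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

typedef struct {
  char *host_name;
  char *executable_name;
  int   pid;
} MPIR_PROCDESC;

#define MPIR_DEBUG_SPAWNED 1

MPIR_PROCDESC *MPIR_proctable;
int            MPIR_proctable_size;
volatile int   MPIR_debug_state;

/* The debugger sets a breakpoint on this empty routine. */
void MPIR_Breakpoint (void) { }

int
main (int argc, char *argv[])
{
  enum { NWORKERS = 4 };
  char host[256];
  int i;

  (void) argc;
  if (gethostname (host, sizeof (host)) < 0)
    strcpy (host, "localhost");
  MPIR_proctable = malloc (NWORKERS * sizeof (*MPIR_proctable));
  for (i = 0; i < NWORKERS; i++)
    {
      pid_t pid = fork ();
      if (pid < 0)
        {
          perror ("fork");
          exit (2);
        }
      if (pid == 0)
        {
          /* Worker: real per-rank work would go here.  */
          sleep (1);
          _exit (0);
        }
      /* Parent: record the child for the debugger.  */
      MPIR_proctable[i].host_name = strdup (host);
      MPIR_proctable[i].executable_name = strdup (argv[0]);
      MPIR_proctable[i].pid = (int) pid;
    }
  MPIR_proctable_size = NWORKERS;
  MPIR_debug_state = MPIR_DEBUG_SPAWNED;
  MPIR_Breakpoint ();  /* an attached debugger can now read the proctable */
  while (wait (NULL) > 0)
    ;
  free (MPIR_proctable);
  return 0;
}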