/**
 * Pre-spawn debugger setup for the MPICH/TotalView (MPIR) parallel
 * debugger interface.
 *
 * If neither MPIR_being_debugged nor orte_in_parallel_debugger is set,
 * the only action taken is to (optionally) arm a timer that runs
 * check_debugger so a later attach can be noticed.  Otherwise each app
 * context of the job gets the "in_parallel_debugger" MCA environment
 * variable set to "1", and - when MPIR_executable_path is non-empty - a
 * separate debugger-daemon job is created and stored in the global
 * orte_debugger_daemon for co-spawning.
 *
 * @param jdata  job that is about to be spawned; its app contexts have
 *               their environments modified in place.
 */
void orte_debugger_init_before_spawn(orte_job_t *jdata)
{
    char *debug_env_var;
    orte_app_context_t **app_list, *daemon_app;
    orte_std_cntr_t idx;
    int32_t local_jobid;

    if (!(MPIR_being_debugged || orte_in_parallel_debugger)) {
        /* no debugger present right now - optionally poll for a
         * debugger that attaches while the job is running
         */
        if (orte_enable_debug_cospawn_while_running) {
            ORTE_TIMER_EVENT(orte_debugger_check_rate, 0, check_debugger);
        }
        return;
    }

    if (orte_debug_flag) {
        opal_output(0, "Info: Spawned by a debugger");
    }

    /* let every application know it is running under a debugger */
    app_list = (orte_app_context_t**)jdata->apps->addr;
    debug_env_var = mca_base_param_environ_variable("orte", "in_parallel_debugger", NULL);
    idx = 0;
    while (idx < jdata->num_apps) {
        opal_setenv(debug_env_var, "1", true, &app_list[idx]->env);
        ++idx;
    }
    free(debug_env_var);

    /* no daemon executable was named, so there is nothing to co-spawn */
    if ('\0' == MPIR_executable_path[0]) {
        return;
    }

    /* build the job object that describes the debugger daemons; it gets
     * its own jobid purely so the rest of the bookkeeping stays sane
     */
    orte_debugger_daemon = OBJ_NEW(orte_job_t);
    orte_plm_base_create_jobid(orte_debugger_daemon);

    /* mark it as a debugger-daemon job */
    orte_debugger_daemon->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;

    /* output forwarding is opt-in */
    if (!MPIR_forward_output) {
        orte_debugger_daemon->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
    }

    /* register the job in the global job pool */
    local_jobid = ORTE_LOCAL_JOBID(orte_debugger_daemon->jobid);
    opal_pointer_array_set_item(orte_job_data, local_jobid, orte_debugger_daemon);

    /* describe the daemon executable and its arguments */
    daemon_app = OBJ_NEW(orte_app_context_t);
    daemon_app->app = strdup((char*)MPIR_executable_path);
    opal_argv_append_nosize(&daemon_app->argv, daemon_app->app);
    build_debugger_args(daemon_app);
    opal_pointer_array_add(orte_debugger_daemon->apps, &daemon_app->super);
    orte_debugger_daemon->num_apps = 1;
}
static void attach_debugger(int fd, short event, void *arg) { orte_app_context_t *app; unsigned char fifo_cmd; int rc; int32_t ljob; orte_job_t *jdata; /* read the file descriptor to clear that event, if necessary */ if (fifo_active) { opal_event_del(&attach); fifo_active = false; rc = read(attach_fd, &fifo_cmd, sizeof(fifo_cmd)); if (!rc) { /* reopen device to clear hangup */ open_fifo(); return; } if (1 != fifo_cmd) { /* ignore the cmd */ goto RELEASE; } } if (!MPIR_being_debugged && !orte_debugger_base.test_attach) { /* false alarm */ goto RELEASE; } opal_output_verbose(1, orte_debugger_base.output, "%s Attaching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_debugger_base.test_daemon) ? MPIR_executable_path : orte_debugger_base.test_daemon); /* a debugger has attached! All the MPIR_Proctable * data is already available, so we only need to * check to see if we should spawn any daemons */ if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_base.test_daemon) { /* can only have one debugger */ if (NULL != orte_debugger_daemon) { opal_output(0, "-------------------------------------------\n" "Only one debugger can be used on a job.\n" "-------------------------------------------\n"); goto RELEASE; } opal_output_verbose(2, orte_debugger_base.output, "%s Spawning debugger daemons %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_debugger_base.test_daemon) ? 
MPIR_executable_path : orte_debugger_base.test_daemon); /* this will be launched just like a regular job, * so we do not use the global orte_debugger_daemon * as this is reserved for co-location upon startup */ jdata = OBJ_NEW(orte_job_t); /* create a jobid for these daemons - this is done solely * to avoid confusing the rest of the system's bookkeeping */ orte_plm_base_create_jobid(jdata); /* flag the job as being debugger daemons */ jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON; /* unless directed, we do not forward output */ if (!MPIR_forward_output) { jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT; } /* add it to the global job pool */ ljob = ORTE_LOCAL_JOBID(jdata->jobid); opal_pointer_array_set_item(orte_job_data, ljob, jdata); /* create an app_context for the debugger daemon */ app = OBJ_NEW(orte_app_context_t); if (NULL != orte_debugger_base.test_daemon) { app->app = strdup(orte_debugger_base.test_daemon); } else { app->app = strdup((char*)MPIR_executable_path); } jdata->state = ORTE_JOB_STATE_INIT; opal_argv_append_nosize(&app->argv, app->app); build_debugger_args(app); opal_pointer_array_add(jdata->apps, app); jdata->num_apps = 1; /* setup the mapping policy to bynode so we get one * daemon on each node */ jdata->map = OBJ_NEW(orte_job_map_t); jdata->map->policy = ORTE_MAPPING_BYNODE; jdata->map->npernode = 1; /* now go ahead and spawn this job */ if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) { ORTE_ERROR_LOG(rc); } } RELEASE: /* reset the read or timer event */ if (0 == orte_debugger_mpirx_check_rate) { fifo_active = true; opal_event_add(&attach, 0); } else if (!MPIR_being_debugged) { ORTE_TIMER_EVENT(orte_debugger_mpirx_check_rate, 0, attach_debugger); } /* notify the debugger that all is ready */ MPIR_Breakpoint(); }
static void check_debugger(int fd, short event, void *arg) { struct timeval now; opal_event_t *tmp = (opal_event_t*)arg; orte_job_t *jdata; orte_app_context_t *app; char cwd[OPAL_PATH_MAX]; int rc; int32_t ljob; if (MPIR_being_debugged) { if (orte_debug_flag) { opal_output(0, "%s Launching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), MPIR_executable_path); } /* a debugger has attached! All the MPIR_Proctable * data is already available, so we only need to * check to see if we should spawn any daemons */ if ('\0' != MPIR_executable_path[0]) { /* this will be launched just like a regular job, * so we do not use the global orte_debugger_daemon * as this is reserved for co-location upon startup */ jdata = OBJ_NEW(orte_job_t); /* create a jobid for these daemons - this is done solely * to avoid confusing the rest of the system's bookkeeping */ orte_plm_base_create_jobid(jdata); /* flag the job as being debugger daemons */ jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON; /* unless directed, we do not forward output */ if (!MPIR_forward_output) { jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT; } /* set the mapping policy to "pernode" so we only get * one debugger daemon on each node */ jdata->map = OBJ_NEW(orte_job_map_t); jdata->map->npernode = 1; /* add it to the global job pool */ ljob = ORTE_LOCAL_JOBID(jdata->jobid); opal_pointer_array_set_item(orte_job_data, ljob, jdata); /* create an app_context for the debugger daemon */ app = OBJ_NEW(orte_app_context_t); app->app = strdup((char*)MPIR_executable_path); if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) { orte_show_help("help-orterun.txt", "orterun:init-failure", true, "get the cwd", rc); OBJ_RELEASE(jdata); goto RELEASE; } app->cwd = strdup(cwd); app->user_specified_cwd = false; opal_argv_append_nosize(&app->argv, app->app); build_debugger_args(app); opal_pointer_array_add(jdata->apps, &app->super); jdata->num_apps = 1; /* now go ahead and spawn this job */ if (ORTE_SUCCESS != (rc = 
orte_plm.spawn(jdata))) { ORTE_ERROR_LOG(rc); } } RELEASE: /* notify the debugger that all is ready */ MPIR_Breakpoint(); } else { /* reissue the timer to wake us up again */ now.tv_sec = orte_debugger_check_rate; now.tv_usec = 0; opal_evtimer_add(tmp, &now); } }
/**
 * Initialization of data structures for running under a debugger
 * using an extended MPICH/TotalView parallel debugger interface. Before the
 * spawn we need to check if we are being run under a TotalView-like
 * debugger; if so then inform applications via an MCA parameter.
 *
 * When no debugger is present and no test daemon is configured, this
 * only arranges for later attach detection: either a periodic timer
 * (orte_debugger_mpirx_check_rate > 0) or an attach FIFO created in the
 * job session directory whose path is published in MPIR_attach_fifo.
 *
 * Otherwise every non-NULL app context of the job gets the
 * "in_parallel_debugger" MCA environment variable set to "1", and, if a
 * daemon executable is known, a debugger-daemon job is created and
 * stored in the global orte_debugger_daemon for co-spawning.
 *
 * @param jdata  job that is about to be spawned; its app contexts have
 *               their environments modified in place.
 */
void init_before_spawn(orte_job_t *jdata)
{
    char *env_name;
    orte_app_context_t *app;
    int i;
    int32_t ljob;
    char *attach_fifo;

    if (!MPIR_being_debugged && !orte_in_parallel_debugger) {
        /* if we were given a test debugger, then we still want to
         * colaunch it
         */
        if (NULL != orte_debugger_base.test_daemon) {
            opal_output_verbose(2, orte_debugger_base.output,
                                "%s Debugger test daemon specified: %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                orte_debugger_base.test_daemon);
            goto launchit;
        }
        /* if we were given an auto-detect rate, then we want to setup
         * an event so we periodically do the check
         */
        if (0 < orte_debugger_mpirx_check_rate) {
            opal_output_verbose(2, orte_debugger_base.output,
                                "%s Setting debugger attach check rate for %d seconds",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                orte_debugger_mpirx_check_rate);
            ORTE_TIMER_EVENT(orte_debugger_mpirx_check_rate, 0, attach_debugger);
        } else {
            /* create the attachment FIFO and put it into MPIR, setup readevent */
            /* create a FIFO name in the session dir */
            attach_fifo = opal_os_path(false, orte_process_info.job_session_dir,
                                       "debugger_attach_fifo", NULL);
            /* an already-existing FIFO is fine - we simply reuse it */
            if ((mkfifo(attach_fifo, FILE_MODE) < 0) && errno != EEXIST) {
                opal_output(0, "CANNOT CREATE FIFO %s: errno %d", attach_fifo, errno);
                free(attach_fifo);
                return;
            }
            /* strncpy does not terminate the destination when the source
             * is too long, so terminate it explicitly
             */
            strncpy(MPIR_attach_fifo, attach_fifo, MPIR_MAX_PATH_LENGTH - 1);
            MPIR_attach_fifo[MPIR_MAX_PATH_LENGTH - 1] = '\0';
            free(attach_fifo);
            open_fifo();
        }
        return;
    }

 launchit:
    opal_output_verbose(2, orte_debugger_base.output,
                        "%s: Spawned by a debugger",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* tell the procs they are being debugged */
    env_name = mca_base_param_environ_variable("orte", "in_parallel_debugger", NULL);

    for (i = 0; i < jdata->apps->size; i++) {
        /* the pointer array may contain empty slots - skip them */
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        opal_setenv(env_name, "1", true, &app->env);
    }
    free(env_name);

    /* check if we need to co-spawn the debugger daemons */
    if ('\0' != MPIR_executable_path[0] || NULL != orte_debugger_base.test_daemon) {
        /* can only have one debugger */
        if (NULL != orte_debugger_daemon) {
            opal_output(0, "-------------------------------------------\n"
                        "Only one debugger can be used on a job.\n"
                        "-------------------------------------------\n");
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
            return;
        }
        opal_output_verbose(2, orte_debugger_base.output,
                            "%s Cospawning debugger daemons %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            (NULL == orte_debugger_base.test_daemon) ?
                            MPIR_executable_path : orte_debugger_base.test_daemon);
        /* add debugger info to launch message */
        orte_debugger_daemon = OBJ_NEW(orte_job_t);
        /* create a jobid for these daemons - this is done solely
         * to avoid confusing the rest of the system's bookkeeping
         */
        orte_plm_base_create_jobid(orte_debugger_daemon);
        /* flag the job as being debugger daemons */
        orte_debugger_daemon->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
        /* unless directed, we do not forward output */
        if (!MPIR_forward_output) {
            orte_debugger_daemon->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
        }
        /* add it to the global job pool */
        ljob = ORTE_LOCAL_JOBID(orte_debugger_daemon->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, orte_debugger_daemon);

        /* create an app_context for the debugger daemon; the test
         * daemon, when configured, takes precedence over the path the
         * debugger placed in MPIR_executable_path
         */
        app = OBJ_NEW(orte_app_context_t);
        if (NULL != orte_debugger_base.test_daemon) {
            app->app = strdup(orte_debugger_base.test_daemon);
        } else {
            app->app = strdup((char*)MPIR_executable_path);
        }
        opal_argv_append_nosize(&app->argv, app->app);
        build_debugger_args(app);
        opal_pointer_array_add(orte_debugger_daemon->apps, app);
        orte_debugger_daemon->num_apps = 1;
    }
    return;
}