int main (int argc, char **argv) { char *input_buffer; char *input_line; char *procprog; char *proc_cgroup_hierarchy; pid_t mypid = 0; pid_t myppid = 0; struct stat statbuf; dev_t mydev = 0; ino_t myino = 0; int procuid = 0; pid_t procpid = 0; pid_t procppid = 0; pid_t kthread_ppid = 0; int procvsz = 0; int procrss = 0; int procseconds = 0; float procpcpu = 0; char procstat[8]; char procetime[MAX_INPUT_BUFFER] = { '\0' }; char *procargs; char *tmp; const char *zombie = "Z"; int resultsum = 0; /* bitmask of the filter criteria met by a process */ int found = 0; /* counter for number of lines returned in `ps` output */ int procs = 0; /* counter for number of processes meeting filter criteria */ int pos; /* number of spaces before 'args' in `ps` output */ int cols; /* number of columns in ps output */ int expected_cols = PS_COLS - 1; int warn = 0; /* number of processes in warn state */ int crit = 0; /* number of processes in crit state */ int i = 0, j = 0; int result = STATE_UNKNOWN; int ret = 0; output chld_out, chld_err; setlocale (LC_ALL, ""); bindtextdomain (PACKAGE, LOCALEDIR); textdomain (PACKAGE); setlocale(LC_NUMERIC, "POSIX"); input_buffer = malloc (MAX_INPUT_BUFFER); procprog = malloc (MAX_INPUT_BUFFER); proc_cgroup_hierarchy = malloc (MAX_INPUT_BUFFER); xasprintf (&metric_name, "PROCS"); metric = METRIC_PROCS; /* Parse extra opts if any */ argv=np_extra_opts (&argc, argv, progname); if (process_arguments (argc, argv) == ERROR) usage4 (_("Could not parse arguments")); /* find ourself */ mypid = getpid(); myppid = getppid(); if (usepid || stat_exe(mypid, &statbuf) == -1) { /* usepid might have been set by -T */ usepid = 1; } else { usepid = 0; mydev = statbuf.st_dev; myino = statbuf.st_ino; } /* Set signal handling and alarm timeout */ if (signal (SIGALRM, timeout_alarm_handler) == SIG_ERR) { die (STATE_UNKNOWN, _("Cannot catch SIGALRM")); } (void) alarm ((unsigned) timeout_interval); if (verbose >= 2) printf (_("CMD: %s\n"), PS_COMMAND); if (input_filename 
== NULL) { result = cmd_run( PS_COMMAND, &chld_out, &chld_err, 0); if (chld_err.lines > 0) { printf ("%s: %s", _("System call sent warnings to stderr"), chld_err.line[0]); exit(STATE_WARNING); } } else { result = cmd_file_read( input_filename, &chld_out, 0); } /* flush first line: j starts at 1 */ for (j = 1; j < chld_out.lines; j++) { input_line = chld_out.line[j]; if (verbose >= 3) printf ("%s", input_line); strcpy (procprog, ""); strcpy (proc_cgroup_hierarchy, ""); xasprintf (&procargs, "%s", ""); cols = sscanf (input_line, PS_FORMAT, PS_VARLIST); /* Zombie processes do not give a procprog command */ if ( cols < expected_cols && strstr(procstat, zombie) ) { cols = expected_cols; } if ( cols >= expected_cols ) { resultsum = 0; xasprintf (&procargs, "%s", input_line + pos); strip (procargs); /* Some ps return full pathname for command. This removes path */ strcpy(procprog, base_name(procprog)); /* we need to convert the elapsed time to seconds */ procseconds = convert_to_seconds(procetime); if (verbose >= 3) { printf ("proc#=%d uid=%d vsz=%d rss=%d pid=%d ppid=%d pcpu=%.2f stat=%s etime=%s prog=%s args=%s", procs, procuid, procvsz, procrss, procpid, procppid, procpcpu, procstat, procetime, procprog, procargs); if (strstr(PS_COMMAND, "cgroup") != NULL) { printf(" proc_cgroup_hierarchy=%s\n", proc_cgroup_hierarchy); } else { printf("\n"); } } /* Ignore self */ if ((usepid && mypid == procpid) || (!usepid && ((ret = stat_exe(procpid, &statbuf) != -1) && statbuf.st_dev == mydev && statbuf.st_ino == myino) || (ret == -1 && errno == ENOENT))) { if (verbose >= 3) printf("not considering - is myself or gone\n"); continue; } /* Ignore parent*/ else if (myppid == procpid) { if (verbose >= 3) printf("not considering - is parent\n"); continue; } /* filter kernel threads (childs of KTHREAD_PARENT)*/ /* TODO adapt for other OSes than GNU/Linux sorry for not doing that, but I've no other OSes to test :-( */ if (kthread_filter == 1) { /* get pid KTHREAD_PARENT */ if (kthread_ppid 
== 0 && !strcmp(procprog, KTHREAD_PARENT) ) kthread_ppid = procpid; if (kthread_ppid == procppid) { if (verbose >= 2) printf ("Ignore kernel thread: pid=%d ppid=%d prog=%s args=%s\n", procpid, procppid, procprog, procargs); continue; } } if ((options & STAT) && (strstr (statopts, procstat))) resultsum |= STAT; if ((options & ARGS) && procargs && (strstr (procargs, args) != NULL)) resultsum |= ARGS; if ((options & EREG_ARGS) && procargs && (regexec(&re_args, procargs, (size_t) 0, NULL, 0) == 0)) resultsum |= EREG_ARGS; if ((options & PROG) && procprog && (strcmp (prog, procprog) == 0)) resultsum |= PROG; if ((options & PPID) && (procppid == ppid)) resultsum |= PPID; if ((options & USER) && (procuid == uid)) resultsum |= USER; if ((options & VSZ) && (procvsz >= vsz)) resultsum |= VSZ; if ((options & RSS) && (procrss >= rss)) resultsum |= RSS; if ((options & PCPU) && (procpcpu >= pcpu)) resultsum |= PCPU; if (options & CGROUP_HIERARCHY) { if(!strncmp(proc_cgroup_hierarchy,"-", 2) && !strncmp(cgroup_hierarchy,"/", 2)) { resultsum |= CGROUP_HIERARCHY; } else { if((tmp = strstr(proc_cgroup_hierarchy,":/")) != NULL) { if(!strcmp(tmp+1,cgroup_hierarchy)) { resultsum |= CGROUP_HIERARCHY; }; }; }; }; found++; /* Next line if filters not matched */ if (!(options == resultsum || options == ALL)) continue; procs++; if (verbose >= 2) { printf ("Matched: uid=%d vsz=%d rss=%d pid=%d ppid=%d pcpu=%.2f stat=%s etime=%s prog=%s args=%s", procuid, procvsz, procrss, procpid, procppid, procpcpu, procstat, procetime, procprog, procargs); if (strstr(PS_COMMAND, "cgroup") != NULL) { printf(" cgroup_hierarchy=%s\n", cgroup_hierarchy); } else { printf("\n"); } } if (metric == METRIC_VSZ) i = get_status ((double)procvsz, procs_thresholds); else if (metric == METRIC_RSS) i = get_status ((double)procrss, procs_thresholds); /* TODO? 
float thresholds for --metric=CPU */ else if (metric == METRIC_CPU) i = get_status (procpcpu, procs_thresholds); else if (metric == METRIC_ELAPSED) i = get_status ((double)procseconds, procs_thresholds); if (metric != METRIC_PROCS) { if (i == STATE_WARNING) { warn++; xasprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), procprog); result = max_state (result, i); } if (i == STATE_CRITICAL) { crit++; xasprintf (&fails, "%s%s%s", fails, (strcmp(fails,"") ? ", " : ""), procprog); result = max_state (result, i); } } } /* This should not happen */ else if (verbose) { printf(_("Not parseable: %s"), input_buffer); } } if (found == 0) { /* no process lines parsed so return STATE_UNKNOWN */ printf (_("Unable to read output\n")); return STATE_UNKNOWN; } if ( result == STATE_UNKNOWN ) result = STATE_OK; /* Needed if procs found, but none match filter */ if ( metric == METRIC_PROCS ) { result = max_state (result, get_status ((double)procs, procs_thresholds) ); } if ( result == STATE_OK ) { printf ("%s %s: ", metric_name, _("OK")); } else if (result == STATE_WARNING) { printf ("%s %s: ", metric_name, _("WARNING")); if ( metric != METRIC_PROCS ) { printf (_("%d warn out of "), warn); } } else if (result == STATE_CRITICAL) { printf ("%s %s: ", metric_name, _("CRITICAL")); if (metric != METRIC_PROCS) { printf (_("%d crit, %d warn out of "), crit, warn); } } printf (ngettext ("%d process", "%d processes", (unsigned long) procs), procs); if (strcmp(fmt,"") != 0) { printf (_(" with %s"), fmt); } if ( verbose >= 1 && strcmp(fails,"") ) printf (" [%s]", fails); if (metric == METRIC_PROCS) printf (" | procs=%d;%s;%s;0;", procs, warning_range ? warning_range : "", critical_range ? critical_range : ""); else printf (" | procs=%d;;;0; procs_warn=%d;;;0; procs_crit=%d;;;0;", procs, warn, crit); printf ("\n"); return result; }
/*
 * Overall executable distribution using fast.
 *
 * When built with HAVE_FAST_DIST, temporarily replaces the global task list
 * with one helper task per distinct node used by the job, runs the
 * FAST_DIST_PATH program on them (the node of task 0 acts as root and is
 * given the executable and a temporary node-list file), waits for all of
 * them, then restores the original tasks and command-line settings.  On
 * success the unique executable in the configuration is repointed at the
 * copy under $TMPDIR (or /tmp).
 *
 * Returns 0 on success and 1 on any failure, including: the build has no
 * fast-dist support, there is no unique executable, stat_exe() rejects the
 * helper, or the job spans only one node.
 */
int distribute_executable(void)
{
    int ret = 1;  /* failure */

#if HAVE_FAST_DIST
    const char *fast_command = FAST_DIST_PATH;  /* from configure */
    int i;
    int slot;
    int root_node;
    int numtasks_save;
    int local_numtasks;
    tasks_t *tasks_save;
    cl_args_t cl_args_save;
    config_spec_t cs, root_cs;
    growstr_t *g, *root_g;
    int temp_fd;
    char *file_template;
    int port_num;
    FILE *fp;
    const char *exec_to_dist;
    int *usenodes;

    exec_to_dist = config_get_unique_executable();
    if (!exec_to_dist)
        return ret;

    if (!stat_exe(fast_command, 0))
        return ret;

    /* analyze nodes: mark each node that hosts at least one task */
    usenodes = Malloc(numnodes * sizeof(*usenodes));
    memset(usenodes, 0, numnodes * sizeof(*usenodes));
    local_numtasks = 0;
    for (i=0; i<numtasks; i++) {
        if (!usenodes[tasks[i].node]) {
            usenodes[tasks[i].node] = 1;
            ++local_numtasks;
        }
    }

    /* don't bother if there is only one node */
    if (local_numtasks <= 1) {
        free(usenodes);
        return ret;
    }

    /* create temporary node file */
    file_template = strsave("/tmp/mpiexec-fast-XXXXXX");
    temp_fd = mkstemp(file_template);
    /* mkstemp reports failure with -1; descriptor 0 is a valid result */
    if (temp_fd < 0)
        goto out_free;  /* nothing was created, so nothing to unlink */
    debug(1, "%s: temp node list file is %d",__func__, temp_fd);
    fp = fdopen(temp_fd, "w");
    if (!fp) {
        /* the stream never took ownership of the descriptor */
        close(temp_fd);
        goto out;
    }

    /* add nodes to the node file */
    for (i=0; i<numnodes; i++) {
        if (!usenodes[i])
            continue;
        if (fprintf(fp, "%s\n", nodes[i].name) <= 0) {
            fclose(fp);
            goto out;
        }
    }
    if (fclose(fp) != 0)
        goto out;

    /* pick a random port number between 6 and 8 thousand */
    srand(time(NULL));
    port_num = rand() % 2000 + 6000;

    /*
     * Back up the tasks structure and number of tasks as well as command
     * line args.
     */
    tasks_save = tasks;
    numtasks_save = numtasks;
    memcpy(&cl_args_save, cl_args, sizeof(*cl_args));
    root_node = tasks_save[0].node;

    /* set the fast_dist executable name */
    cs.exe = fast_command;
    root_cs.exe = cs.exe;

    /* set up the args to pass to the non-root nodes */
    g = growstr_init();
    growstr_printf(g, "-p %d", port_num);
    cs.args = g->s;
    debug(1, "%s: arg string for non root: %s", __func__, g->s);

    /* and to the root node */
    root_g = growstr_init();
    growstr_printf(root_g, "-p %d -r %s -e %s -n %s", port_num,
                   nodes[root_node].name, exec_to_dist, file_template);
    root_cs.args = root_g->s;
    debug(1, "%s: arg string for root: %s", __func__, root_g->s);

    /* build new tasks */
    cl_args->which_stdin = STDIN_NONE;
    cl_args->comm = COMM_NONE;
    tasks = Malloc(local_numtasks * sizeof(*tasks));
    numtasks = local_numtasks;

    /*
     * One helper per distinct node.  The nodes come from usenodes rather
     * than from consecutive entries of the saved task list, because several
     * saved tasks may share a node; copying tasks_save[i+1].node would then
     * repeat some nodes and leave others out even though they are listed in
     * the node file.
     *
     * Slight race condition in that the root wants to actively connect
     * to some other nodes, but it will retry a bit.  Put root last to
     * hope that there is a bit of delay in startup.
     */
    slot = 0;
    for (i=0; i<numnodes; i++) {
        if (usenodes[i] && i != root_node)
            tasks[slot++].node = i;
    }
    tasks[slot].node = root_node;  /* slot == numtasks - 1 here */

    for (i=0; i<numtasks; i++) {
        tasks[i].num_copies = 1;
        tasks[i].done = DONE_NOT_STARTED;
        /*
         * NOTE(review): assumes status is usable storage straight after
         * Malloc (e.g. an embedded array) -- TODO confirm against tasks_t.
         */
        *tasks[i].status = -1;
        tasks[i].conf = (i == numtasks - 1) ? &root_cs : &cs;
        debug(1, "%s: task %d on %d", __func__, i, tasks[i].node);
    }

    /* spawn tasks */
    start_tasks(0);
    debug(1, "%s: tasks started", __func__);

    /* wait for them to exit */
    wait_tasks();

    /* make sure everyone finished successfully */
    ret = 0;
    for (i=0; i<numtasks; i++) {
        if (tasks[i].done == DONE_NO_EXIT_STATUS)
            continue;
        if (*tasks[i].status != 0) {
            ret = 1;
            break;
        }
    }
    debug(1, "%s: done, ret = %d", __func__, ret);

    /* put back original tasks structures */
    free(tasks);
    tasks = tasks_save;
    numtasks = numtasks_save;
    memcpy(cl_args, &cl_args_save, sizeof(*cl_args));
    growstr_free(g);
    growstr_free(root_g);

    /*
     * Update executable in old config structure to point to new /tmp exec,
     * using the same algorithm as fast_dist.  It is not deleted upon
     * completion but relies on $TMPDIR being deleted when PBS cleans up the
     * job or normal /tmp cleaning.
     */
    if (ret == 0) {
        const char *cp, *base;
        growstr_t *h;

        h = growstr_init();
        cp = getenv("TMPDIR");
        if (!cp || !*cp)
            cp = "/tmp";
        growstr_append(h, cp);
        /* base ends up just past the last '/' of the executable path */
        for (cp=base=exec_to_dist; *cp; cp++)
            if (*cp == '/')
                base = cp+1;
        growstr_append(h, "/");
        growstr_append(h, base);
        config_set_unique_executable(strsave(h->s));
        growstr_free(h);
    }

  out:
    unlink(file_template);
  out_free:
    free(file_template);
    free(usenodes);
#endif  /* HAVE_FAST_DIST */

    return ret;
}