示例#1
0
文件: acme.c 项目: Nurb432/plan9front
/*
 * Handle a note delivered to the program.
 *
 * Kills the child procs first.  Unless a dump is already under way, or
 * the note is "kill" or "exit", the main process then dumps the row
 * state via rowdump().  If msg begins with one of the strings in
 * oknotes the program exits through threadexitsall(msg); any other
 * note is printed and the program aborts.
 */
int
shutdown(void*, char *msg)
{
	int n, quitting;

	killprocs();

	/* "kill" and "exit" never trigger a dump */
	quitting = strcmp(msg, "kill")==0 || strcmp(msg, "exit")==0;
	if(!dumping && !quitting && getpid()==mainpid){
		dumping = TRUE;	/* set before dumping so the dump is not re-entered */
		rowdump(&row, nil);
	}

	/* prefix match against the nil-terminated list of expected notes */
	n = 0;
	while(oknotes[n]){
		if(strncmp(oknotes[n], msg, strlen(oknotes[n])) == 0)
			threadexitsall(msg);
		n++;
	}

	/* unexpected note: report it and abort */
	print("acme: %s\n", msg);
	abort();
	return 0;
}
示例#2
0
文件: acme.c 项目: UNGLinux/9base
/*
 * Program entry point (libthread main).
 *
 * Parses the command line, reads configuration from the environment,
 * opens the display, creates the channels and input devices, builds
 * the initial row of columns (from a load file if one is given and
 * loads successfully, otherwise from the file arguments), starts the
 * worker threads and then blocks until something is sent on cexit.
 *
 * Options handled below:
 *	-D		turn on all libthread debugging bits
 *	-a		set globalautoindent
 *	-b		set bartflag
 *	-c ncol		number of columns; must be a positive integer
 *	-f font		fontnames[0]
 *	-F font		fontnames[1]
 *	-l file		load file
 *	-m mtpt		mount point
 *	-r		swap the scroll buttons
 *	-W winsize	window size
 */
void
threadmain(int argc, char *argv[])
{
	int i;
	char *p, *loadfile;
	Column *c;
	int ncol;
	Display *d;

	rfork(RFENVG|RFNAMEG);

	ncol = -1;	/* -1 means "not given on the command line" */

	loadfile = nil;
	ARGBEGIN{
	case 'D':
		{extern int _threaddebuglevel;
		_threaddebuglevel = ~0;
		}
		break;
	case 'a':
		globalautoindent = TRUE;
		break;
	case 'b':
		bartflag = TRUE;
		break;
	case 'c':
		p = ARGF();
		if(p == nil)
			goto Usage;
		ncol = atoi(p);
		if(ncol <= 0)
			goto Usage;
		break;
	case 'f':
		fontnames[0] = ARGF();
		if(fontnames[0] == nil)
			goto Usage;
		break;
	case 'F':
		fontnames[1] = ARGF();
		if(fontnames[1] == nil)
			goto Usage;
		break;
	case 'l':
		loadfile = ARGF();
		if(loadfile == nil)
			goto Usage;
		break;
	case 'm':
		mtpt = ARGF();
		if(mtpt == nil)
			goto Usage;
		break;
	case 'r':
		swapscrollbuttons = TRUE;
		break;
	case 'W':
		winsize = ARGF();
		if(winsize == nil)
			goto Usage;
		break;
	default:
	Usage:
		fprint(2, "usage: acme -a -c ncol -f fontname -F fixedwidthfontname -l loadfile -W winsize\n");
		threadexitsall("usage");
	}ARGEND

	/* take private copies of the font names */
	fontnames[0] = estrdup(fontnames[0]);
	fontnames[1] = estrdup(fontnames[1]);

	quotefmtinstall();
	fmtinstall('t', timefmt);

	/* configuration taken from the environment */
	cputype = getenv("cputype");
	objtype = getenv("objtype");
	home = getenv("HOME");
	acmeshell = getenv("acmeshell");
	if(acmeshell && *acmeshell == '\0')
		acmeshell = nil;	/* treat an empty value as unset */
	p = getenv("tabstop");
	if(p != nil){
		maxtab = strtoul(p, nil, 0);
		free(p);
	}
	if(maxtab == 0)
		maxtab = 4;	/* default when unset or zero */
	if(loadfile)
		rowloadfonts(loadfile);
	putenv("font", fontnames[0]);
	snarffd = open("/dev/snarf", OREAD|OCEXEC);
/*
	if(cputype){
		sprint(buf, "/acme/bin/%s", cputype);
		bind(buf, "/bin", MBEFORE);
	}
	bind("/acme/bin", "/bin", MBEFORE);
*/
	getwd(wdir, sizeof wdir);

/*
	if(geninitdraw(nil, derror, fontnames[0], "acme", nil, Refnone) < 0){
		fprint(2, "acme: can't open display: %r\n");
		threadexitsall("geninitdraw");
	}
*/
	if(initdraw(derror, fontnames[0], "acme") < 0){
		fprint(2, "acme: can't open display: %r\n");
		threadexitsall("initdraw");
	}

	d = display;
	font = d->defaultfont;
/*assert(font); */

	/* seed the font cache with the display's default font */
	reffont.f = font;
	reffonts[0] = &reffont;
	incref(&reffont.ref);	/* one to hold up 'font' variable */
	incref(&reffont.ref);	/* one to hold up reffonts[0] */
	fontcache = emalloc(sizeof(Reffont*));
	nfontcache = 1;
	fontcache[0] = &reffont;

	iconinit();
	timerinit();
	rxinit();

	cwait = threadwaitchan();
	ccommand = chancreate(sizeof(Command**), 0);
	ckill = chancreate(sizeof(Rune*), 0);
	cxfidalloc = chancreate(sizeof(Xfid*), 0);
	cxfidfree = chancreate(sizeof(Xfid*), 0);
	cnewwindow = chancreate(sizeof(Channel*), 0);
	cerr = chancreate(sizeof(char*), 0);
	cedit = chancreate(sizeof(int), 0);
	cexit = chancreate(sizeof(int), 0);
	cwarn = chancreate(sizeof(void*), 1);
	/*
	 * Check every channel created above, including cnewwindow and
	 * cedit: all of them are handed to chansetname just below.
	 */
	if(cwait==nil || ccommand==nil || ckill==nil || cxfidalloc==nil || cxfidfree==nil
	|| cnewwindow==nil || cerr==nil || cedit==nil || cexit==nil || cwarn==nil){
		fprint(2, "acme: can't create initial channels: %r\n");
		threadexitsall("channels");
	}
	chansetname(ccommand, "ccommand");
	chansetname(ckill, "ckill");
	chansetname(cxfidalloc, "cxfidalloc");
	chansetname(cxfidfree, "cxfidfree");
	chansetname(cnewwindow, "cnewwindow");
	chansetname(cerr, "cerr");
	chansetname(cedit, "cedit");
	chansetname(cexit, "cexit");
	chansetname(cwarn, "cwarn");

	mousectl = initmouse(nil, screen);
	if(mousectl == nil){
		fprint(2, "acme: can't initialize mouse: %r\n");
		threadexitsall("mouse");
	}
	mouse = &mousectl->m;
	keyboardctl = initkeyboard(nil);
	if(keyboardctl == nil){
		fprint(2, "acme: can't initialize keyboard: %r\n");
		threadexitsall("keyboard");
	}
	mainpid = getpid();	/* shutdown() compares against this pid */
	startplumbing();
/*
	plumbeditfd = plumbopen("edit", OREAD|OCEXEC);
	if(plumbeditfd < 0)
		fprint(2, "acme: can't initialize plumber: %r\n");
	else{
		cplumb = chancreate(sizeof(Plumbmsg*), 0);
		threadcreate(plumbproc, nil, STACK);
	}
	plumbsendfd = plumbopen("send", OWRITE|OCEXEC);
*/

	fsysinit();

	/* number of file arguments placed in each column */
	#define	WPERCOL	8
	disk = diskinit();
	/* build a fresh layout when there is no load file or it fails to load */
	if(!loadfile || !rowload(&row, loadfile, TRUE)){
		rowinit(&row, screen->clipr);
		if(ncol < 0){
			/* no -c: at least two columns, more if there are many files */
			if(argc == 0)
				ncol = 2;
			else{
				ncol = (argc+(WPERCOL-1))/WPERCOL;
				if(ncol < 2)
					ncol = 2;
			}
		}
		if(ncol == 0)
			ncol = 2;
		for(i=0; i<ncol; i++){
			c = rowadd(&row, nil, -1);
			/* only failing to create the very first column is fatal */
			if(c==nil && i==0)
				error("initializing columns");
		}
		c = row.col[row.ncol-1];	/* last column */
		if(argc == 0)
			readfile(c, wdir);
		else
			for(i=0; i<argc; i++){
				/*
				 * Files named "guide", and files beyond the
				 * available columns, go in the last column;
				 * the rest fill columns WPERCOL at a time.
				 */
				p = utfrrune(argv[i], '/');
				if((p!=nil && strcmp(p, "/guide")==0) || i/WPERCOL>=row.ncol)
					readfile(c, argv[i]);
				else
					readfile(row.col[i/WPERCOL], argv[i]);
			}
	}
	flushimage(display, 1);

	acmeerrorinit();
	threadcreate(keyboardthread, nil, STACK);
	threadcreate(mousethread, nil, STACK);
	threadcreate(waitthread, nil, STACK);
	threadcreate(xfidallocthread, nil, STACK);
	threadcreate(newwindowthread, nil, STACK);
/*	threadcreate(shutdownthread, nil, STACK); */
	threadnotify(shutdown, 1);
	/* block here until an exit is signalled on cexit */
	recvul(cexit);
	killprocs();
	threadexitsall(nil);
}
示例#3
0
/*
 * State-machine callback invoked when a process error state is reported.
 *
 * cbdata is the orte_state_caddy_t carrying the name and state of the
 * affected process; fd and args are not used.  Depending on the state the
 * handler ignores the event, terminates this daemon, or packs an
 * ORTE_PLM_UPDATE_PROC_STATE alert and sends it to the HNP.
 *
 * Every exit path leaves through the cleanup label so the caddy is always
 * released, and an alert buffer that was allocated but could not be packed
 * or sent is released before leaving.
 */
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;

    orte_proc_t *child, *ptr;
    opal_buffer_t *alert;
    orte_plm_cmd_flag_t cmd;
    int rc=ORTE_SUCCESS;
    int i;

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_orted:proc_errors process %s error state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors finalizing - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    /* if this is a heartbeat failure, let the HNP handle it */
    if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors heartbeat failed - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    /* if this was a failed comm, then see if it was to our
     * lifeline
     */
    if (ORTE_PROC_STATE_LIFELINE_LOST == state ||
        ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:orted lifeline lost - exiting",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* set our exit status */
        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
        /* kill our children */
        killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
        /* terminate - our routed children will see
         * us leave and automatically die
         */
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto cleanup;
    }

    /* get the job object */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        /* must already be complete */
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors NULL jdata - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted:proc_errors comm_failed to self - ignoring error",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto cleanup;
        }
        /* was it a daemon? */
        if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
            /* nope - we can't seem to trust that we will catch the waitpid
             * in this situation, so push this over to be handled as if
             * it were a waitpid trigger so we don't create a bunch of
             * duplicate code */
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted:proc_errors comm_failed to non-daemon - handling as waitpid",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            /* get the proc_t */
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                goto cleanup;
            }
            /* leave the exit code alone - process this as a waitpid */
            odls_base_default_wait_local_proc(child, NULL);
            goto cleanup;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default:orted daemon %s exited",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* if we are using static ports, then it is possible that the HNP
         * will not see this termination. So if the HNP didn't order us
         * to terminate, then we should ensure it knows */
        if (orte_static_ports && !orte_orteds_term_ordered) {
            /* send an alert to the HNP */
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* get the proc_t */
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                /* the alert will never be sent - don't leak it */
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* set the exit code to reflect the problem */
            child->exit_code = ORTE_ERR_COMM_FAILURE;
            /* pack only the data for this daemon - have to start with the jobid
             * so the receiver can unpack it correctly
             */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }

            /* now pack the daemon's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting lost connection to daemon %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(proc)));
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
            /* continue on */
            goto cleanup;
        }

        if (orte_orteds_term_ordered) {
            /* are any of my children still alive */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                        OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                             "%s errmgr:default:orted[%s(%d)] proc %s is alive",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                             __FILE__, __LINE__,
                                             ORTE_NAME_PRINT(&child->name)));
                        goto cleanup;
                    }
                }
            }
            /* if all my routes and children are gone, then terminate
               ourselves nicely (i.e., this is a normal termination) */
            if (0 == orte_routed.num_routes()) {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted all routes gone - exiting",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            } else {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted not exiting, num_routes() == %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (int)orte_routed.num_routes()));
            }
        }
        /* if not, then we can continue */
        goto cleanup;
    }

    if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto cleanup;
    }
    /* if this is not a local proc for this job, we can
     * ignore this call
     */
    if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_LOCAL)) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors proc is not local - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_orted got state %s for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc)));

    if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
        /* update the state */
        child->state = state;
        /* report this as abnormal termination to the HNP, unless we already have
         * done so for this job */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* pack only the data for this proc - have to start with the jobid
             * so the receiver can unpack it correctly
             */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }

            /* now pack the child's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name),
                                 jdata->num_local_procs));
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
        }
        /* if the proc has terminated, notify the state machine */
        if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
        goto cleanup;
    }

    if (ORTE_PROC_STATE_FAILED_TO_START == state ||
        ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) {
        /* update the proc state */
        child->state = state;
        /* count the proc as having "terminated" */
        jdata->num_terminated++;
        /* leave the error report in this case to the
         * state machine, which will receive notice
         * when all local procs have attempted to start
         * so that we send a consolidated error report
         * back to the HNP
         */
        goto cleanup;
    }

    if (ORTE_PROC_STATE_TERMINATED < state) {
        /* if we were ordered to terminate, see if
         * any of our routes or local children remain alive - if not, then
         * terminate ourselves. */
        if (orte_orteds_term_ordered) {
            /* mark the child as no longer alive and update the counters, if necessary.
             * we have to do this here as we aren't going to send this to the state
             * machine, and we want to keep the bookkeeping accurate just in case */
            if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
            }
            if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
                ORTE_FLAG_SET(child, ORTE_PROC_FLAG_RECORDED);
                jdata->num_terminated++;
            }
            /* NOTE: this scan reuses 'child'; on the keep_going path it
             * refers to the first live local child found here */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                        goto keep_going;
                    }
                }
            }
            /* if all my routes and children are gone, then terminate
               ourselves nicely (i.e., this is a normal termination) */
            if (0 == orte_routed.num_routes()) {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted all routes gone - exiting",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            }
            /* no need to alert the HNP - we are already on our way out */
            goto cleanup;
        }

    keep_going:
        /* if the job hasn't completed and the state is abnormally
         * terminated, then we need to alert the HNP right away - but
         * only do this once!
         */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* pack only the data for this proc - have to start with the jobid
             * so the receiver can unpack it correctly
             */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            child->state = state;
            /* now pack the child's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name),
                                 jdata->num_local_procs));
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
                /* release the unsent buffer, as the other send sites do */
                OBJ_RELEASE(alert);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
        }
        /* if the proc has terminated, notify the state machine */
        if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
        goto cleanup;
    }

    /* only other state is terminated - see if anyone is left alive */
    if (!any_live_children(proc->jobid)) {
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
            goto cleanup;
        }
        /* pack the data for the job */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
            goto cleanup;
        }

        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted reporting all procs in %s terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));

        /* remove all of this job's children from the global list */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (jdata->jobid == ptr->name.jobid) {
                opal_pointer_array_set_item(orte_local_children, i, NULL);
                OBJ_RELEASE(ptr);
            }
        }

        /* ensure the job's local session directory tree is removed */
        orte_session_dir_cleanup(jdata->jobid);

        /* remove this job from our local job data since it is complete */
        opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
        OBJ_RELEASE(jdata);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM,
                                              orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
        }
        /* fall through to cleanup so the caddy is released on this path too */
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
示例#4
0
/*
 * State-machine callback invoked when a job error state is reported.
 *
 * cbdata is the orte_state_caddy_t carrying the job object and its new
 * state; fd and args are not used.  A caddy without a job object is
 * treated as an unrecoverable error and forces an exit.  Otherwise the
 * job's state is recorded and, unless the state is handled locally
 * (comm failure) or left to the HNP (heartbeat failure), an
 * ORTE_PLM_UPDATE_PROC_STATE alert describing the job is sent to the HNP.
 *
 * The caddy is released on every path, including the early exit taken
 * while ORTE is finalizing.
 */
static void job_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_job_state_t jobstate;
    int rc;
    orte_plm_cmd_flag_t cmd;
    opal_buffer_t *alert;

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        goto cleanup;
    }

    /* if the jdata is NULL, then we abort as this
     * is reporting an unrecoverable error
     */
    if (NULL == caddy->jdata) {
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT);
        goto cleanup;
    }

    /* update the state */
    jdata = caddy->jdata;
    jobstate = caddy->job_state;
    jdata->state = jobstate;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_orted: job %s reported error state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(jobstate)));

    switch (jobstate) {
    case ORTE_JOB_STATE_FAILED_TO_START:
        failed_start(jdata);
        break;
    case ORTE_JOB_STATE_COMM_FAILED:
        /* kill all local procs */
        killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
        /* order termination */
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto cleanup;
    case ORTE_JOB_STATE_HEARTBEAT_FAILED:
        /* let the HNP handle this */
        goto cleanup;

    default:
        break;
    }

    /* every remaining state is reported to the HNP */
    alert = OBJ_NEW(opal_buffer_t);
    /* pack update state command */
    cmd = ORTE_PLM_UPDATE_PROC_STATE;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
        goto cleanup;
    }
    /* pack the job info */
    if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
        goto cleanup;
    }
    /* send it */
    if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                          ORTE_RML_TAG_PLM,
                                          orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(alert);
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
/*
 * State-machine callback invoked when a process is reported in an error
 * (or otherwise reportable) state.
 *
 * cbdata is an orte_state_caddy_t carrying the process name and its new
 * state.  Depending on the state this routine may kill local procs, order
 * this daemon to terminate, or pack a message and send it to the HNP.
 *
 * Ownership: this routine owns the caddy and releases it on every exit
 * path (all exits funnel through the cleanup label).  Any alert buffer
 * that could not be handed to the RML is released here as well.
 *
 * fd and args are unused; they are part of the event callback signature.
 */
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    /* NOTE: proc points into the caddy - do not use it after cleanup */
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;

    orte_proc_t *child, *ptr;
    opal_buffer_t *alert;
    orte_plm_cmd_flag_t cmd;
    int rc=ORTE_SUCCESS;
    orte_vpid_t null=ORTE_VPID_INVALID;
    orte_ns_cmp_bitmask_t mask=ORTE_NS_CMP_ALL;
    int i;

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s errmgr:default_orted:proc_errors process %s error state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /* if this is a heartbeat failure, let the HNP handle it */
    if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        goto cleanup;
    }

    /* if this was a failed comm, then see if it was to our
     * lifeline
     */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
            goto cleanup;
        }
        /* was it a daemon? */
        if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
            /* nope - ignore */
            goto cleanup;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                             "%s errmgr:default:orted daemon %s exited",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* see if this was a lifeline */
        if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                 "%s errmgr:orted daemon %s was a lifeline - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(proc)));
            /* kill our children */
            killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
            /* terminate - our routed children will see
             * us leave and automatically die
             */
            ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            goto cleanup;
        }
        /* are any of my children still alive */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                if (child->alive && child->state < ORTE_PROC_STATE_UNTERMINATED) {
                    goto cleanup;
                }
            }
        }
        /* if all my routes and children are gone, then terminate
           ourselves nicely (i.e., this is a normal termination) */
        if (0 == orte_routed.num_routes()) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                 "%s errmgr:default:orted all routes gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_TERMINATE(0);
        } else {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                                 "%s errmgr:default:orted not exiting, num_routes() == %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (int)orte_routed.num_routes()));
        }
        /* if not, then we can continue */
        goto cleanup;
    }

    /* get the job object */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        /* must already be complete */
        goto cleanup;
    }

    /* if there are no local procs for this job, we can
     * ignore this call
     */
    if (0 == jdata->num_local_procs) {
        goto cleanup;
    }

    /* find this proc in the local children - on success, i is left
     * holding the child's index in orte_local_children, which the
     * removal code below relies on
     */
    child = NULL;
    for (i=0; i < orte_local_children->size; i++) {
        if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
            continue;
        }
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &ptr->name, proc)) {
            child = ptr;
            break;
        }
    }
    if (NULL == child) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s errmgr:default_orted got state %s for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc)));
 
    if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) {
        child->state = state;
        /* Decrement the number of local procs */
        jdata->num_local_procs--;
        /* kill this proc */
        killprocs(proc->jobid, proc->vpid);
        goto cleanup;
    }

    if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
        if (!orte_abort_non_zero_exit) {
            /* leave the child in orte_local_children so we can
             * later send the state info after full job termination
             */
            child->state = state;
            child->waitpid_recvd = true;
            if (child->iof_complete) {
                /* the proc has terminated */
                child->alive = false;
                /* Clean up the session directory as if we were the process
                 * itself.  This covers the case where the process died abnormally
                 * and didn't cleanup its own session directory.
                 */
                orte_session_dir_finalize(&child->name);
                /* track job status */
                jdata->num_terminated++;
            }
            /* treat this as normal termination */
            goto REPORT_STATE;
        }
        /* report this as abnormal termination to the HNP */
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
            goto cleanup;
        }
        /* pack only the data for this proc - have to start with the jobid
         * so the receiver can unpack it correctly
         */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
            goto cleanup;
        }

        child->state = state;
        /* now pack the child's info */
        if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
            goto cleanup;
        }
        /* remove the child from our local array as it is no longer alive */
        opal_pointer_array_set_item(orte_local_children, i, NULL);
        /* Decrement the number of local procs */
        jdata->num_local_procs--;

        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&child->name),
                             jdata->num_local_procs));
        
        /* release the child object */
        OBJ_RELEASE(child);

        /* send it - on failure the buffer was not handed off, so
         * it is still ours to release
         */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0,
                                              orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
        }
        goto cleanup;
    }

    if (ORTE_PROC_STATE_FAILED_TO_START == state ||
        ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) {
        /* update the proc state */
        child->state = state;
        /* count the proc as having "terminated" */
        jdata->num_terminated++;
        /* leave the error report in this case to the
         * state machine, which will receive notice
         * when all local procs have attempted to start
         * so that we send a consolidated error report
         * back to the HNP
         */
        goto cleanup;
    }

    if (ORTE_PROC_STATE_TERMINATED < state) {
        /* if the job hasn't completed and the state is abnormally
         * terminated, then we need to alert the HNP right away
         */
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
            goto cleanup;
        }
        /* pack only the data for this proc - have to start with the jobid
         * so the receiver can unpack it correctly
         */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
            goto cleanup;
        }

        child->state = state;
        /* now pack the child's info */
        if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
            goto cleanup;
        }
        /* remove the child from our local array as it is no longer alive */
        opal_pointer_array_set_item(orte_local_children, i, NULL);
        /* Decrement the number of local procs */
        jdata->num_local_procs--;

        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&child->name),
                             jdata->num_local_procs));
        
        /* release the child object */
        OBJ_RELEASE(child);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0,
                                              orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
        }
        goto cleanup;
    }

 REPORT_STATE:
    if (ORTE_PROC_STATE_REGISTERED == state) {
        /* see if everyone in this job has registered */
        if (all_children_registered(proc->jobid)) {
            /* once everyone registers, send their contact info to
             * the HNP so it is available to debuggers and anyone
             * else that needs it
             */

            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                 "%s errmgr:default_orted: sending contact info to HNP",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            
            alert = OBJ_NEW(opal_buffer_t);
            /* pack init routes command */
            cmd = ORTE_PLM_INIT_ROUTES_CMD;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* pack the jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* pack all the local child vpids */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    continue;
                }
                if (ptr->name.jobid == proc->jobid) {
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ptr->name.vpid, 1, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        OBJ_RELEASE(alert);
                        goto cleanup;
                    }
                }
            }
            /* pack an invalid marker */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* add in contact info for all procs in the job */
            if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) {
                ORTE_ERROR_LOG(rc);
                /* alert came from OBJ_NEW, so it must be released
                 * rather than destructed in place
                 */
                OBJ_RELEASE(alert);
                goto cleanup;
            }
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM, 0,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
            }
        }        
        goto cleanup;
    }

    /* only other state is terminated - see if anyone is left alive */
    if (!any_live_children(proc->jobid)) {
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
            goto cleanup;
        }
        /* pack the data for the job */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
            goto cleanup;
        }

        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:default_orted reporting all procs in %s terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));
        
        /* remove all of this job's children from the global list - do not lock
         * the thread as we are already locked
         */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (jdata->jobid == ptr->name.jobid) {
                opal_pointer_array_set_item(orte_local_children, i, NULL);
                OBJ_RELEASE(ptr);
            }
        }

        /* ensure the job's local session directory tree is removed */
        orte_session_dir_cleanup(jdata->jobid);

        /* remove this job from our local job data since it is complete;
         * jdata must not be touched after this release
         */
        opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
        OBJ_RELEASE(jdata);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM, 0,
                                              orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(alert);
        }
        goto cleanup;
    }

 cleanup:
    /* single exit point: the caddy is always released here */
    OBJ_RELEASE(caddy);
}
示例#6
0
文件: acme.c 项目: Nurb432/plan9front
/*
 * Main thread of acme.
 *
 * Parses the command line, resolves the two font names, opens the
 * display, creates the inter-thread channels, initializes mouse,
 * keyboard, plumber and the file server, builds the initial row of
 * columns (from a load file or from the file arguments), starts the
 * service threads and then waits on cexit before shutting down.
 *
 * argc/argv are consumed by ARGBEGIN/ARGEND; whatever remains is treated
 * as a list of files to open.
 */
void
threadmain(int argc, char *argv[])
{
	int i;
	char *p, *loadfile;
	char buf[256];
	Column *c;
	int ncol;
	Display *d;

	rfork(RFENVG|RFNAMEG);

	/* -1 means "no -c flag given"; a default is chosen further down */
	ncol = -1;

	loadfile = nil;
	ARGBEGIN{
	case 'a':
		globalautoindent = TRUE;
		break;
	case 'b':
		bartflag = TRUE;
		break;
	case 'c':
		p = ARGF();
		if(p == nil)
			goto Usage;
		ncol = atoi(p);
		if(ncol <= 0)
			goto Usage;
		break;
	case 'f':
		fontnames[0] = ARGF();
		if(fontnames[0] == nil)
			goto Usage;
		break;
	case 'F':
		fontnames[1] = ARGF();
		if(fontnames[1] == nil)
			goto Usage;
		break;
	case 'l':
		loadfile = ARGF();
		if(loadfile == nil)
			goto Usage;
		break;
	default:
	Usage:
		fprint(2, "usage: acme [-ab] [-c ncol] [-f font] [-F fixedfont] [-l loadfile | file...]\n");
		exits("usage");
	}ARGEND

	/* variable font: -f flag, then $font, then a built-in default */
	if(fontnames[0] == nil)
		fontnames[0] = getenv("font");
	if(fontnames[0] == nil)
		fontnames[0] = "/lib/font/bit/vga/unicode.font";
	if(access(fontnames[0], 0) < 0){
		fprint(2, "acme: can't access %s: %r\n", fontnames[0]);
		exits("font open");
	}
	/* fixed font falls back to the variable font */
	if(fontnames[1] == nil)
		fontnames[1] = fontnames[0];
	/* keep private copies of both names */
	fontnames[0] = estrdup(fontnames[0]);
	fontnames[1] = estrdup(fontnames[1]);

	quotefmtinstall();
	cputype = getenv("cputype");
	objtype = getenv("objtype");
	home = getenv("home");
	p = getenv("tabstop");
	if(p != nil){
		maxtab = strtoul(p, nil, 0);
		free(p);
	}
	if(maxtab == 0)
		maxtab = 4; 
	if(loadfile)
		rowloadfonts(loadfile);
	putenv("font", fontnames[0]);
	snarffd = open("/dev/snarf", OREAD|OCEXEC);
	if(cputype){
		/* cputype comes from the environment: bound the write into buf */
		snprint(buf, sizeof buf, "/acme/bin/%s", cputype);
		bind(buf, "/bin", MBEFORE);
	}
	bind("/acme/bin", "/bin", MBEFORE);
	getwd(wdir, sizeof wdir);

	if(geninitdraw(nil, derror, fontnames[0], "acme", nil, Refnone) < 0){
		fprint(2, "acme: can't open display: %r\n");
		exits("geninitdraw");
	}
	d = display;
	font = d->defaultfont;

	reffont.f = font;
	reffonts[0] = &reffont;
	incref(&reffont);	/* one to hold up 'font' variable */
	incref(&reffont);	/* one to hold up reffonts[0] */
	fontcache = emalloc(sizeof(Reffont*));
	nfontcache = 1;
	fontcache[0] = &reffont;

	iconinit();
	timerinit();
	rxinit();

	cwait = threadwaitchan();
	ccommand = chancreate(sizeof(Command**), 0);
	ckill = chancreate(sizeof(Rune*), 0);
	cxfidalloc = chancreate(sizeof(Xfid*), 0);
	cxfidfree = chancreate(sizeof(Xfid*), 0);
	cnewwindow = chancreate(sizeof(Channel*), 0);
	cerr = chancreate(sizeof(char*), 0);
	cedit = chancreate(sizeof(int), 0);
	cexit = chancreate(sizeof(int), 0);
	cwarn = chancreate(sizeof(void*), 1);
	/* every channel created above must exist before any thread starts */
	if(cwait==nil || ccommand==nil || ckill==nil || cxfidalloc==nil || cxfidfree==nil || cnewwindow==nil || cerr==nil || cedit==nil || cexit==nil || cwarn==nil){
		fprint(2, "acme: can't create initial channels: %r\n");
		threadexitsall("channels");
	}

	mousectl = initmouse(nil, screen);
	if(mousectl == nil){
		fprint(2, "acme: can't initialize mouse: %r\n");
		threadexitsall("mouse");
	}
	mouse = mousectl;
	keyboardctl = initkeyboard(nil);
	if(keyboardctl == nil){
		fprint(2, "acme: can't initialize keyboard: %r\n");
		threadexitsall("keyboard");
	}
	mainpid = getpid();
	/* the plumber is optional: only start plumbproc if "edit" opened */
	plumbeditfd = plumbopen("edit", OREAD|OCEXEC);
	if(plumbeditfd >= 0){
		cplumb = chancreate(sizeof(Plumbmsg*), 0);
		proccreate(plumbproc, nil, STACK);
	}
	plumbsendfd = plumbopen("send", OWRITE|OCEXEC);

	fsysinit();

	/* number of file arguments placed in each column by default */
	#define	WPERCOL	8
	disk = diskinit();
	/* build a fresh layout unless a load file was given and loaded */
	if(!loadfile || !rowload(&row, loadfile, TRUE)){
		rowinit(&row, screen->clipr);
		if(ncol < 0){
			if(argc == 0)
				ncol = 2;
			else{
				/* enough columns for WPERCOL files each, minimum 2 */
				ncol = (argc+(WPERCOL-1))/WPERCOL;
				if(ncol < 2)
					ncol = 2;
			}
		}
		/* defensive: -c rejects values <= 0, so this should not trigger */
		if(ncol == 0)
			ncol = 2;
		for(i=0; i<ncol; i++){
			c = rowadd(&row, nil, -1);
			/* only failing to create the very first column is fatal */
			if(c==nil && i==0)
				error("initializing columns");
		}
		c = row.col[row.ncol-1];
		if(argc == 0)
			readfile(c, wdir);
		else
			for(i=0; i<argc; i++){
				p = utfrrune(argv[i], '/');
				/* guide files, and files beyond the columns that exist, go in the last column */
				if((p!=nil && strcmp(p, "/guide")==0) || i/WPERCOL>=row.ncol)
					readfile(c, argv[i]);
				else
					readfile(row.col[i/WPERCOL], argv[i]);
			}
	}
	flushimage(display, 1);

	acmeerrorinit();
	threadcreate(keyboardthread, nil, STACK);
	threadcreate(mousethread, nil, STACK);
	threadcreate(waitthread, nil, STACK);
	threadcreate(xfidallocthread, nil, STACK);
	threadcreate(newwindowthread, nil, STACK);

	threadnotify(shutdown, 1);
	/* wait here until something sends on cexit, then tear down */
	recvul(cexit);
	killprocs();
	threadexitsall(nil);
}
示例#7
0
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    opal_list_item_t *item, *next;
    orte_odls_job_t *jobdat = NULL;
    orte_odls_child_t *child;
    opal_buffer_t *alert;
    orte_plm_cmd_flag_t cmd;
    int rc=ORTE_SUCCESS;
    orte_vpid_t null=ORTE_VPID_INVALID;
    orte_ns_cmp_bitmask_t mask;

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                "errmgr:default_orted:update_state() %s) "
                "------- %s state updated for process %s",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                ((NULL == proc) ? "App. Process" : 
                 (proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
                (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));

    /* if this is a heartbeat failure, let the HNP handle it */
    if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
        ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        return ORTE_SUCCESS;
    }

    /***   UPDATE COMMAND FOR A JOB   ***/
    if (NULL == proc) {
        /* this is an update for an entire job */
        if (ORTE_JOBID_INVALID == job) {
            /* whatever happened, we don't know what job
             * it happened to
             */
            orte_show_help("help-orte-errmgr.txt", "errmgr:unknown-job-error",
                           true, orte_job_state_to_str(jobstate));
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* pack the "invalid" jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &job, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
            return rc;
        }

        /* lookup the local jobdat for this job */
        jobdat = NULL;
        for (item = opal_list_get_first(&orte_local_jobdata);
             item != opal_list_get_end(&orte_local_jobdata);
             item = opal_list_get_next(item)) {
            jobdat = (orte_odls_job_t*)item;

            /* is this the specified job? */
            if (jobdat->jobid == job) {
                break;
            }
        }
        if (NULL == jobdat) {
            return ORTE_ERR_NOT_FOUND;
        }

        switch (jobstate) {
        case ORTE_JOB_STATE_FAILED_TO_START:
            failed_start(jobdat, exit_code);
            break;
        case ORTE_JOB_STATE_RUNNING:
            /* update all local child states */
            update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING);
            break;
        case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
            /* update all procs in job */
            update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            /* order all local procs for this job to be killed */
            killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
        case ORTE_JOB_STATE_COMM_FAILED:
            /* kill all local procs */
            killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
            /* tell the caller we can't recover */
            return ORTE_ERR_UNRECOVERABLE;
            break;
        case ORTE_JOB_STATE_HEARTBEAT_FAILED:
            /* let the HNP handle this */
            return ORTE_SUCCESS;
            break;

        default:
            break;
        }
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto FINAL_CLEANUP;
        }
        /* pack the job info */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) {
            ORTE_ERROR_LOG(rc);
        }
        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        return rc;
    }

    /* if this was a failed comm, then see if it was to our
     * lifeline
     */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
            return ORTE_SUCCESS;
        }
        /* was it a daemon? */
        if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
            /* nope - ignore */
            return ORTE_SUCCESS;
        }
        /* see if this was a lifeline */
        if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
            /* kill our children */
            killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
            /* terminate - our routed children will see
             * us leave and automatically die
             */
            orte_quit();
        }
        /* if not, then indicate we can continue */
        return ORTE_SUCCESS;
    }

    /* lookup the local jobdat for this job */
    jobdat = NULL;
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        jobdat = (orte_odls_job_t*)item;

        /* is this the specified job? */
        if (jobdat->jobid == proc->jobid) {
            break;
        }
    }
    if (NULL == jobdat) {
        /* must already be complete */
        return ORTE_SUCCESS;
    }

    /* if there are no local procs for this job, we can
     * ignore this call
     */
    if (0 == jobdat->num_local_procs) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                         "%s errmgr:default_orted got state %s for proc %s pid %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc), pid));
 
    /***  UPDATE COMMAND FOR A SPECIFIC PROCESS ***/
    if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) {
        /* find this proc in the local children */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = opal_list_get_next(item)) {
            child = (orte_odls_child_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
                if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                    child->state = state;
                    child->exit_code = exit_code;
                    /* Decrement the number of local procs */
                    jobdat->num_local_procs--;
                    /* kill this proc */
                    killprocs(proc->jobid, proc->vpid);
                }
                return ORTE_SUCCESS;
            }
        }
    }

    if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
        if (!orte_abort_non_zero_exit) {
            /* treat this as normal termination */
            goto REPORT_STATE;
        }
    }

    if (ORTE_PROC_STATE_TERMINATED < state) {
        /* if the job hasn't completed and the state is abnormally
         * terminated, then we need to alert the HNP right away
         */
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto FINAL_CLEANUP;
        }
        /* pack only the data for this proc - have to start with the jobid
         * so the receiver can unpack it correctly
         */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* find this proc in the local children */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = opal_list_get_next(item)) {
            child = (orte_odls_child_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
                if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                    child->state = state;
                    child->exit_code = exit_code;
                }
                /* now pack the child's info */
                if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                /* remove the child from our local list as it is no longer alive */
                opal_list_remove_item(&orte_local_children, &child->super);
                /* Decrement the number of local procs */
                jobdat->num_local_procs--;

                OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                     "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(child->name),
                                     jobdat->num_local_procs));
                
                /* release the child object */
                OBJ_RELEASE(child);
                /* done with loop */
                break;
            }
        }

        /* send it */
    if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        return rc;
    }

 REPORT_STATE:
    /* find this proc in the local children so we can update its state */
    for (item = opal_list_get_first(&orte_local_children);
         item != opal_list_get_end(&orte_local_children);
         item = opal_list_get_next(item)) {
        child = (orte_odls_child_t*)item;
        mask = ORTE_NS_CMP_ALL;
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
            if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
                child->state = state;
                if (0 < pid) {
                    child->pid = pid;
                }
                child->exit_code = exit_code;
            }
            /* done with loop */
            break;
        }
    }

    if (ORTE_PROC_STATE_REGISTERED == state) {
        /* see if everyone in this job has registered */
        if (all_children_registered(proc->jobid)) {
            /* once everyone registers, send their contact info to
             * the HNP so it is available to debuggers and anyone
             * else that needs it
             */

            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                                 "%s errmgr:default_orted: sending contact info to HNP",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            
            alert = OBJ_NEW(opal_buffer_t);
            /* pack init routes command */
            cmd = ORTE_PLM_INIT_ROUTES_CMD;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* pack the jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* pack all the local child vpids and epochs */
            for (item = opal_list_get_first(&orte_local_children);
                 item != opal_list_get_end(&orte_local_children);
                 item = opal_list_get_next(item)) {
                child = (orte_odls_child_t*)item;
                if (child->name->jobid == proc->jobid) {
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->name->vpid, 1, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        goto FINAL_CLEANUP;
                    }
                }
            }
            /* pack an invalid marker */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                goto FINAL_CLEANUP;
            }
            /* add in contact info for all procs in the job */
            if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&alert);
                return rc;
            }
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
        }        
        return rc;
    }

    /* only other state is terminated - see if anyone is left alive */
    if (!any_live_children(proc->jobid)) {
        /* lookup the local jobdat for this job */
        jobdat = NULL;
        for (item = opal_list_get_first(&orte_local_jobdata);
             item != opal_list_get_end(&orte_local_jobdata);
             item = opal_list_get_next(item)) {
            jobdat = (orte_odls_job_t*)item;

            /* is this the specified job? */
            if (jobdat->jobid == proc->jobid) {
                break;
            }
        }
        if (NULL == jobdat) {
            /* race condition - may not have been formed yet */
            return ORTE_SUCCESS;
        }

        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            goto FINAL_CLEANUP;
        }
        /* pack the data for the job */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) {
            ORTE_ERROR_LOG(rc);
        }

FINAL_CLEANUP:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
                             "%s errmgr:default_orted reporting all procs in %s terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jobdat->jobid)));
        
        /* remove all of this job's children from the global list - do not lock
         * the thread as we are already locked
         */
        for (item = opal_list_get_first(&orte_local_children);
             item != opal_list_get_end(&orte_local_children);
             item = next) {
            child = (orte_odls_child_t*)item;
            next = opal_list_get_next(item);

            if (jobdat->jobid == child->name->jobid) {
                opal_list_remove_item(&orte_local_children, &child->super);
                OBJ_RELEASE(child);
            }
        }

        /* ensure the job's local session directory tree is removed */
        orte_session_dir_cleanup(jobdat->jobid);

        /* remove this job from our local job data since it is complete */
        opal_list_remove_item(&orte_local_jobdata, &jobdat->super);
        OBJ_RELEASE(jobdat);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }

        /* indicate that the job is complete */
        return rc;
    }
    return ORTE_SUCCESS;
}