Example 1
int
umask(int mask)
{
	mode_t t;

	t = PTOU(curproc)->u_cmask;
	PTOU(curproc)->u_cmask =  (mode_t)(mask & PERMMASK);
	return ((int)t);
}
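
The routine above implements the swap semantics of umask(2): it installs the new mask in the u-area and hands back the previous one. A minimal user-level sketch of that contract, using only the standard POSIX API (not part of the kernel source above):

#include <sys/stat.h>
#include <stdio.h>

int
main(void)
{
	mode_t old = umask(022);	/* install a new mask, get the old one */

	(void) printf("previous mask: %03o\n", (unsigned int)old);
	(void) umask(old);		/* restore the original mask */
	return (0);
}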
Example 2
/*
 * Kernel setup code, called from startup().
 */
void
kern_setup1(void)
{
	proc_t *pp;

	pp = &p0;

	proc_sched = pp;

	/*
	 * Initialize process 0 data structures
	 */
	pp->p_stat = SRUN;
	pp->p_flag = SSYS;

	pp->p_pidp = &pid0;
	pp->p_pgidp = &pid0;
	pp->p_sessp = &session0;
	pp->p_tlist = &t0;
	pid0.pid_pglink = pp;
	pid0.pid_pgtail = pp;

	/*
	 * XXX - we assume that the u-area is zeroed out except for
	 * ttolwp(curthread)->lwp_regs.
	 */
	PTOU(curproc)->u_cmask = (mode_t)CMASK;

	thread_init();		/* init thread_free list */
	pid_init();		/* initialize pid (proc) table */
	contract_init();	/* initialize contracts */

	init_pages_pp_maximum();
}
Example 3
/*
 * Lookup the user file name from a given vp, using a specific credential.
 */
int
lookuppnatcred(
	struct pathname *pnp,		/* pathname to lookup */
	struct pathname *rpnp,		/* if non-NULL, return resolved path */
	int followlink,			/* (don't) follow sym links */
	vnode_t **dirvpp,		/* ptr for parent vnode */
	vnode_t **compvpp,		/* ptr for entry vnode */
	vnode_t *startvp,		/* start search from this vp */
	cred_t *cr)			/* user credential */
{
	vnode_t *vp;	/* current directory vp */
	vnode_t *rootvp;
	proc_t *p = curproc;

	if (pnp->pn_pathlen == 0)
		return (ENOENT);

	mutex_enter(&p->p_lock);	/* for u_rdir and u_cdir */
	if ((rootvp = PTOU(p)->u_rdir) == NULL)
		rootvp = rootdir;
	else if (rootvp != rootdir)	/* no need to VN_HOLD rootdir */
		VN_HOLD(rootvp);

	if (pnp->pn_path[0] == '/') {
		vp = rootvp;
	} else {
		vp = (startvp == NULL) ? PTOU(p)->u_cdir : startvp;
	}
	VN_HOLD(vp);
	mutex_exit(&p->p_lock);

	/*
	 * Skip over leading slashes
	 */
	if (pnp->pn_path[0] == '/') {
		do {
			pnp->pn_path++;
			pnp->pn_pathlen--;
		} while (pnp->pn_path[0] == '/');
	}

	return (lookuppnvp(pnp, rpnp, followlink, dirvpp,
	    compvpp, rootvp, vp, cr));
}
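
The rpnp argument above is how the fully resolved (symlink-free) path is handed back to callers. At user level, the Solaris resolvepath(2) system call exposes this behavior; a hedged sketch (the path name is illustrative):

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[1024];
	int n;

	/* resolvepath() expands symlinks but does not NUL-terminate */
	if ((n = resolvepath("/var/run", buf, sizeof (buf) - 1)) == -1) {
		perror("resolvepath");
		return (1);
	}
	buf[n] = '\0';
	(void) printf("%s\n", buf);
	return (0);
}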
Example 4
/*
 * Make a directory.
 */
int
mkdir(char *dname, int dmode)
{
	vnode_t *vp;
	struct vattr vattr;
	int error;

	vattr.va_type = VDIR;
	vattr.va_mode = dmode & PERMMASK;
	vattr.va_mask = AT_TYPE|AT_MODE;
	error = vn_create(dname, UIO_USERSPACE, &vattr, EXCL, 0, &vp, CRMKDIR,
	    0, PTOU(curproc)->u_cmask);
	if (error)
		return (set_errno(error));
	VN_RELE(vp);
	return (0);
}
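
Because vn_create() receives PTOU(curproc)->u_cmask as its last argument, the directory's final permission bits are the requested mode with the process umask bits cleared. A user-level illustration of the resulting semantics (standard POSIX calls; the directory name is hypothetical):

#include <sys/stat.h>
#include <sys/types.h>
#include <stdio.h>

int
main(void)
{
	struct stat sb;

	(void) umask(022);
	/* requested 0777; effective mode is 0777 & ~022 == 0755 */
	if (mkdir("demo_dir", 0777) != 0 || stat("demo_dir", &sb) != 0)
		return (1);
	(void) printf("mode: %03o\n", (unsigned int)(sb.st_mode & 0777));
	return (0);
}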
Example 5
/*
 * Native processes are started with the native ld.so.1 as the command.  This
 * brand op is invoked by s10_npreload to fix up the command and arguments
 * so that apps like pgrep or ps see the expected command strings.
 */
int
s10_native(void *cmd, void *args)
{
	struct user	*up = PTOU(curproc);
	char		cmd_buf[MAXCOMLEN + 1];
	char		arg_buf[PSARGSZ];

	if (copyin(cmd, &cmd_buf, sizeof (cmd_buf)) != 0)
		return (EFAULT);
	if (copyin(args, &arg_buf, sizeof (arg_buf)) != 0)
		return (EFAULT);

	/*
	 * Make sure that the process' interpreter is the native dynamic linker.
	 * Convention dictates that native processes executing within solaris10-
	 * branded zones are interpreted by the native dynamic linker (the
	 * process and its arguments are specified as arguments to the dynamic
	 * linker).  If this convention is violated (i.e.,
	 * brandsys(B_S10_NATIVE, ...) is invoked by a process that shouldn't be
	 * native), then do nothing and silently indicate success.
	 */
	if (strcmp(up->u_comm, S10_LINKER_NAME) != 0)
		return (0);

	/*
	 * The sizeof includes the trailing '\0', so this also covers
	 * the appended " " in the following strncmp() calls.
	 */
	if (strncmp(up->u_psargs, BRAND_NATIVE_LINKER64 " ",
	    sizeof (BRAND_NATIVE_LINKER64)) != 0 &&
	    strncmp(up->u_psargs, BRAND_NATIVE_LINKER32 " ",
	    sizeof (BRAND_NATIVE_LINKER32)) != 0)
		return (0);

	mutex_enter(&curproc->p_lock);
	(void) strlcpy(up->u_comm, cmd_buf, sizeof (up->u_comm));
	(void) strlcpy(up->u_psargs, arg_buf, sizeof (up->u_psargs));
	mutex_exit(&curproc->p_lock);

	return (0);
}
Example 6
/*
 * Perform pre-system-call processing, including stopping for tracing,
 * auditing, microstate-accounting, etc.
 *
 * This routine is called only if the t_pre_sys flag is set.  Any condition
 * requiring pre-syscall handling must set the t_pre_sys flag.  If the
 * condition is persistent, this routine will repost t_pre_sys.
 */
int
pre_syscall(int arg0)
{
	unsigned int code;
	kthread_t *t = curthread;
	proc_t *p = ttoproc(t);
	klwp_t *lwp = ttolwp(t);
	struct regs *rp = lwptoregs(lwp);
	int	repost;

	t->t_pre_sys = repost = 0;	/* clear pre-syscall processing flag */

	ASSERT(t->t_schedflag & TS_DONT_SWAP);

	syscall_mstate(LMS_USER, LMS_SYSTEM);

	/*
	 * The syscall arguments in the out registers should be pointed to
	 * by lwp_ap.  If the args need to be copied so that the outs can
	 * be changed without losing the ability to get the args for /proc,
	 * they can be saved by save_syscall_args(), and lwp_ap will be
	 * restored by post_syscall().
	 */
	ASSERT(lwp->lwp_ap == (long *)&rp->r_o0);

	/*
	 * Make sure the thread is holding the latest credentials for the
	 * process.  The credentials in the process right now apply to this
	 * thread for the entire system call.
	 */
	if (t->t_cred != p->p_cred) {
		cred_t *oldcred = t->t_cred;
		/*
		 * DTrace accesses t_cred in probe context.  t_cred must
		 * always be either NULL, or point to a valid, allocated cred
		 * structure.
		 */
		t->t_cred = crgetcred();
		crfree(oldcred);
	}

	/*
	 * Undo special arrangements to single-step the lwp
	 * so that a debugger will see valid register contents.
	 * Also so that the pc is valid for syncfpu().
	 * Also so that a syscall like exec() can be stepped.
	 */
	if (lwp->lwp_pcb.pcb_step != STEP_NONE) {
		(void) prundostep();
		repost = 1;
	}

	/*
	 * Check for indirect system call in case we stop for tracing.
	 * Don't allow multiple indirection.
	 */
	code = t->t_sysnum;
	if (code == 0 && arg0 != 0) {		/* indirect syscall */
		code = arg0;
		t->t_sysnum = arg0;
	}

	/*
	 * From the proc(4) manual page:
	 * When entry to a system call is being traced, the traced process
	 * stops after having begun the call to the system but before the
	 * system call arguments have been fetched from the process.
	 * If proc changes the args we must refetch them after starting.
	 */
	if (PTOU(p)->u_systrap) {
		if (prismember(&PTOU(p)->u_entrymask, code)) {
			/*
			 * Recheck stop condition, now that lock is held.
			 */
			mutex_enter(&p->p_lock);
			if (PTOU(p)->u_systrap &&
			    prismember(&PTOU(p)->u_entrymask, code)) {
				stop(PR_SYSENTRY, code);
				/*
				 * Must refetch args since they were
				 * possibly modified by /proc.  Indicate
				 * that the valid copy is in the
				 * registers.
				 */
				lwp->lwp_argsaved = 0;
				lwp->lwp_ap = (long *)&rp->r_o0;
			}
			mutex_exit(&p->p_lock);
		}
		repost = 1;
	}

	if (lwp->lwp_sysabort) {
		/*
		 * lwp_sysabort may have been set via /proc while the process
		 * was stopped on PR_SYSENTRY.  If so, abort the system call.
		 * Override any error from the copyin() of the arguments.
		 */
		lwp->lwp_sysabort = 0;
		(void) set_errno(EINTR); /* sets post-sys processing */
		t->t_pre_sys = 1;	/* repost anyway */
		return (1);		/* don't do system call, return EINTR */
	}

#ifdef C2_AUDIT
	if (audit_active) {	/* begin auditing for this syscall */
		int error;
		if (error = audit_start(T_SYSCALL, code, 0, lwp)) {
			t->t_pre_sys = 1;	/* repost anyway */
			lwp->lwp_error = 0;	/* for old drivers */
			return (error);
		}
		repost = 1;
	}
#endif /* C2_AUDIT */

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active) {
		TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */,
			tnf_sysnum,	sysnum,		t->t_sysnum);
		t->t_post_sys = 1;	/* make sure post_syscall runs */
		repost = 1;
	}
#endif /* NPROBE */

#ifdef SYSCALLTRACE
	if (syscalltrace) {
		int i;
		long *ap;
		char *cp;
		char *sysname;
		struct sysent *callp;

		if (code >= NSYSCALL)
			callp = &nosys_ent;	/* nosys has no args */
		else
			callp = LWP_GETSYSENT(lwp) + code;
		(void) save_syscall_args();
		mutex_enter(&systrace_lock);
		printf("%d: ", p->p_pid);
		if (code >= NSYSCALL)
			printf("0x%x", code);
		else {
			sysname = mod_getsysname(code);
			printf("%s[0x%x]", sysname == NULL ? "NULL" :
			    sysname, code);
		}
		cp = "(";
		for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
			printf("%s%lx", cp, *ap);
			cp = ", ";
		}
		if (i)
			printf(")");
		printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
		mutex_exit(&systrace_lock);
	}
#endif /* SYSCALLTRACE */

	/*
	 * If there was a continuing reason for pre-syscall processing,
	 * set the t_pre_sys flag for the next system call.
	 */
	if (repost)
		t->t_pre_sys = 1;
	lwp->lwp_error = 0;	/* for old drivers */
	lwp->lwp_badpriv = PRIV_NONE;	/* for privilege tracing */
	return (0);
}
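
The u_systrap/u_entrymask test above is driven from user level through the proc(4) control interface: a PCSENTRY message sets the mask of system calls whose entry stops the target. A minimal sketch under that assumption (error handling and restarting the stopped target are omitted; the traced syscall is illustrative):

#include <sys/types.h>
#include <sys/procfs.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	struct {
		long cmd;
		sysset_t set;
	} ctl;
	char path[64];
	int fd;

	if (argc != 2)
		return (1);
	(void) snprintf(path, sizeof (path), "/proc/%s/ctl", argv[1]);
	if ((fd = open(path, O_WRONLY)) == -1) {
		perror("open");
		return (1);
	}
	ctl.cmd = PCSENTRY;		/* stop the target on syscall entry */
	premptyset(&ctl.set);
	praddset(&ctl.set, SYS_chdir);	/* ... but only for chdir() */
	if (write(fd, &ctl, sizeof (ctl)) != sizeof (ctl))
		perror("write");
	(void) close(fd);
	return (0);
}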
Example 7
static int
chdirec(vnode_t *vp, int ischroot, int do_traverse)
{
	int error;
	vnode_t *oldvp;
	proc_t *pp = curproc;
	vnode_t **vpp;
	refstr_t *cwd;
	int newcwd = 1;

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto bad;
	}
	if (error = VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL))
		goto bad;

	/*
	 * The VOP_ACCESS() may have covered 'vp' with a new filesystem,
	 * if 'vp' is an autoFS vnode. Traverse the mountpoint so
	 * that we don't end up with a covered current directory.
	 */
	if (vn_mountedvfs(vp) != NULL && do_traverse) {
		if (error = traverse(&vp))
			goto bad;
	}

	/*
	 * Special chroot semantics: chroot is allowed if privileged
	 * or if the target is really a loopback mount of the root (or
	 * root of the zone) as determined by comparing dev and inode
	 * numbers
	 */
	if (ischroot) {
		struct vattr tattr;
		struct vattr rattr;
		vnode_t *zonevp = curproc->p_zone->zone_rootvp;

		tattr.va_mask = AT_FSID|AT_NODEID;
		if (error = VOP_GETATTR(vp, &tattr, 0, CRED(), NULL))
			goto bad;

		rattr.va_mask = AT_FSID|AT_NODEID;
		if (error = VOP_GETATTR(zonevp, &rattr, 0, CRED(), NULL))
			goto bad;

		if ((tattr.va_fsid != rattr.va_fsid ||
		    tattr.va_nodeid != rattr.va_nodeid) &&
		    (error = secpolicy_chroot(CRED())) != 0)
			goto bad;

		vpp = &PTOU(pp)->u_rdir;
	} else {
		vpp = &PTOU(pp)->u_cdir;
	}

	if (audit_active)	/* update abs cwd/root path see c2audit.c */
		audit_chdirec(vp, vpp);

	mutex_enter(&pp->p_lock);
	/*
	 * This bit of logic prevents us from overwriting u_cwd if we are
	 * changing to the same directory.  We set the cwd to NULL so that we
	 * don't try to do the lookup on the next call to getcwd().
	 */
	if (!ischroot && *vpp != NULL && vp != NULL && VN_CMP(*vpp, vp))
		newcwd = 0;

	oldvp = *vpp;
	*vpp = vp;
	if ((cwd = PTOU(pp)->u_cwd) != NULL && newcwd)
		PTOU(pp)->u_cwd = NULL;
	mutex_exit(&pp->p_lock);

	if (cwd && newcwd)
		refstr_rele(cwd);
	if (oldvp)
		VN_RELE(oldvp);
	return (0);

bad:
	VN_RELE(vp);
	return (error);
}
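
As the ischroot argument suggests, chdirec() serves as a common backend for the chdir() and chroot() families of calls; the ischroot branch enforces that an unprivileged chroot may only target a loopback of the caller's existing root. The classic user-level pattern that lands here is chroot followed by chdir, so that u_rdir and u_cdir agree (sketch; requires privilege, and /jail is a hypothetical directory):

#include <unistd.h>
#include <err.h>

int
main(void)
{
	if (chroot("/jail") != 0)	/* sets u_rdir via chdirec() */
		err(1, "chroot");
	if (chdir("/") != 0)		/* sets u_cdir inside the new root */
		err(1, "chdir");
	return (0);
}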
Example 8
/*
 * Perform pre-system-call processing, including stopping for tracing,
 * auditing, etc.
 *
 * This routine is called only if the t_pre_sys flag is set. Any condition
 * requiring pre-syscall handling must set the t_pre_sys flag. If the
 * condition is persistent, this routine will repost t_pre_sys.
 */
int
pre_syscall()
{
	kthread_t *t = curthread;
	unsigned code = t->t_sysnum;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	int	repost;

	t->t_pre_sys = repost = 0;	/* clear pre-syscall processing flag */

	ASSERT(t->t_schedflag & TS_DONT_SWAP);

#if defined(DEBUG)
	/*
	 * On the i386 kernel, lwp_ap points at the piece of the thread
	 * stack that we copy the user's arguments into.
	 *
	 * On the amd64 kernel, the syscall arguments in the rdi..r9
	 * registers should be pointed at by lwp_ap.  If the args need to
	 * be copied so that those registers can be changed without losing
	 * the ability to get the args for /proc, they can be saved by
	 * save_syscall_args(), and lwp_ap will be restored by post_syscall().
	 */
	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
#if defined(_LP64)
		ASSERT(lwp->lwp_ap == (long *)&lwptoregs(lwp)->r_rdi);
	} else {
#endif
		ASSERT((caddr_t)lwp->lwp_ap > t->t_stkbase &&
			(caddr_t)lwp->lwp_ap < t->t_stk);
	}
#endif	/* DEBUG */

	/*
	 * Make sure the thread is holding the latest credentials for the
	 * process.  The credentials in the process right now apply to this
	 * thread for the entire system call.
	 */
	if (t->t_cred != p->p_cred) {
		cred_t *oldcred = t->t_cred;
		/*
		 * DTrace accesses t_cred in probe context.  t_cred must
		 * always be either NULL, or point to a valid, allocated cred
		 * structure.
		 */
		t->t_cred = crgetcred();
		crfree(oldcred);
	}

	/*
	 * From the proc(4) manual page:
	 * When entry to a system call is being traced, the traced process
	 * stops after having begun the call to the system but before the
	 * system call arguments have been fetched from the process.
	 */
	if (PTOU(p)->u_systrap) {
		if (prismember(&PTOU(p)->u_entrymask, code)) {
			mutex_enter(&p->p_lock);
			/*
			 * Recheck stop condition, now that lock is held.
			 */
			if (PTOU(p)->u_systrap &&
			    prismember(&PTOU(p)->u_entrymask, code)) {
				stop(PR_SYSENTRY, code);

				/*
				 * /proc may have modified syscall args,
				 * either in regs for amd64 or on ustack
				 * for ia32.  Either way, arrange to
				 * copy them again, both for the syscall
				 * handler and for other consumers in
				 * post_syscall (like audit).  Here, we
				 * only do amd64, and just set lwp_ap
				 * back to the kernel-entry stack copy;
				 * the syscall ml code redoes
				 * move-from-regs to set up for the
				 * syscall handler after we return.  For
				 * ia32, save_syscall_args() below makes
				 * an lwp_ap-accessible copy.
				 */
#if defined(_LP64)
				if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
					lwp->lwp_argsaved = 0;
					lwp->lwp_ap =
					    (long *)&lwptoregs(lwp)->r_rdi;
				}
#endif
			}
			mutex_exit(&p->p_lock);
		}
		repost = 1;
	}

	/*
	 * ia32 kernel, or ia32 proc on amd64 kernel: keep args in
	 * lwp_arg for post-syscall processing, regardless of whether
	 * they might have been changed in /proc above.
	 */
#if defined(_LP64)
	if (lwp_getdatamodel(lwp) != DATAMODEL_NATIVE)
#endif
		(void) save_syscall_args();

	if (lwp->lwp_sysabort) {
		/*
		 * lwp_sysabort may have been set via /proc while the process
		 * was stopped on PR_SYSENTRY.  If so, abort the system call.
		 * Override any error from the copyin() of the arguments.
		 */
		lwp->lwp_sysabort = 0;
		(void) set_errno(EINTR);	/* forces post_sys */
		t->t_pre_sys = 1;	/* repost anyway */
		return (1);		/* don't do system call, return EINTR */
	}

#ifdef C2_AUDIT
	if (audit_active) {	/* begin auditing for this syscall */
		int error;
		if (error = audit_start(T_SYSCALL, code, 0, lwp)) {
			t->t_pre_sys = 1;	/* repost anyway */
			(void) set_errno(error);
			return (1);
		}
		repost = 1;
	}
#endif /* C2_AUDIT */

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active) {
		TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */,
			tnf_sysnum,	sysnum,		t->t_sysnum);
		t->t_post_sys = 1;	/* make sure post_syscall runs */
		repost = 1;
	}
#endif /* NPROBE */

#ifdef SYSCALLTRACE
	if (syscalltrace) {
		int i;
		long *ap;
		char *cp;
		char *sysname;
		struct sysent *callp;

		if (code >= NSYSCALL)
			callp = &nosys_ent;	/* nosys has no args */
		else
			callp = LWP_GETSYSENT(lwp) + code;
		(void) save_syscall_args();
		mutex_enter(&systrace_lock);
		printf("%d: ", p->p_pid);
		if (code >= NSYSCALL)
			printf("0x%x", code);
		else {
			sysname = mod_getsysname(code);
			printf("%s[0x%x/0x%p]", sysname == NULL ? "NULL" :
			    sysname, code, callp->sy_callc);
		}
		cp = "(";
		for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
			printf("%s%lx", cp, *ap);
			cp = ", ";
		}
		if (i)
			printf(")");
		printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
		mutex_exit(&systrace_lock);
	}
#endif /* SYSCALLTRACE */

	/*
	 * If there was a continuing reason for pre-syscall processing,
	 * set the t_pre_sys flag for the next system call.
	 */
	if (repost)
		t->t_pre_sys = 1;
	lwp->lwp_error = 0;	/* for old drivers */
	lwp->lwp_badpriv = PRIV_NONE;
	return (0);
}
Example 9
static int
copen(int startfd, char *fname, int filemode, int createmode)
{
	struct pathname pn;
	vnode_t *vp, *sdvp;
	file_t *fp, *startfp;
	enum vtype type;
	int error;
	int fd, dupfd;
	vnode_t *startvp;
	proc_t *p = curproc;

	if (startfd == AT_FDCWD) {
		/*
		 * Regular open()
		 */
		startvp = NULL;
	} else {
		/*
		 * We're here via openat()
		 */
		char startchar;

		if (copyin(fname, &startchar, sizeof (char)))
			return (set_errno(EFAULT));

		/*
		 * if startchar is / then startfd is ignored
		 */
		if (startchar == '/')
			startvp = NULL;
		else {
			if ((startfp = getf(startfd)) == NULL)
				return (set_errno(EBADF));
			startvp = startfp->f_vnode;
			VN_HOLD(startvp);
			releasef(startfd);
		}
	}

	if (filemode & FXATTR) {

		/*
		 * Make sure we have a valid request.
		 * We must either have a real fd or AT_FDCWD
		 */

		if (startfd != AT_FDCWD && startvp == NULL) {
			error = EINVAL;
			goto out;
		}

		if (error = pn_get(fname, UIO_USERSPACE, &pn)) {
			goto out;
		}

		if (startfd == AT_FDCWD) {
			mutex_enter(&p->p_lock);
			startvp = PTOU(p)->u_cdir;
			VN_HOLD(startvp);
			mutex_exit(&p->p_lock);
		}

		/*
		 * Verify permission to put attributes on file
		 */

		if ((VOP_ACCESS(startvp, VREAD, 0, CRED()) != 0) &&
		    (VOP_ACCESS(startvp, VWRITE, 0, CRED()) != 0) &&
		    (VOP_ACCESS(startvp, VEXEC, 0, CRED()) != 0)) {
			error = EACCES;
			pn_free(&pn);
			goto out;
		}

		if ((startvp->v_vfsp->vfs_flag & VFS_XATTR) != 0) {
			error = VOP_LOOKUP(startvp, "", &sdvp, &pn,
			    LOOKUP_XATTR|CREATE_XATTR_DIR, rootvp, CRED());
		} else {
			error = EINVAL;
		}
		pn_free(&pn);
		if (error != 0)
			goto out;

		VN_RELE(startvp);
		startvp = sdvp;
	}

	if ((filemode & (FREAD|FWRITE)) != 0) {
		if ((filemode & (FNONBLOCK|FNDELAY)) == (FNONBLOCK|FNDELAY))
			filemode &= ~FNDELAY;
		error = falloc((vnode_t *)NULL, filemode, &fp, &fd);
		if (error == 0) {
#ifdef C2_AUDIT
			if (audit_active)
				audit_setfsat_path(1);
#endif /* C2_AUDIT */
			/*
			 * Last arg is a don't-care term if
			 * !(filemode & FCREAT).
			 */
			error = vn_openat(fname, UIO_USERSPACE, filemode,
			    (int)(createmode & MODEMASK), &vp, CRCREAT,
			    u.u_cmask, startvp);

			if (startvp != NULL)
				VN_RELE(startvp);
			if (error == 0) {
#ifdef C2_AUDIT
				if (audit_active)
					audit_copen(fd, fp, vp);
#endif /* C2_AUDIT */
				if ((vp->v_flag & VDUP) == 0) {
					fp->f_vnode = vp;
					mutex_exit(&fp->f_tlock);
					/*
					 * We must now fill in the slot
					 * falloc reserved.
					 */
					setf(fd, fp);
					return (fd);
				} else {
					/*
					 * Special handling for /dev/fd.
					 * Give up the file pointer
					 * and dup the indicated file descriptor
					 * (in v_rdev). This is ugly, but I've
					 * seen worse.
					 */
					unfalloc(fp);
					dupfd = getminor(vp->v_rdev);
					type = vp->v_type;
					mutex_enter(&vp->v_lock);
					vp->v_flag &= ~VDUP;
					mutex_exit(&vp->v_lock);
					VN_RELE(vp);
					if (type != VCHR)
						return (set_errno(EINVAL));
					if ((fp = getf(dupfd)) == NULL) {
						setf(fd, NULL);
						return (set_errno(EBADF));
					}
					mutex_enter(&fp->f_tlock);
					fp->f_count++;
					mutex_exit(&fp->f_tlock);
					setf(fd, fp);
					releasef(dupfd);
				}
				return (fd);
			} else {
				setf(fd, NULL);
				unfalloc(fp);
				return (set_errno(error));
			}
		}
	} else {
		error = EINVAL;
	}
out:
	if (startvp != NULL)
		VN_RELE(startvp);
	return (set_errno(error));
}
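
The VDUP branch above implements the /dev/fd special case: opening /dev/fd/N gives up the freshly allocated file pointer and duplicates descriptor N instead. A small user-level sketch of the visible behavior (assumes a mounted /dev/fd filesystem):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char path[32];
	int fd;

	(void) snprintf(path, sizeof (path), "/dev/fd/%d", STDIN_FILENO);
	fd = open(path, O_RDONLY);	/* roughly equivalent to dup(0) */
	if (fd == -1) {
		perror("open");
		return (1);
	}
	(void) close(fd);
	return (0);
}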
Example 10
int
sendsig(int sig, k_siginfo_t *sip, void (*hdlr)())
{
	volatile int minstacksz;
	int newstack;
	label_t ljb;
	volatile caddr_t sp;
	caddr_t fp;
	volatile struct regs *rp;
	volatile greg_t upc;
	volatile proc_t *p = ttoproc(curthread);
	struct as *as = p->p_as;
	klwp_t *lwp = ttolwp(curthread);
	ucontext_t *volatile tuc = NULL;
	ucontext_t *uc;
	siginfo_t *sip_addr;
	volatile int watched;

	/*
	 * This routine is utterly dependent upon STACK_ALIGN being
	 * 16 and STACK_ENTRY_ALIGN being 8. Let's just acknowledge
	 * that and require it.
	 */

#if STACK_ALIGN != 16 || STACK_ENTRY_ALIGN != 8
#error "sendsig() amd64 did not find the expected stack alignments"
#endif

	rp = lwptoregs(lwp);
	upc = rp->r_pc;

	/*
	 * Since we're setting up to run the signal handler we have to
	 * arrange that the stack at entry to the handler is (only)
	 * STACK_ENTRY_ALIGN (i.e. 8) byte aligned so that when the handler
	 * executes its push of %rbp, the stack realigns to STACK_ALIGN
	 * (i.e. 16) correctly.
	 *
	 * The new sp will point to the sigframe and the ucontext_t. The
	 * above means that sp (and thus sigframe) will be 8-byte aligned,
	 * but not 16-byte aligned. ucontext_t, however, contains %xmm regs
	 * which must be 16-byte aligned. Because of this, for correct
	 * alignment, sigframe must be a multiple of 8-bytes in length, but
	 * not 16-bytes. This will place ucontext_t at a nice 16-byte boundary.
	 */

	/* LINTED: logical expression always true: op "||" */
	ASSERT((sizeof (struct sigframe) % 16) == 8);

	minstacksz = sizeof (struct sigframe) + SA(sizeof (*uc));
	if (sip != NULL)
		minstacksz += SA(sizeof (siginfo_t));
	ASSERT((minstacksz & (STACK_ENTRY_ALIGN - 1ul)) == 0);

	/*
	 * Figure out whether we will be handling this signal on
	 * an alternate stack specified by the user.  Then allocate
	 * and validate the stack requirements for the signal handler
	 * context.  on_fault will catch any faults.
	 */
	newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) &&
	    !(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE));

	if (newstack) {
		fp = (caddr_t)(SA((uintptr_t)lwp->lwp_sigaltstack.ss_sp) +
		    SA(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN);
	} else {
		/*
		 * Drop below the 128-byte reserved region of the stack frame
		 * we're interrupting.
		 */
		fp = (caddr_t)rp->r_sp - STACK_RESERVE;
	}

	/*
	 * Force proper stack pointer alignment, even in the face of a
	 * misaligned stack pointer from user-level before the signal.
	 */
	fp = (caddr_t)((uintptr_t)fp & ~(STACK_ENTRY_ALIGN - 1ul));

	/*
	 * Most of the time during normal execution, the stack pointer
	 * is aligned on a STACK_ALIGN (i.e. 16 byte) boundary.  However,
	 * (for example) just after a call instruction (which pushes
	 * the return address), the callers stack misaligns until the
	 * 'push %rbp' happens in the callee prolog.  So while we should
	 * expect the stack pointer to be always at least STACK_ENTRY_ALIGN
	 * aligned, we should -not- expect it to always be STACK_ALIGN aligned.
	 * We now adjust to ensure that the new sp is aligned to
	 * STACK_ENTRY_ALIGN but not to STACK_ALIGN.
	 */
	sp = fp - minstacksz;
	if (((uintptr_t)sp & (STACK_ALIGN - 1ul)) == 0) {
		sp -= STACK_ENTRY_ALIGN;
		minstacksz = fp - sp;
	}

	/*
	 * Now, make sure the resulting signal frame address is sane
	 */
	if (sp >= as->a_userlimit || fp >= as->a_userlimit) {
#ifdef DEBUG
		printf("sendsig: bad signal stack cmd=%s, pid=%d, sig=%d\n",
		    PTOU(p)->u_comm, p->p_pid, sig);
		printf("sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n",
		    (void *)sp, (void *)hdlr, (uintptr_t)upc);
		printf("sp above USERLIMIT\n");
#endif
		return (0);
	}

	watched = watch_disable_addr((caddr_t)sp, minstacksz, S_WRITE);

	if (on_fault(&ljb))
		goto badstack;

	if (sip != NULL) {
		zoneid_t zoneid;

		fp -= SA(sizeof (siginfo_t));
		uzero(fp, sizeof (siginfo_t));
		if (SI_FROMUSER(sip) &&
		    (zoneid = p->p_zone->zone_id) != GLOBAL_ZONEID &&
		    zoneid != sip->si_zoneid) {
			k_siginfo_t sani_sip = *sip;

			sani_sip.si_pid = p->p_zone->zone_zsched->p_pid;
			sani_sip.si_uid = 0;
			sani_sip.si_ctid = -1;
			sani_sip.si_zoneid = zoneid;
			copyout_noerr(&sani_sip, fp, sizeof (sani_sip));
		} else
			copyout_noerr(sip, fp, sizeof (*sip));
		sip_addr = (siginfo_t *)fp;

		if (sig == SIGPROF &&
		    curthread->t_rprof != NULL &&
		    curthread->t_rprof->rp_anystate) {
			/*
			 * We stand on our head to deal with
			 * the real time profiling signal.
			 * Fill in the stuff that doesn't fit
			 * in a normal k_siginfo structure.
			 */
			int i = sip->si_nsysarg;

			while (--i >= 0)
				sulword_noerr(
				    (ulong_t *)&(sip_addr->si_sysarg[i]),
				    (ulong_t)lwp->lwp_arg[i]);
			copyout_noerr(curthread->t_rprof->rp_state,
			    sip_addr->si_mstate,
			    sizeof (curthread->t_rprof->rp_state));
		}
	} else
		sip_addr = NULL;

	/*
	 * save the current context on the user stack directly after the
	 * sigframe. Since sigframe is 8-byte-but-not-16-byte aligned,
	 * and since sizeof (struct sigframe) is 24, this guarantees
	 * 16-byte alignment for ucontext_t and its %xmm registers.
	 */
	uc = (ucontext_t *)(sp + sizeof (struct sigframe));
	tuc = kmem_alloc(sizeof (*tuc), KM_SLEEP);
	no_fault();
	savecontext(tuc, &lwp->lwp_sigoldmask);
	if (on_fault(&ljb))
		goto badstack;
	copyout_noerr(tuc, uc, sizeof (*tuc));
	kmem_free(tuc, sizeof (*tuc));
	tuc = NULL;

	lwp->lwp_oldcontext = (uintptr_t)uc;

	if (newstack) {
		lwp->lwp_sigaltstack.ss_flags |= SS_ONSTACK;
		if (lwp->lwp_ustack)
			copyout_noerr(&lwp->lwp_sigaltstack,
			    (stack_t *)lwp->lwp_ustack, sizeof (stack_t));
	}

	/*
	 * Set up signal handler return and stack linkage
	 */
	{
		struct sigframe frame;

		/*
		 * ensure we never return "normally"
		 */
		frame.retaddr = (caddr_t)(uintptr_t)-1L;
		frame.signo = sig;
		frame.sip = sip_addr;
		copyout_noerr(&frame, sp, sizeof (frame));
	}

	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE);

	/*
	 * Set up user registers for execution of signal handler.
	 */
	rp->r_sp = (greg_t)sp;
	rp->r_pc = (greg_t)hdlr;
	rp->r_ps = PSL_USER | (rp->r_ps & PS_IOPL);

	rp->r_rdi = sig;
	rp->r_rsi = (uintptr_t)sip_addr;
	rp->r_rdx = (uintptr_t)uc;

	if ((rp->r_cs & 0xffff) != UCS_SEL ||
	    (rp->r_ss & 0xffff) != UDS_SEL) {
		/*
		 * Try our best to deliver the signal.
		 */
		rp->r_cs = UCS_SEL;
		rp->r_ss = UDS_SEL;
	}

	/*
	 * Don't set lwp_eosys here.  sendsig() is called via psig() after
	 * lwp_eosys is handled, so setting it here would affect the next
	 * system call.
	 */
	return (1);

badstack:
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE);
	if (tuc)
		kmem_free(tuc, sizeof (*tuc));
#ifdef DEBUG
	printf("sendsig: bad signal stack cmd=%s, pid=%d, sig=%d\n",
	    PTOU(p)->u_comm, p->p_pid, sig);
	printf("on fault, sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n",
	    (void *)sp, (void *)hdlr, (uintptr_t)upc);
#endif
	return (0);
}
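
The newstack computation above fires only when the handler was registered with SA_ONSTACK and an alternate stack is installed and enabled. A user-level setup that exercises that path, using the standard POSIX sigaltstack/sigaction interfaces:

#include <signal.h>

static char altstack[SIGSTKSZ];

static void
handler(int sig, siginfo_t *sip, void *ucp)
{
	/* runs on altstack; ucp points at the ucontext_t built by sendsig() */
	(void) sig; (void) sip; (void) ucp;
}

int
main(void)
{
	stack_t ss;
	struct sigaction sa;

	ss.ss_sp = altstack;
	ss.ss_size = sizeof (altstack);
	ss.ss_flags = 0;
	if (sigaltstack(&ss, NULL) != 0)
		return (1);

	sa.sa_sigaction = handler;
	sa.sa_flags = SA_ONSTACK | SA_SIGINFO;
	(void) sigemptyset(&sa.sa_mask);
	return (sigaction(SIGUSR1, &sa, NULL) != 0);
}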
Example 11
/*
 * The additional flag, LOOKUP_CHECKREAD, is used to enforce artificial
 * constraints in order to be standards compliant.  For example, if we have
 * the cached path of '/foo/bar', and '/foo' has permissions 100 (execute
 * only), then we can legitimately look up the path to the current working
 * directory without needing read permission.  Existing standards tests,
 * however, assume that we are determining the path by repeatedly looking up
 * "..".  We need to keep this behavior in order to maintain backwards
 * compatibility.
 */
static int
vnodetopath_common(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen,
    cred_t *cr, int flags)
{
	pathname_t pn, rpn;
	int ret, len;
	vnode_t *compvp, *pvp, *realvp;
	proc_t *p = curproc;
	char path[MAXNAMELEN];
	int doclose = 0;

	/*
	 * If vrootp is NULL, get the root for curproc.  Callers with any other
	 * requirements should pass in a different vrootp.
	 */
	if (vrootp == NULL) {
		mutex_enter(&p->p_lock);
		if ((vrootp = PTOU(p)->u_rdir) == NULL)
			vrootp = rootdir;
		VN_HOLD(vrootp);
		mutex_exit(&p->p_lock);
	} else {
		VN_HOLD(vrootp);
	}

	/*
	 * This is to get around an annoying artifact of the /proc filesystem,
	 * which is the behavior of {cwd/root}.  Trying to resolve this path
	 * will result in /proc/pid/cwd instead of whatever the real working
	 * directory is.  We can't rely on VOP_REALVP(), since that will break
	 * lofs.  The only difference between procfs and lofs is that opening
	 * the file will return the underlying vnode in the case of procfs.
	 */
	if (vp->v_type == VDIR && VOP_REALVP(vp, &realvp, NULL) == 0 &&
	    realvp != vp) {
		VN_HOLD(vp);
		if (VOP_OPEN(&vp, FREAD, cr, NULL) == 0)
			doclose = 1;
		else
			VN_RELE(vp);
	}

	pn_alloc(&pn);

	/*
	 * Check to see if we have a cached path in the vnode.
	 */
	mutex_enter(&vp->v_lock);
	if (vp->v_path != NULL) {
		(void) pn_set(&pn, vp->v_path);
		mutex_exit(&vp->v_lock);

		pn_alloc(&rpn);

		/* We should only cache absolute paths */
		ASSERT(pn.pn_buf[0] == '/');

		/*
		 * If we are in a zone or a chroot environment, then we have to
		 * take additional steps, since the path to the root might not
		 * be readable with the current credentials, even though the
		 * process can legitimately access the file.  In this case, we
		 * do the following:
		 *
		 * lookuppnvp() with all privileges to get the resolved path.
		 * call localpath() to get the local portion of the path, and
		 * continue as normal.
		 *
		 * If the conversion to a local path fails, then we continue
		 * as normal.  This is a heuristic to make process object file
		 * paths available from within a zone.  Because lofs doesn't
		 * support page operations, the vnode stored in the seg_t is
		 * actually the underlying real vnode, not the lofs node itself.
		 * Most of the time, the lofs path is the same as the underlying
		 * vnode (for example, /usr/lib/libc.so.1).
		 */
		if (vrootp != rootdir) {
			char *local = NULL;
			VN_HOLD(rootdir);
			if (lookuppnvp(&pn, &rpn, FOLLOW,
			    NULL, &compvp, rootdir, rootdir, kcred) == 0) {
				local = localpath(rpn.pn_path, vrootp,
				    kcred);
				VN_RELE(compvp);
			}

			/*
			 * The original pn was changed through lookuppnvp().
			 * Set it to local for next validation attempt.
			 */
			if (local) {
				(void) pn_set(&pn, local);
			} else {
				goto notcached;
			}
		}

		/*
		 * We should have a local path at this point, so start the
		 * search from the root of the current process.
		 */
		VN_HOLD(vrootp);
		if (vrootp != rootdir)
			VN_HOLD(vrootp);
		ret = lookuppnvp(&pn, &rpn, FOLLOW | flags, NULL,
		    &compvp, vrootp, vrootp, cr);
		if (ret == 0) {
			/*
			 * Check to see if the returned vnode is the same as
			 * the one we expect.  If not, give up.
			 */
			if (!vn_compare(vp, compvp) &&
			    !vnode_match(vp, compvp, cr)) {
				VN_RELE(compvp);
				goto notcached;
			}

			VN_RELE(compvp);

			/*
			 * Return the result.
			 */
			if (buflen <= rpn.pn_pathlen)
				goto notcached;

			bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1);
			pn_free(&pn);
			pn_free(&rpn);
			VN_RELE(vrootp);
			if (doclose) {
				(void) VOP_CLOSE(vp, FREAD, 1, 0, cr, NULL);
				VN_RELE(vp);
			}
			return (0);
		}

notcached:
		pn_free(&rpn);
	} else {
		mutex_exit(&vp->v_lock);
	}

	pn_free(&pn);

	if (vp->v_type != VDIR) {
		/*
		 * If we don't have a directory, try to find it in the dnlc via
		 * reverse lookup.  Once this is found, we can use the regular
		 * directory search to find the full path.
		 */
		if ((pvp = dnlc_reverse_lookup(vp, path, MAXNAMELEN)) != NULL) {
			/*
			 * Check whether we have read privilege, so that
			 * we can look up the path in the directory.
			 */
			ret = 0;
			if ((flags & LOOKUP_CHECKREAD)) {
				ret = VOP_ACCESS(pvp, VREAD, 0, cr, NULL);
			}
			if (ret == 0) {
				ret = dirtopath(vrootp, pvp, buf, buflen,
				    flags, cr);
			}
			if (ret == 0) {
				len = strlen(buf);
				if (len + strlen(path) + 1 >= buflen) {
					ret = ENAMETOOLONG;
				} else {
					if (buf[len - 1] != '/')
						buf[len++] = '/';
					bcopy(path, buf + len,
					    strlen(path) + 1);
				}
			}

			VN_RELE(pvp);
		} else
			ret = ENOENT;
	} else
		ret = dirtopath(vrootp, vp, buf, buflen, flags, cr);

	VN_RELE(vrootp);
	if (doclose) {
		(void) VOP_CLOSE(vp, FREAD, 1, 0, cr, NULL);
		VN_RELE(vp);
	}

	return (ret);
}
Example 12
void
main(void)
{
	proc_t		*p = ttoproc(curthread);	/* &p0 */
	int		(**initptr)();
	extern void	sched();
	extern void	fsflush();
	extern int	(*init_tbl[])();
	extern int	(*mp_init_tbl[])();
	extern id_t	syscid, defaultcid;
	extern int	swaploaded;
	extern int	netboot;
	extern ib_boot_prop_t *iscsiboot_prop;
	extern void	vm_init(void);
	extern void	cbe_init_pre(void);
	extern void	cbe_init(void);
	extern void	clock_tick_init_pre(void);
	extern void	clock_tick_init_post(void);
	extern void	clock_init(void);
	extern void	physio_bufs_init(void);
	extern void	pm_cfb_setup_intr(void);
	extern int	pm_adjust_timestamps(dev_info_t *, void *);
	extern void	start_other_cpus(int);
	extern void	sysevent_evc_thrinit();
	extern kmutex_t	ualock;
#if defined(__x86)
	extern void	fastboot_post_startup(void);
	extern void	progressbar_start(void);
#endif
	/*
	 * In the horrible world of x86 in-lines, you can't get symbolic
	 * structure offsets a la genassym.  This assertion is here so
	 * that the next poor slob who innocently changes the offset of
	 * cpu_thread doesn't waste as much time as I just did finding
	 * out that it's hard-coded in i86/ml/i86.il.  Similarly for
	 * curcpup.  You're welcome.
	 */
	ASSERT(CPU == CPU->cpu_self);
	ASSERT(curthread == CPU->cpu_thread);
	ASSERT_STACK_ALIGNED();

	/*
	 * We take the ualock until we have completed the startup
	 * to prevent kadmin() from disrupting this work. In particular,
	 * we don't want kadmin() to bring the system down while we are
	 * trying to start it up.
	 */
	mutex_enter(&ualock);

	/*
	 * Setup root lgroup and leaf lgroup for CPU 0
	 */
	lgrp_init(LGRP_INIT_STAGE2);

	/*
	 * Once startup() completes, the thread_reaper() daemon is created
	 * (in thread_init()).  After that, it is safe to create threads
	 * that may exit; exited threads will get reaped.
	 */
	startup();
	segkmem_gc();
	callb_init();
	cbe_init_pre();	/* x86 must initialize gethrtimef before timer_init */
	ddi_periodic_init();
	cbe_init();
	callout_init();	/* callout table MUST be init'd after cyclics */
	clock_tick_init_pre();
	clock_init();

#if defined(__x86)
	/*
	 * The progressbar thread uses cv_reltimedwait() and hence needs to be
	 * started after the callout mechanism has been initialized.
	 */
	progressbar_start();
#endif
	/*
	 * On some platforms, clkinitf() changes the timing source that
	 * gethrtime_unscaled() uses to generate timestamps.  cbe_init() calls
	 * clkinitf(), so re-initialize the microstate counters after the
	 * timesource has been chosen.
	 */
	init_mstate(&t0, LMS_SYSTEM);
	init_cpu_mstate(CPU, CMS_SYSTEM);

	/*
	 * We may need to probe to determine latencies from CPU 0 after
	 * gethrtime() comes alive in cbe_init() and before enabling
	 * interrupts, and to copy and release any temporary memory
	 * allocated with BOP_ALLOC() before release_bootstrap() frees
	 * boot memory.
	 */
	lgrp_init(LGRP_INIT_STAGE3);

	/*
	 * Call all system initialization functions.
	 */
	for (initptr = &init_tbl[0]; *initptr; initptr++)
		(**initptr)();
	/*
	 * Load iSCSI boot properties
	 */
	ld_ib_prop();
	/*
	 * initialize vm related stuff.
	 */
	vm_init();

	/*
	 * initialize buffer pool for raw I/O requests
	 */
	physio_bufs_init();

	ttolwp(curthread)->lwp_error = 0; /* XXX kludge for SCSI driver */

	/*
	 * Drop the interrupt level and allow interrupts.  At this point
	 * the DDI guarantees that interrupts are enabled.
	 */
	(void) spl0();
	interrupts_unleashed = 1;

	/*
	 * Create kmem cache for proc structures
	 */
	process_cache = kmem_cache_create("process_cache", sizeof (proc_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	vfs_mountroot();	/* Mount the root file system */
	errorq_init();		/* after vfs_mountroot() so DDI root is ready */
	cpu_kstat_init(CPU);	/* after vfs_mountroot() so TOD is valid */
	ddi_walk_devs(ddi_root_node(), pm_adjust_timestamps, NULL);
				/* after vfs_mountroot() so hrestime is valid */

	post_startup();
	swaploaded = 1;

	/*
	 * Initialize Solaris Audit Subsystem
	 */
	audit_init();

	/*
	 * Plumb the protocol modules and drivers only if we are not
	 * network booted; in that case we already did it in rootconf().
	 */
	if (netboot == 0 && iscsiboot_prop == NULL)
		(void) strplumb();

	gethrestime(&PTOU(curproc)->u_start);
	curthread->t_start = PTOU(curproc)->u_start.tv_sec;
	p->p_mstart = gethrtime();

	/*
	 * Perform setup functions that can only be done after root
	 * and swap have been set up.
	 */
	consconfig();
#ifndef	__sparc
	release_bootstrap();
#endif

	/*
	 * Attach drivers with the ddi-forceattach property.  This must be
	 * done early enough to load hotplug drivers (e.g. the pcmcia nexus)
	 * so that devices enumerated via hotplug are available before the
	 * I/O subsystem is fully initialized.
	 */
	i_ddi_forceattach_drivers();

	/*
	 * Set the scan rate and other parameters of the paging subsystem.
	 */
	setupclock(0);

	/*
	 * Initialize process 0's lwp directory and lwpid hash table.
	 */
	p->p_lwpdir = p->p_lwpfree = p0_lwpdir;
	p->p_lwpdir->ld_next = p->p_lwpdir + 1;
	p->p_lwpdir_sz = 2;
	p->p_tidhash = p0_tidhash;
	p->p_tidhash_sz = 2;
	p0_lep.le_thread = curthread;
	p0_lep.le_lwpid = curthread->t_tid;
	p0_lep.le_start = curthread->t_start;
	lwp_hash_in(p, &p0_lep, p0_tidhash, 2, 0);

	/*
	 * Initialize extended accounting.
	 */
	exacct_init();

	/*
	 * Initialize threads of sysevent event channels
	 */
	sysevent_evc_thrinit();

	/*
	 * This must be done after post_startup() but before
	 * start_other_cpus()
	 */
	lgrp_init(LGRP_INIT_STAGE4);

	/*
	 * Perform MP initialization, if any.
	 */
	start_other_cpus(0);

#ifdef	__sparc
	/*
	 * Release bootstrap here since PROM interfaces are
	 * used to start other CPUs above.
	 */
	release_bootstrap();
#endif

	/*
	 * Finish lgrp initialization after all CPUS are brought online.
	 */
	lgrp_init(LGRP_INIT_STAGE5);

	/*
	 * After mp_init(), the number of CPUs is known (this holds for
	 * the time being; once CPUs are actually hot-pluggable this
	 * scheme will no longer do).  Any per-CPU initialization is
	 * done here.
	 */
	kmem_mp_init();
	vmem_update(NULL);

	clock_tick_init_post();

	for (initptr = &mp_init_tbl[0]; *initptr; initptr++)
		(**initptr)();

	/*
	 * These must be called after start_other_cpus
	 */
	pm_cfb_setup_intr();
#if defined(__x86)
	fastboot_post_startup();
#endif

	/*
	 * Make init process; enter scheduling loop with system process.
	 *
	 * Note that we manually assign the pids for these processes, for
	 * historical reasons.  If more pre-assigned pids are needed,
	 * FAMOUS_PIDS will have to be updated.
	 */

	/* create init process */
	if (newproc(start_init, NULL, defaultcid, 59, NULL,
	    FAMOUS_PID_INIT))
		panic("main: unable to fork init.");

	/* create pageout daemon */
	if (newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL,
	    FAMOUS_PID_PAGEOUT))
		panic("main: unable to fork pageout()");

	/* create fsflush daemon */
	if (newproc(fsflush, NULL, syscid, minclsyspri, NULL,
	    FAMOUS_PID_FSFLUSH))
		panic("main: unable to fork fsflush()");

	/* create cluster process if we're a member of one */
	if (cluster_bootflags & CLUSTER_BOOTED) {
		if (newproc(cluster_wrapper, NULL, syscid, minclsyspri,
		    NULL, 0)) {
			panic("main: unable to fork cluster()");
		}
	}

	/*
	 * Create system threads (threads are associated with p0)
	 */

	/* create module uninstall daemon */
	/* BugID 1132273. If swapping over NFS need a bigger stack */
	(void) thread_create(NULL, 0, (void (*)())mod_uninstall_daemon,
	    NULL, 0, &p0, TS_RUN, minclsyspri);

	(void) thread_create(NULL, 0, seg_pasync_thread,
	    NULL, 0, &p0, TS_RUN, minclsyspri);

	pid_setmin();

	/* system is now ready */
	mutex_exit(&ualock);

	bcopy("sched", PTOU(curproc)->u_psargs, 6);
	bcopy("sched", PTOU(curproc)->u_comm, 5);
	sched();
	/* NOTREACHED */
}
Example 13
static int
copen(int startfd, char *fname, int filemode, int createmode)
{
	struct pathname pn;
	vnode_t *vp, *sdvp;
	file_t *fp, *startfp;
	enum vtype type;
	int error;
	int fd, dupfd;
	vnode_t *startvp;
	proc_t *p = curproc;
	uio_seg_t seg = UIO_USERSPACE;
	char *open_filename = fname;
	uint32_t auditing = AU_AUDITING();
	char startchar;

	if (filemode & (FSEARCH|FEXEC)) {
		/*
		 * Must be one or the other and neither FREAD nor FWRITE
		 * Must not be any of FAPPEND FCREAT FTRUNC FXATTR FXATTRDIROPEN
		 * XXX: Should these just be silently ignored?
		 */
		if ((filemode & (FREAD|FWRITE)) ||
		    (filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) ||
		    (filemode & (FAPPEND|FCREAT|FTRUNC|FXATTR|FXATTRDIROPEN)))
			return (set_errno(EINVAL));
	}

	if (startfd == AT_FDCWD) {
		/*
		 * Regular open()
		 */
		startvp = NULL;
	} else {
		/*
		 * We're here via openat()
		 */
		if (copyin(fname, &startchar, sizeof (char)))
			return (set_errno(EFAULT));

		/*
		 * if startchar is / then startfd is ignored
		 */
		if (startchar == '/')
			startvp = NULL;
		else {
			if ((startfp = getf(startfd)) == NULL)
				return (set_errno(EBADF));
			startvp = startfp->f_vnode;
			VN_HOLD(startvp);
			releasef(startfd);
		}
	}

	/*
	 * Handle __openattrdirat() requests
	 */
	if (filemode & FXATTRDIROPEN) {
		if (auditing && startvp != NULL)
			audit_setfsat_path(1);
		if (error = lookupnameat(fname, seg, FOLLOW,
		    NULLVPP, &vp, startvp))
			return (set_errno(error));
		if (startvp != NULL)
			VN_RELE(startvp);

		startvp = vp;
	}

	/*
	 * Do we need to go into extended attribute space?
	 */
	if (filemode & FXATTR) {
		if (startfd == AT_FDCWD) {
			if (copyin(fname, &startchar, sizeof (char)))
				return (set_errno(EFAULT));

			/*
			 * If startchar == '/' then no extended attributes
			 * are looked up.
			 */
			if (startchar == '/') {
				startvp = NULL;
			} else {
				mutex_enter(&p->p_lock);
				startvp = PTOU(p)->u_cdir;
				VN_HOLD(startvp);
				mutex_exit(&p->p_lock);
			}
		}

		/*
		 * Make sure we have a valid extended attribute request.
		 * We must either have a real fd or AT_FDCWD and a relative
		 * pathname.
		 */
		if (startvp == NULL) {
			goto noxattr;
		}
	}

	if (filemode & (FXATTR|FXATTRDIROPEN)) {
		vattr_t vattr;

		if (error = pn_get(fname, UIO_USERSPACE, &pn)) {
			goto out;
		}

		/*
		 * In order to access the hidden attribute directory, the
		 * user must be able to stat() the file.
		 */
		vattr.va_mask = AT_ALL;
		if (error = VOP_GETATTR(startvp, &vattr, 0, CRED(), NULL)) {
			pn_free(&pn);
			goto out;
		}

		if ((startvp->v_vfsp->vfs_flag & VFS_XATTR) != 0 ||
		    vfs_has_feature(startvp->v_vfsp, VFSFT_SYSATTR_VIEWS)) {
			error = VOP_LOOKUP(startvp, "", &sdvp, &pn,
			    (filemode & FXATTRDIROPEN) ? LOOKUP_XATTR :
			    LOOKUP_XATTR|CREATE_XATTR_DIR, rootvp, CRED(),
			    NULL, NULL, NULL);
		} else {
			error = EINVAL;
		}

		/*
		 * For __openattrdirat() use "." as filename to open
		 * as part of vn_openat()
		 */
		if (error == 0 && (filemode & FXATTRDIROPEN)) {
			open_filename = ".";
			seg = UIO_SYSSPACE;
		}

		pn_free(&pn);
		if (error != 0)
			goto out;

		VN_RELE(startvp);
		startvp = sdvp;
	}

noxattr:
	if ((filemode & (FREAD|FWRITE|FSEARCH|FEXEC|FXATTRDIROPEN)) != 0) {
		if ((filemode & (FNONBLOCK|FNDELAY)) == (FNONBLOCK|FNDELAY))
			filemode &= ~FNDELAY;
		error = falloc((vnode_t *)NULL, filemode, &fp, &fd);
		if (error == 0) {
			if (auditing && startvp != NULL)
				audit_setfsat_path(1);
			/*
			 * Last arg is a don't-care term if
			 * !(filemode & FCREAT).
			 */
			error = vn_openat(open_filename, seg, filemode,
			    (int)(createmode & MODEMASK),
			    &vp, CRCREAT, PTOU(curproc)->u_cmask,
			    startvp, fd);

			if (startvp != NULL)
				VN_RELE(startvp);
			if (error == 0) {
				if ((vp->v_flag & VDUP) == 0) {
					fp->f_vnode = vp;
					mutex_exit(&fp->f_tlock);
					/*
					 * We must now fill in the slot
					 * falloc reserved.
					 */
					setf(fd, fp);
					return (fd);
				} else {
					/*
					 * Special handling for /dev/fd.
					 * Give up the file pointer
					 * and dup the indicated file descriptor
					 * (in v_rdev). This is ugly, but I've
					 * seen worse.
					 */
					unfalloc(fp);
					dupfd = getminor(vp->v_rdev);
					type = vp->v_type;
					mutex_enter(&vp->v_lock);
					vp->v_flag &= ~VDUP;
					mutex_exit(&vp->v_lock);
					VN_RELE(vp);
					if (type != VCHR)
						return (set_errno(EINVAL));
					if ((fp = getf(dupfd)) == NULL) {
						setf(fd, NULL);
						return (set_errno(EBADF));
					}
					mutex_enter(&fp->f_tlock);
					fp->f_count++;
					mutex_exit(&fp->f_tlock);
					setf(fd, fp);
					releasef(dupfd);
				}
				return (fd);
			} else {
				setf(fd, NULL);
				unfalloc(fp);
				return (set_errno(error));
			}
		}
	} else {
		error = EINVAL;
	}
out:
	if (startvp != NULL)
		VN_RELE(startvp);
	return (set_errno(error));
}
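
The FXATTR and FXATTRDIROPEN paths above are what the libc extended-attribute interfaces ultimately reach; attropen(3C), for instance, opens a named attribute of a file. A hedged user-level sketch (file and attribute names are hypothetical):

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* opens the extended attribute "myattr" of file.txt */
	int fd = attropen("file.txt", "myattr", O_RDONLY);

	if (fd == -1) {
		perror("attropen");
		return (1);
	}
	(void) close(fd);
	return (0);
}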
Example 14
/*
 * setppriv (priv_op_t, priv_ptype_t, priv_set_t)
 */
static int
setppriv(priv_op_t op, priv_ptype_t type, priv_set_t *in_pset)
{
	priv_set_t	pset, *target;
	cred_t		*cr, *pcr;
	proc_t		*p;
	boolean_t	donocd = B_FALSE;

	if (!PRIV_VALIDSET(type) || !PRIV_VALIDOP(op))
		return (set_errno(EINVAL));

	if (copyin(in_pset, &pset, sizeof (priv_set_t)))
		return (set_errno(EFAULT));

	p = ttoproc(curthread);
	cr = cralloc();
	mutex_enter(&p->p_crlock);

retry:
	pcr = p->p_cred;

	if (AU_AUDITING())
		audit_setppriv(op, type, &pset, pcr);

	/*
	 * Filter out disallowed requests (bad op or bad type).
	 */
	switch (op) {
	case PRIV_ON:
	case PRIV_SET:
		/*
		 * Turning on privileges; the limit set cannot grow,
		 * other sets can but only as long as they remain subsets
		 * of P.  Only immediately after exec holds that P <= L.
		 */
		if (type == PRIV_LIMIT &&
		    !priv_issubset(&pset, &CR_LPRIV(pcr))) {
			mutex_exit(&p->p_crlock);
			crfree(cr);
			return (set_errno(EPERM));
		}
		if (!priv_issubset(&pset, &CR_OPPRIV(pcr)) &&
		    !priv_issubset(&pset, priv_getset(pcr, type))) {
			mutex_exit(&p->p_crlock);
			/* Policy override should not grow beyond L either */
			if (type != PRIV_INHERITABLE ||
			    !priv_issubset(&pset, &CR_LPRIV(pcr)) ||
			    secpolicy_require_privs(CRED(), &pset) != 0) {
				crfree(cr);
				return (set_errno(EPERM));
			}
			mutex_enter(&p->p_crlock);
			if (pcr != p->p_cred)
				goto retry;
			donocd = B_TRUE;
		}
		break;

	case PRIV_OFF:
		/* PRIV_OFF is always allowed */
		break;
	}

	/*
	 * OK! everything is cool.
	 * Do cred COW.
	 */
	crcopy_to(pcr, cr);

	/*
	 * If we change the effective, permitted or limit set, we attain
	 * "privilege awareness".
	 */
	if (type != PRIV_INHERITABLE)
		priv_set_PA(cr);

	target = &(CR_PRIVS(cr)->crprivs[type]);

	switch (op) {
	case PRIV_ON:
		priv_union(&pset, target);
		break;
	case PRIV_OFF:
		priv_inverse(&pset);
		priv_intersect(target, &pset);

		/*
		 * Fall-thru to set target and change other process
		 * privilege sets.
		 */
		/*FALLTHRU*/

	case PRIV_SET:
		*target = pset;

		/*
		 * Take privileges no longer permitted out
		 * of other effective sets as well.
		 * Limit set is enforced at exec() time.
		 */
		if (type == PRIV_PERMITTED)
			priv_intersect(&pset, &CR_EPRIV(cr));
		break;
	}

	/*
	 * When we give up privileges not in the inheritable set,
	 * set SNOCD if not already set; first we compute the
	 * privileges removed from P using Diff = (~P') & P
	 * and then we check whether the removed privileges are
	 * a subset of I.  If we retain uid 0, all privileges
	 * are required anyway so don't set SNOCD.
	 */
	if (type == PRIV_PERMITTED && (p->p_flag & SNOCD) == 0 &&
	    cr->cr_uid != 0 && cr->cr_ruid != 0 && cr->cr_suid != 0) {
		priv_set_t diff = CR_OPPRIV(cr);
		priv_inverse(&diff);
		priv_intersect(&CR_OPPRIV(pcr), &diff);
		donocd = !priv_issubset(&diff, &CR_IPRIV(cr));
	}

	p->p_cred = cr;
	mutex_exit(&p->p_crlock);

	if (donocd) {
		mutex_enter(&p->p_lock);
		p->p_flag |= SNOCD;
		mutex_exit(&p->p_lock);
	}

	/*
	 * The basic_test privilege should not be removed from E;
	 * if that has happened, it is typically because a programmer set
	 * E/P to the empty set, which is not portable.
	 */
	if ((type == PRIV_EFFECTIVE || type == PRIV_PERMITTED) &&
	    priv_basic_test >= 0 && !PRIV_ISASSERT(target, priv_basic_test)) {
		proc_t *p = curproc;
		pid_t pid = p->p_pid;
		char *fn = PTOU(p)->u_comm;

		cmn_err(CE_WARN, "%s[%d]: setppriv: basic_test privilege "
		    "removed from E/P", fn, pid);
	}

	crset(p, cr);		/* broadcast to process threads */

	return (0);
}
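
setppriv() above is the kernel half of the setppriv(2) system call; libc pairs it with allocation and manipulation routines for priv_set_t. A hedged sketch of dropping one privilege from E and P (Solaris privilege API; the chosen privilege is illustrative):

#include <priv.h>
#include <stdio.h>

int
main(void)
{
	priv_set_t *pset = priv_allocset();

	if (pset == NULL)
		return (1);
	priv_emptyset(pset);
	(void) priv_addset(pset, PRIV_PROC_FORK);	/* "proc_fork" */
	/* remove proc_fork from the effective and permitted sets */
	if (setppriv(PRIV_OFF, PRIV_EFFECTIVE, pset) != 0 ||
	    setppriv(PRIV_OFF, PRIV_PERMITTED, pset) != 0)
		perror("setppriv");
	priv_freeset(pset);
	return (0);
}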
Example 15
int
dogetcwd(char *buf, size_t buflen)
{
	int ret;
	vnode_t *vp;
	vnode_t *compvp;
	refstr_t *cwd, *oldcwd;
	const char *value;
	pathname_t rpnp, pnp;
	proc_t *p = curproc;

	/*
	 * Check to see if there is a cached version of the cwd.  If so, lookup
	 * the cached value and make sure it is the same vnode.
	 */
	mutex_enter(&p->p_lock);
	if ((cwd = PTOU(p)->u_cwd) != NULL)
		refstr_hold(cwd);
	vp = PTOU(p)->u_cdir;
	VN_HOLD(vp);
	mutex_exit(&p->p_lock);

	/*
	 * Make sure we have permission to access the current directory.
	 */
	if ((ret = VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) != 0) {
		if (cwd != NULL)
			refstr_rele(cwd);
		VN_RELE(vp);
		return (ret);
	}

	if (cwd) {
		value = refstr_value(cwd);
		if ((ret = pn_get((char *)value, UIO_SYSSPACE, &pnp)) != 0) {
			refstr_rele(cwd);
			VN_RELE(vp);
			return (ret);
		}

		pn_alloc(&rpnp);

		if (lookuppn(&pnp, &rpnp, NO_FOLLOW, NULL, &compvp) == 0) {

			if (VN_CMP(vp, compvp) &&
			    strcmp(value, rpnp.pn_path) == 0) {
				VN_RELE(compvp);
				VN_RELE(vp);
				pn_free(&pnp);
				pn_free(&rpnp);
				if (strlen(value) + 1 > buflen) {
					refstr_rele(cwd);
					return (ENAMETOOLONG);
				}
				bcopy(value, buf, strlen(value) + 1);
				refstr_rele(cwd);
				return (0);
			}

			VN_RELE(compvp);
		}

		pn_free(&rpnp);
		pn_free(&pnp);

		refstr_rele(cwd);
	}

	ret = vnodetopath_common(NULL, vp, buf, buflen, CRED(),
	    LOOKUP_CHECKREAD);

	VN_RELE(vp);

	/*
	 * Store the new cwd and replace the existing cached copy.
	 */
	if (ret == 0)
		cwd = refstr_alloc(buf);
	else
		cwd = NULL;

	mutex_enter(&p->p_lock);
	oldcwd = PTOU(p)->u_cwd;
	PTOU(p)->u_cwd = cwd;
	mutex_exit(&p->p_lock);

	if (oldcwd)
		refstr_rele(oldcwd);

	return (ret);
}
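
dogetcwd() is the kernel half of getcwd(3C); when the supplied buffer is too small the call fails, and POSIX getcwd() reports ERANGE. The usual user-level idiom retries with a doubled buffer:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 64;
	char *buf;

	for (;;) {
		if ((buf = malloc(len)) == NULL)
			return (1);
		if (getcwd(buf, len) != NULL)
			break;			/* success */
		free(buf);
		if (errno != ERANGE)		/* real error: give up */
			return (1);
		len *= 2;			/* too small: grow and retry */
	}
	(void) printf("%s\n", buf);
	free(buf);
	return (0);
}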
Example 16
int
kadmin(int cmd, int fcn, void *mdep, cred_t *credp)
{
	int error = 0;
	char *buf;
	size_t buflen = 0;
	boolean_t invoke_cb = B_FALSE;

	/*
	 * We might be called directly by the kernel's fault-handling code, so
	 * we can't assert that the caller is in the global zone.
	 */

	/*
	 * Make sure that cmd is one of the valid <sys/uadmin.h> command codes
	 * and that we have appropriate privileges for this action.
	 */
	switch (cmd) {
	case A_FTRACE:
	case A_SHUTDOWN:
	case A_REBOOT:
	case A_REMOUNT:
	case A_FREEZE:
	case A_DUMP:
	case A_SDTTEST:
	case A_CONFIG:
		if (secpolicy_sys_config(credp, B_FALSE) != 0)
			return (EPERM);
		break;

	default:
		return (EINVAL);
	}

	/*
	 * Serialize these operations on ualock.  If it is held, the
	 * system should shut down, reboot, or remount shortly, unless there is
	 * an error.  We need a cv rather than just a mutex because proper
	 * functioning of A_REBOOT relies on being able to interrupt blocked
	 * userland callers.
	 *
	 * We only clear ua_shutdown_thread after A_REMOUNT or A_CONFIG.
	 * Other commands should never return.
	 */
	if (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_REMOUNT ||
	    cmd == A_CONFIG) {
		mutex_enter(&ualock);
		while (ua_shutdown_thread != NULL) {
			if (cv_wait_sig(&uacond, &ualock) == 0) {
				/*
				 * If we were interrupted, leave, and handle
				 * the signal (or exit, depending on what
				 * happened)
				 */
				mutex_exit(&ualock);
				return (EINTR);
			}
		}
		ua_shutdown_thread = curthread;
		mutex_exit(&ualock);
	}

	switch (cmd) {
	case A_SHUTDOWN:
	{
		proc_t *p = ttoproc(curthread);

		/*
		 * Release (almost) all of our own resources if we are called
		 * from a user context, however if we are calling kadmin() from
		 * a kernel context then we do not release these resources.
		 */
		if (p != &p0) {
			proc_is_exiting(p);
			if ((error = exitlwps(0)) != 0) {
				/*
				 * Another thread in this process also called
				 * exitlwps().
				 */
				mutex_enter(&ualock);
				ua_shutdown_thread = NULL;
				cv_signal(&uacond);
				mutex_exit(&ualock);
				return (error);
			}
			mutex_enter(&p->p_lock);
			p->p_flag |= SNOWAIT;
			sigfillset(&p->p_ignore);
			curthread->t_lwp->lwp_cursig = 0;
			curthread->t_lwp->lwp_extsig = 0;
			if (p->p_exec) {
				vnode_t *exec_vp = p->p_exec;
				p->p_exec = NULLVP;
				mutex_exit(&p->p_lock);
				VN_RELE(exec_vp);
			} else {
				mutex_exit(&p->p_lock);
			}

			pollcleanup();
			closeall(P_FINFO(curproc));
			relvm();

		} else {
			/*
			 * Reset t_cred if not set because much of the
			 * filesystem code depends on CRED() being valid.
			 */
			if (curthread->t_cred == NULL)
				curthread->t_cred = kcred;
		}

		/* indicate shutdown in progress */
		sys_shutdown = 1;

		/*
		 * Communicate that init shouldn't be restarted.
		 */
		zone_shutdown_global();

		killall(ALL_ZONES);
		/*
		 * If we are calling kadmin() from a kernel context then we
		 * do not release these resources.
		 */
		if (ttoproc(curthread) != &p0) {
			VN_RELE(PTOU(curproc)->u_cdir);
			if (PTOU(curproc)->u_rdir)
				VN_RELE(PTOU(curproc)->u_rdir);
			if (PTOU(curproc)->u_cwd)
				refstr_rele(PTOU(curproc)->u_cwd);

			PTOU(curproc)->u_cdir = rootdir;
			PTOU(curproc)->u_rdir = NULL;
			PTOU(curproc)->u_cwd = NULL;
		}

		/*
		 * Allow the reboot/halt/poweroff code a chance to do
		 * anything it needs to whilst we still have filesystems
		 * mounted, like loading any modules necessary for later
		 * performing the actual poweroff.
		 */
		if ((mdep != NULL) && (*(char *)mdep == '/')) {
			buf = i_convert_boot_device_name(mdep, NULL, &buflen);
			mdpreboot(cmd, fcn, buf);
		} else
			mdpreboot(cmd, fcn, mdep);

		/*
		 * Allow fsflush to finish running and then prevent it
		 * from ever running again so that vfs_unmountall() and
		 * vfs_syncall() can acquire the vfs locks they need.
		 */
		sema_p(&fsflush_sema);
		(void) callb_execute_class(CB_CL_UADMIN_PRE_VFS, NULL);

		vfs_unmountall();
		(void) VFS_MOUNTROOT(rootvfs, ROOT_UNMOUNT);
		vfs_syncall();

		dump_ereports();
		dump_messages();

		invoke_cb = B_TRUE;

		/* FALLTHROUGH */
	}

	case A_REBOOT:
		if ((mdep != NULL) && (*(char *)mdep == '/')) {
			buf = i_convert_boot_device_name(mdep, NULL, &buflen);
			mdboot(cmd, fcn, buf, invoke_cb);
		} else
			mdboot(cmd, fcn, mdep, invoke_cb);
		/* no return expected */
		break;

	case A_CONFIG:
		switch (fcn) {
		case AD_UPDATE_BOOT_CONFIG:
#ifndef	__sparc
		{
			extern void fastboot_update_config(const char *);

			fastboot_update_config(mdep);
		}
#endif

			break;
		}
		/* Let other threads enter the shutdown path now */
		mutex_enter(&ualock);
		ua_shutdown_thread = NULL;
		cv_signal(&uacond);
		mutex_exit(&ualock);
		break;

	case A_REMOUNT:
		(void) VFS_MOUNTROOT(rootvfs, ROOT_REMOUNT);
		/* Let other threads enter the shutdown path now */
		mutex_enter(&ualock);
		ua_shutdown_thread = NULL;
		cv_signal(&uacond);
		mutex_exit(&ualock);
		break;

	case A_FREEZE:
	{
		/*
		 * This is the entrypoint for all suspend/resume actions.
		 */
		extern int cpr(int, void *);

		if (modload("misc", "cpr") == -1)
			return (ENOTSUP);
		/* Let the CPR module decide what to do with mdep */
		error = cpr(fcn, mdep);
		break;
	}

	case A_FTRACE:
	{
		switch (fcn) {
		case AD_FTRACE_START:
			(void) FTRACE_START();
			break;
		case AD_FTRACE_STOP:
			(void) FTRACE_STOP();
			break;
		default:
			error = EINVAL;
		}
		break;
	}

	case A_DUMP:
	{
		if (fcn == AD_NOSYNC) {
			in_sync = 1;
			break;
		}

		panic_bootfcn = fcn;
		panic_forced = 1;

		if ((mdep != NULL) && (*(char *)mdep == '/')) {
			panic_bootstr = i_convert_boot_device_name(mdep,
			    NULL, &buflen);
		} else
			panic_bootstr = mdep;

#ifndef	__sparc
		extern void fastboot_update_and_load(int, char *);

		fastboot_update_and_load(fcn, mdep);
#endif

		panic("forced crash dump initiated at user request");
		/*NOTREACHED*/
	}

	case A_SDTTEST:
	{
		DTRACE_PROBE7(test, int, 1, int, 2, int, 3, int, 4, int, 5,
		    int, 6, int, 7);
		break;
	}

	default:
		error = EINVAL;
	}

	return (error);
}
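
kadmin() is reached from the uadmin(2) system call, with the A_* and AD_* codes coming from <sys/uadmin.h>. A hedged sketch of the user-level entry point (requires the privilege checked by secpolicy_sys_config() above):

#include <sys/uadmin.h>
#include <stdio.h>

int
main(void)
{
	/* request a clean shutdown followed by power-off */
	if (uadmin(A_SHUTDOWN, AD_POWEROFF, 0) == -1) {
		perror("uadmin");
		return (1);
	}
	return (0);	/* not reached if the shutdown proceeds */
}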
Example 17
/*
 * Remove zombie children from the process table.
 */
void
freeproc(proc_t *p)
{
	proc_t *q;
	task_t *tk;

	ASSERT(p->p_stat == SZOMB);
	ASSERT(p->p_tlist == NULL);
	ASSERT(MUTEX_HELD(&pidlock));

	sigdelq(p, NULL, 0);
	if (p->p_killsqp) {
		siginfofree(p->p_killsqp);
		p->p_killsqp = NULL;
	}

	prfree(p);	/* inform /proc */

	/*
	 * Don't free the init process.
	 * Other dying processes will access it.
	 */
	if (p == proc_init)
		return;


	/*
	 * We wait until now to free the cred structure because a
	 * zombie process's credentials may be examined by /proc.
	 * No cred locking needed because there are no threads at this point.
	 */
	upcount_dec(crgetruid(p->p_cred), crgetzoneid(p->p_cred));
	crfree(p->p_cred);
	if (p->p_corefile != NULL) {
		corectl_path_rele(p->p_corefile);
		p->p_corefile = NULL;
	}
	if (p->p_content != NULL) {
		corectl_content_rele(p->p_content);
		p->p_content = NULL;
	}

	if (p->p_nextofkin && !((p->p_nextofkin->p_flag & SNOWAIT) ||
	    (PTOU(p->p_nextofkin)->u_signal[SIGCLD - 1] == SIG_IGN))) {
		/*
		 * This still does the right thing: p_utime and p_stime are
		 * set to their final values on process exit, so the parent's
		 * totals end up correct.
		 */
		p->p_nextofkin->p_cutime += p->p_utime;
		p->p_nextofkin->p_cstime += p->p_stime;

		p->p_nextofkin->p_cacct[LMS_USER] += p->p_acct[LMS_USER];
		p->p_nextofkin->p_cacct[LMS_SYSTEM] += p->p_acct[LMS_SYSTEM];
		p->p_nextofkin->p_cacct[LMS_TRAP] += p->p_acct[LMS_TRAP];
		p->p_nextofkin->p_cacct[LMS_TFAULT] += p->p_acct[LMS_TFAULT];
		p->p_nextofkin->p_cacct[LMS_DFAULT] += p->p_acct[LMS_DFAULT];
		p->p_nextofkin->p_cacct[LMS_KFAULT] += p->p_acct[LMS_KFAULT];
		p->p_nextofkin->p_cacct[LMS_USER_LOCK]
		    += p->p_acct[LMS_USER_LOCK];
		p->p_nextofkin->p_cacct[LMS_SLEEP] += p->p_acct[LMS_SLEEP];
		p->p_nextofkin->p_cacct[LMS_WAIT_CPU]
		    += p->p_acct[LMS_WAIT_CPU];
		p->p_nextofkin->p_cacct[LMS_STOPPED] += p->p_acct[LMS_STOPPED];

		p->p_nextofkin->p_cru.minflt	+= p->p_ru.minflt;
		p->p_nextofkin->p_cru.majflt	+= p->p_ru.majflt;
		p->p_nextofkin->p_cru.nswap	+= p->p_ru.nswap;
		p->p_nextofkin->p_cru.inblock	+= p->p_ru.inblock;
		p->p_nextofkin->p_cru.oublock	+= p->p_ru.oublock;
		p->p_nextofkin->p_cru.msgsnd	+= p->p_ru.msgsnd;
		p->p_nextofkin->p_cru.msgrcv	+= p->p_ru.msgrcv;
		p->p_nextofkin->p_cru.nsignals	+= p->p_ru.nsignals;
		p->p_nextofkin->p_cru.nvcsw	+= p->p_ru.nvcsw;
		p->p_nextofkin->p_cru.nivcsw	+= p->p_ru.nivcsw;
		p->p_nextofkin->p_cru.sysc	+= p->p_ru.sysc;
		p->p_nextofkin->p_cru.ioch	+= p->p_ru.ioch;

	}

	q = p->p_nextofkin;
	if (q && q->p_orphan == p)
		q->p_orphan = p->p_nextorph;
	else if (q) {
		for (q = q->p_orphan; q; q = q->p_nextorph)
			if (q->p_nextorph == p)
				break;
		ASSERT(q && q->p_nextorph == p);
		q->p_nextorph = p->p_nextorph;
	}

	/*
	 * The process table slot is being freed, so it is now safe to give up
	 * task and project membership.
	 */
	mutex_enter(&p->p_lock);
	tk = p->p_task;
	task_detach(p);
	mutex_exit(&p->p_lock);

	proc_detach(p);
	pid_exit(p, tk);	/* frees pid and proc structure */

	task_rele(tk);
}
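freeproc() folds the dead child's p_acct and p_ru counters into the next of kin's p_cacct and p_cru. At user level, roughly the same accumulated-children counters surface through getrusage(3C) with RUSAGE_CHILDREN; a small illustrative sketch:

#include <sys/resource.h>
#include <stdio.h>

void
print_child_usage(void)
{
	struct rusage ru;

	/*
	 * RUSAGE_CHILDREN reports totals accumulated from waited-for
	 * children -- the counterpart of the p_cru accumulation done
	 * by freeproc() above.
	 */
	if (getrusage(RUSAGE_CHILDREN, &ru) == 0) {
		(void) printf("children: minflt=%ld majflt=%ld nvcsw=%ld\n",
		    ru.ru_minflt, ru.ru_majflt, ru.ru_nvcsw);
	}
}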
Example 18
int
sigaction(int sig, struct sigaction *actp, struct sigaction *oactp)
{
	struct sigaction act;
	struct sigaction oact;
	k_sigset_t set;
	proc_t *p;
	user_t *ua;
	int sigcld_look = 0;

	if (sig <= 0 || sig >= NSIG ||
	    (actp != NULL && sigismember(&cantmask, sig)))
		return (set_errno(EINVAL));

	/*
	 * act and oact might be the same address, so copyin act first.
	 */
	if (actp) {
#if defined(__sparc)
		void (*handler)();
#endif
		if (copyin(actp, &act, sizeof (act)))
			return (set_errno(EFAULT));
#if defined(__sparc)
		/*
		 * Check alignment of handler
		 */
		handler = act.sa_handler;
		if (handler != SIG_IGN && handler != SIG_DFL &&
		    ((uintptr_t)handler & 0x3) != 0)
			return (set_errno(EINVAL));
#endif
	}

	p = curproc;
	ua = PTOU(p);
	mutex_enter(&p->p_lock);

	if (oactp) {
		int flags;
		void (*disp)();

		disp = ua->u_signal[sig - 1];

		flags = 0;
		if (disp != SIG_DFL && disp != SIG_IGN) {
			set = ua->u_sigmask[sig - 1];
			if (sigismember(&p->p_siginfo, sig))
				flags |= SA_SIGINFO;
			if (sigismember(&ua->u_sigrestart, sig))
				flags |= SA_RESTART;
			if (sigismember(&ua->u_sigonstack, sig))
				flags |= SA_ONSTACK;
			if (sigismember(&ua->u_sigresethand, sig))
				flags |= SA_RESETHAND;
			if (sigismember(&ua->u_signodefer, sig))
				flags |= SA_NODEFER;
		} else
			sigemptyset(&set);

		if (sig == SIGCLD) {
			if (p->p_flag & SNOWAIT)
				flags |= SA_NOCLDWAIT;
			if (!(p->p_flag & SJCTL))
				flags |= SA_NOCLDSTOP;
		}

		oact.sa_handler = disp;
		oact.sa_flags = flags;
		sigktou(&set, &oact.sa_mask);
	}

	if (actp) {
		if (sig == SIGCLD)
			sigcld_look = 1;
		sigutok(&act.sa_mask, &set);
		setsigact(sig, act.sa_handler, &set, act.sa_flags);
	}

	mutex_exit(&p->p_lock);

	if (sigcld_look)
		sigcld_repost();

	if (oactp &&
	    copyout(&oact, oactp, sizeof (oact)))
		return (set_errno(EFAULT));

	return (0);
}
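For reference, a minimal user-level counterpart of the routine above: installing a SIGINT handler with sigaction(2), using SA_SIGINFO and SA_RESTART (the flags the kernel code maps to p_siginfo and u_sigrestart):

#include <signal.h>

static void
on_int(int sig, siginfo_t *sip, void *ucp)
{
	/* Only async-signal-safe work belongs in a real handler. */
	(void) sig;
	(void) sip;
	(void) ucp;
}

int
install_handler(void)
{
	struct sigaction sa;

	sa.sa_sigaction = on_int;
	sa.sa_flags = SA_SIGINFO | SA_RESTART;
	(void) sigemptyset(&sa.sa_mask);
	return (sigaction(SIGINT, &sa, NULL));
}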
Example 19
/*
 * Called by proc_exit() when a zone's init exits, presumably because
 * it failed.  As long as the given zone is still in the "running"
 * state, we will re-exec() init, but first we need to reset things
 * which are usually inherited across exec() but will break init's
 * assumption that it is being exec()'d from a virgin process.  Most
 * importantly this includes closing all file descriptors (exec only
 * closes those marked close-on-exec) and resetting signals (exec only
 * resets handled signals, and we need to clear any signals which
 * killed init).  Anything else that exec(2) says would be inherited,
 * but would affect the execution of init, needs to be reset.
 */
static int
restart_init(int what, int why)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	user_t *up = PTOU(p);

	vnode_t *oldcd, *oldrd;
	int i, err;
	char reason_buf[64];

	/*
	 * Let zone admin (and global zone admin if this is for a non-global
	 * zone) know that init has failed and will be restarted.
	 */
	zcmn_err(p->p_zone->zone_id, CE_WARN,
	    "init(1M) %s: restarting automatically",
	    exit_reason(reason_buf, sizeof (reason_buf), what, why));

	if (!INGLOBALZONE(p)) {
		cmn_err(CE_WARN, "init(1M) for zone %s (pid %d) %s: "
		    "restarting automatically",
		    p->p_zone->zone_name, p->p_pid, reason_buf);
	}

	/*
	 * Remove any fpollinfo_t's for this (last) thread from our file
	 * descriptors so closeall() can ASSERT() that they're all gone.
	 * Then close all open file descriptors in the process.
	 */
	pollcleanup();
	closeall(P_FINFO(p));

	/*
	 * Grab p_lock and begin clearing miscellaneous global process
	 * state that needs to be reset before we exec the new init(1M).
	 */
	mutex_enter(&p->p_lock);
	prbarrier(p);

	p->p_flag &= ~(SKILLED | SEXTKILLED | SEXITING | SDOCORE);
	up->u_cmask = CMASK;

	sigemptyset(&t->t_hold);
	sigemptyset(&t->t_sig);
	sigemptyset(&t->t_extsig);

	sigemptyset(&p->p_sig);
	sigemptyset(&p->p_extsig);

	sigdelq(p, t, 0);
	sigdelq(p, NULL, 0);

	if (p->p_killsqp) {
		siginfofree(p->p_killsqp);
		p->p_killsqp = NULL;
	}

	/*
	 * Reset any signals that are ignored back to the default disposition.
	 * Other u_signal members will be cleared when exec calls sigdefault().
	 */
	for (i = 1; i < NSIG; i++) {
		if (up->u_signal[i - 1] == SIG_IGN) {
			up->u_signal[i - 1] = SIG_DFL;
			sigemptyset(&up->u_sigmask[i - 1]);
		}
	}

	/*
	 * Clear the current signal, any signal info associated with it, and
	 * any signal information from contracts and/or contract templates.
	 */
	lwp->lwp_cursig = 0;
	lwp->lwp_extsig = 0;
	if (lwp->lwp_curinfo != NULL) {
		siginfofree(lwp->lwp_curinfo);
		lwp->lwp_curinfo = NULL;
	}
	lwp_ctmpl_clear(lwp);

	/*
	 * Reset both the process root directory and the current working
	 * directory to the root of the zone just as we do during boot.
	 */
	VN_HOLD(p->p_zone->zone_rootvp);
	oldrd = up->u_rdir;
	up->u_rdir = p->p_zone->zone_rootvp;

	VN_HOLD(p->p_zone->zone_rootvp);
	oldcd = up->u_cdir;
	up->u_cdir = p->p_zone->zone_rootvp;

	if (up->u_cwd != NULL) {
		refstr_rele(up->u_cwd);
		up->u_cwd = NULL;
	}

	mutex_exit(&p->p_lock);

	if (oldrd != NULL)
		VN_RELE(oldrd);
	if (oldcd != NULL)
		VN_RELE(oldcd);

	/* Free the controlling tty.  (freectty() always assumes curproc.) */
	ASSERT(p == curproc);
	(void) freectty(B_TRUE);

	/*
	 * Now exec() the new init(1M) on top of the current process.  If we
	 * succeed, the caller will treat this like a successful system call.
	 * If we fail, we issue messages and the caller will proceed with exit.
	 */
	err = exec_init(p->p_zone->zone_initname, NULL);

	if (err == 0)
		return (0);

	zcmn_err(p->p_zone->zone_id, CE_WARN,
	    "failed to restart init(1M) (err=%d): system reboot required", err);

	if (!INGLOBALZONE(p)) {
		cmn_err(CE_WARN, "failed to restart init(1M) for zone %s "
		    "(pid %d, err=%d): zoneadm(1M) boot required",
		    p->p_zone->zone_name, p->p_pid, err);
	}

	return (-1);
}
Esempio n. 20
0
int
sendsig(int sig, k_siginfo_t *sip, void (*hdlr)())
{
	volatile int minstacksz;
	int newstack;
	label_t ljb;
	volatile caddr_t sp;
	caddr_t fp;
	struct regs *rp;
	volatile greg_t upc;
	volatile proc_t *p = ttoproc(curthread);
	klwp_t *lwp = ttolwp(curthread);
	ucontext_t *volatile tuc = NULL;
	ucontext_t *uc;
	siginfo_t *sip_addr;
	volatile int watched;

	rp = lwptoregs(lwp);
	upc = rp->r_pc;

	minstacksz = SA(sizeof (struct sigframe)) + SA(sizeof (*uc));
	if (sip != NULL)
		minstacksz += SA(sizeof (siginfo_t));
	ASSERT((minstacksz & (STACK_ALIGN - 1ul)) == 0);

	/*
	 * Figure out whether we will be handling this signal on
	 * an alternate stack specified by the user. Then allocate
	 * and validate the stack requirements for the signal handler
	 * context. on_fault will catch any faults.
	 */
	newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) &&
	    !(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE));

	if (newstack) {
		fp = (caddr_t)(SA((uintptr_t)lwp->lwp_sigaltstack.ss_sp) +
		    SA(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN);
	} else if ((rp->r_ss & 0xffff) != UDS_SEL) {
		user_desc_t *ldt;
		/*
		 * If the stack segment selector is -not- pointing at
		 * the UDS_SEL descriptor and we have an LDT entry for
		 * it instead, add the base address to find the effective va.
		 */
		if ((ldt = p->p_ldt) != NULL)
			fp = (caddr_t)rp->r_sp +
			    USEGD_GETBASE(&ldt[SELTOIDX(rp->r_ss)]);
		else
			fp = (caddr_t)rp->r_sp;
	} else
		fp = (caddr_t)rp->r_sp;

	/*
	 * Force proper stack pointer alignment, even in the face of a
	 * misaligned stack pointer from user-level before the signal.
	 * Don't use the SA() macro because that rounds up, not down.
	 */
	fp = (caddr_t)((uintptr_t)fp & ~(STACK_ALIGN - 1ul));
	sp = fp - minstacksz;

	/*
	 * Make sure lwp hasn't trashed its stack.
	 */
	if (sp >= (caddr_t)USERLIMIT || fp >= (caddr_t)USERLIMIT) {
#ifdef DEBUG
		printf("sendsig: bad signal stack cmd=%s, pid=%d, sig=%d\n",
		    PTOU(p)->u_comm, p->p_pid, sig);
		printf("sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n",
		    (void *)sp, (void *)hdlr, (uintptr_t)upc);
		printf("sp above USERLIMIT\n");
#endif
		return (0);
	}

	watched = watch_disable_addr((caddr_t)sp, minstacksz, S_WRITE);

	if (on_fault(&ljb))
		goto badstack;

	if (sip != NULL) {
		zoneid_t zoneid;

		fp -= SA(sizeof (siginfo_t));
		uzero(fp, sizeof (siginfo_t));
		if (SI_FROMUSER(sip) &&
		    (zoneid = p->p_zone->zone_id) != GLOBAL_ZONEID &&
		    zoneid != sip->si_zoneid) {
			k_siginfo_t sani_sip = *sip;

			sani_sip.si_pid = p->p_zone->zone_zsched->p_pid;
			sani_sip.si_uid = 0;
			sani_sip.si_ctid = -1;
			sani_sip.si_zoneid = zoneid;
			copyout_noerr(&sani_sip, fp, sizeof (sani_sip));
		} else
			copyout_noerr(sip, fp, sizeof (*sip));
		sip_addr = (siginfo_t *)fp;

		if (sig == SIGPROF &&
		    curthread->t_rprof != NULL &&
		    curthread->t_rprof->rp_anystate) {
			/*
			 * We stand on our head to deal with
			 * the real time profiling signal.
			 * Fill in the stuff that doesn't fit
			 * in a normal k_siginfo structure.
			 */
			int i = sip->si_nsysarg;

			while (--i >= 0)
				suword32_noerr(&(sip_addr->si_sysarg[i]),
				    (uint32_t)lwp->lwp_arg[i]);
			copyout_noerr(curthread->t_rprof->rp_state,
			    sip_addr->si_mstate,
			    sizeof (curthread->t_rprof->rp_state));
		}
	} else
		sip_addr = NULL;

	/* save the current context on the user stack */
	fp -= SA(sizeof (*tuc));
	uc = (ucontext_t *)fp;
	tuc = kmem_alloc(sizeof (*tuc), KM_SLEEP);
	savecontext(tuc, &lwp->lwp_sigoldmask);
	copyout_noerr(tuc, uc, sizeof (*tuc));
	kmem_free(tuc, sizeof (*tuc));
	tuc = NULL;

	lwp->lwp_oldcontext = (uintptr_t)uc;

	if (newstack) {
		lwp->lwp_sigaltstack.ss_flags |= SS_ONSTACK;
		if (lwp->lwp_ustack)
			copyout_noerr(&lwp->lwp_sigaltstack,
			    (stack_t *)lwp->lwp_ustack, sizeof (stack_t));
	}

	/*
	 * Set up signal handler arguments
	 */
	{
		struct sigframe frame;

		frame.sip = sip_addr;
		frame.ucp = uc;
		frame.signo = sig;
		frame.retaddr = (void (*)())0xffffffff;	/* never return! */
		copyout_noerr(&frame, sp, sizeof (frame));
	}

	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE);

	rp->r_sp = (greg_t)sp;
	rp->r_pc = (greg_t)hdlr;
	rp->r_ps = PSL_USER | (rp->r_ps & PS_IOPL);

	if ((rp->r_cs & 0xffff) != UCS_SEL ||
	    (rp->r_ss & 0xffff) != UDS_SEL) {
		rp->r_cs = UCS_SEL;
		rp->r_ss = UDS_SEL;
	}

	/*
	 * Don't set lwp_eosys here.  sendsig() is called via psig() after
	 * lwp_eosys is handled, so setting it here would affect the next
	 * system call.
	 */
	return (1);

badstack:
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE);
	if (tuc)
		kmem_free(tuc, sizeof (*tuc));
#ifdef DEBUG
	printf("sendsig: bad signal stack cmd=%s, pid=%d, sig=%d\n",
	    PTOU(p)->u_comm, p->p_pid, sig);
	printf("on fault, sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n",
	    (void *)sp, (void *)hdlr, (uintptr_t)upc);
#endif
	return (0);
}
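The newstack computation above fires only when the handler was installed with SA_ONSTACK and the lwp has an alternate stack that is neither in use nor disabled. A minimal user-level sketch that sets up that path with sigaltstack(2):

#include <signal.h>
#include <stdlib.h>

int
use_altstack(void (*handler)(int))
{
	stack_t ss;
	struct sigaction sa;

	ss.ss_sp = malloc(SIGSTKSZ);
	ss.ss_size = SIGSTKSZ;
	ss.ss_flags = 0;
	if (ss.ss_sp == NULL || sigaltstack(&ss, NULL) == -1)
		return (-1);

	sa.sa_handler = handler;
	sa.sa_flags = SA_ONSTACK;	/* selects the u_sigonstack path */
	(void) sigemptyset(&sa.sa_mask);
	return (sigaction(SIGSEGV, &sa, NULL));
}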
Example 21
/*
 * Return value:
 *   1 - exitlwps() failed, call (or continue) lwp_exit()
 *   0 - restarting init.  Return through system call path
 */
int
proc_exit(int why, int what)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	zone_t *z = p->p_zone;
	timeout_id_t tmp_id;
	int rv;
	proc_t *q;
	task_t *tk;
	vnode_t *exec_vp, *execdir_vp, *cdir, *rdir;
	sigqueue_t *sqp;
	lwpdir_t *lwpdir;
	uint_t lwpdir_sz;
	tidhash_t *tidhash;
	uint_t tidhash_sz;
	ret_tidhash_t *ret_tidhash;
	refstr_t *cwd;
	hrtime_t hrutime, hrstime;
	int evaporate;

	/*
	 * Stop and discard the process's lwps except for the current one,
	 * unless some other lwp beat us to it.  If exitlwps() fails then
	 * return and the calling lwp will call (or continue in) lwp_exit().
	 */
	proc_is_exiting(p);
	if (exitlwps(0) != 0)
		return (1);

	mutex_enter(&p->p_lock);
	if (p->p_ttime > 0) {
		/*
		 * Account any remaining ticks charged to this process
		 * on its way out.
		 */
		(void) task_cpu_time_incr(p->p_task, p->p_ttime);
		p->p_ttime = 0;
	}
	mutex_exit(&p->p_lock);

	DTRACE_PROC(lwp__exit);
	DTRACE_PROC1(exit, int, why);

	/*
	 * Perform any brand-specific proc exit processing.  Since this is
	 * always the last lwp, this will also perform lwp_exit and free the
	 * brand data.
	 */
	if (PROC_IS_BRANDED(p)) {
		lwp_detach_brand_hdlrs(lwp);
		brand_clearbrand(p, B_FALSE);
	}

	/*
	 * Don't let init exit unless zone_start_init() failed its exec, or
	 * we are shutting down the zone or the machine.
	 *
	 * Since we are single threaded, we don't need to lock the
	 * following accesses to zone_proc_initpid.
	 */
	if (p->p_pid == z->zone_proc_initpid) {
		if (z->zone_boot_err == 0 &&
		    zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
		    zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN &&
		    z->zone_restart_init == B_TRUE &&
		    restart_init(what, why) == 0)
			return (0);
		/*
		 * Since we didn't or couldn't restart init, we clear
		 * the zone's init state and proceed with exit
		 * processing.
		 */
		z->zone_proc_initpid = -1;
	}

	lwp_pcb_exit();

	/*
	 * Allocate a sigqueue now, before we grab locks.
	 * It will be given to sigcld(), below.
	 * Special case:  If we will be making the process disappear
	 * without a trace because it is either:
	 *	* an exiting SSYS process, or
	 *	* a posix_spawn() vfork child who requests it,
	 * we don't bother to allocate a useless sigqueue.
	 */
	evaporate = (p->p_flag & SSYS) || ((p->p_flag & SVFORK) &&
	    why == CLD_EXITED && what == _EVAPORATE);
	if (!evaporate)
		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);

	/*
	 * revoke any doors created by the process.
	 */
	if (p->p_door_list)
		door_exit();

	/*
	 * Release schedctl data structures.
	 */
	if (p->p_pagep)
		schedctl_proc_cleanup();

	/*
	 * make sure all pending kaio has completed.
	 */
	if (p->p_aio)
		aio_cleanup_exit();

	/*
	 * discard the lwpchan cache.
	 */
	if (p->p_lcp != NULL)
		lwpchan_destroy_cache(0);

	/*
	 * Clean up any DTrace helper actions or probes for the process.
	 */
	if (p->p_dtrace_helpers != NULL) {
		ASSERT(dtrace_helpers_cleanup != NULL);
		(*dtrace_helpers_cleanup)();
	}

	/* untimeout the realtime timers */
	if (p->p_itimer != NULL)
		timer_exit();

	if ((tmp_id = p->p_alarmid) != 0) {
		p->p_alarmid = 0;
		(void) untimeout(tmp_id);
	}

	/*
	 * Remove any fpollinfo_t's for this (last) thread from our file
	 * descriptors so closeall() can ASSERT() that they're all gone.
	 */
	pollcleanup();

	if (p->p_rprof_cyclic != CYCLIC_NONE) {
		mutex_enter(&cpu_lock);
		cyclic_remove(p->p_rprof_cyclic);
		mutex_exit(&cpu_lock);
	}

	mutex_enter(&p->p_lock);

	/*
	 * Clean up any DTrace probes associated with this process.
	 */
	if (p->p_dtrace_probes) {
		ASSERT(dtrace_fasttrap_exit_ptr != NULL);
		dtrace_fasttrap_exit_ptr(p);
	}

	while ((tmp_id = p->p_itimerid) != 0) {
		p->p_itimerid = 0;
		mutex_exit(&p->p_lock);
		(void) untimeout(tmp_id);
		mutex_enter(&p->p_lock);
	}

	lwp_cleanup();

	/*
	 * We are about to exit; prevent our resource associations from
	 * being changed.
	 */
	pool_barrier_enter();

	/*
	 * Block the process against /proc now that we have really
	 * acquired p->p_lock (to manipulate p_tlist at least).
	 */
	prbarrier(p);

	sigfillset(&p->p_ignore);
	sigemptyset(&p->p_siginfo);
	sigemptyset(&p->p_sig);
	sigemptyset(&p->p_extsig);
	sigemptyset(&t->t_sig);
	sigemptyset(&t->t_extsig);
	sigemptyset(&p->p_sigmask);
	sigdelq(p, t, 0);
	lwp->lwp_cursig = 0;
	lwp->lwp_extsig = 0;
	p->p_flag &= ~(SKILLED | SEXTKILLED);
	if (lwp->lwp_curinfo) {
		siginfofree(lwp->lwp_curinfo);
		lwp->lwp_curinfo = NULL;
	}

	t->t_proc_flag |= TP_LWPEXIT;
	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
	prlwpexit(t);		/* notify /proc */
	lwp_hash_out(p, t->t_tid);
	prexit(p);

	p->p_lwpcnt = 0;
	p->p_tlist = NULL;
	sigqfree(p);
	term_mstate(t);
	p->p_mterm = gethrtime();

	exec_vp = p->p_exec;
	execdir_vp = p->p_execdir;
	p->p_exec = NULLVP;
	p->p_execdir = NULLVP;
	mutex_exit(&p->p_lock);

	pr_free_watched_pages(p);

	closeall(P_FINFO(p));

	/* Free the controlling tty.  (freectty() always assumes curproc.) */
	ASSERT(p == curproc);
	(void) freectty(B_TRUE);

#if defined(__sparc)
	if (p->p_utraps != NULL)
		utrap_free(p);
#endif
	if (p->p_semacct)			/* IPC semaphore exit */
		semexit(p);
	rv = wstat(why, what);

	acct(rv & 0xff);
	exacct_commit_proc(p, rv);

	/*
	 * Release any resources associated with C2 auditing
	 */
	if (AU_AUDITING()) {
		/*
		 * audit exit system call
		 */
		audit_exit(why, what);
	}

	/*
	 * Free address space.
	 */
	relvm();

	if (exec_vp) {
		/*
		 * Close this executable which has been opened when the process
		 * was created by getproc().
		 */
		(void) VOP_CLOSE(exec_vp, FREAD, 1, (offset_t)0, CRED(), NULL);
		VN_RELE(exec_vp);
	}
	if (execdir_vp)
		VN_RELE(execdir_vp);

	/*
	 * Release held contracts.
	 */
	contract_exit(p);

	/*
	 * Depart our encapsulating process contract.
	 */
	if ((p->p_flag & SSYS) == 0) {
		ASSERT(p->p_ct_process);
		contract_process_exit(p->p_ct_process, p, rv);
	}

	/*
	 * Remove pool association, and block if requested by pool_do_bind.
	 */
	mutex_enter(&p->p_lock);
	ASSERT(p->p_pool->pool_ref > 0);
	atomic_add_32(&p->p_pool->pool_ref, -1);
	p->p_pool = pool_default;
	/*
	 * Now that our address space has been freed and all other threads
	 * in this process have exited, set the PEXITED pool flag.  This
	 * tells the pools subsystem to ignore this process if a request
	 * was made to rebind it to a new pool.
	 */
	p->p_poolflag |= PEXITED;
	pool_barrier_exit();
	mutex_exit(&p->p_lock);

	mutex_enter(&pidlock);

	/*
	 * Delete this process from its parent's newstate list.  We will
	 * put it in the right place in sigcld() at the end.
	 */
	delete_ns(p->p_parent, p);

	/*
	 * Reassign the orphans to the next of kin.
	 * Don't rearrange init's orphanage.
	 */
	if ((q = p->p_orphan) != NULL && p != proc_init) {

		proc_t *nokp = p->p_nextofkin;

		for (;;) {
			q->p_nextofkin = nokp;
			if (q->p_nextorph == NULL)
				break;
			q = q->p_nextorph;
		}
		q->p_nextorph = nokp->p_orphan;
		nokp->p_orphan = p->p_orphan;
		p->p_orphan = NULL;
	}

	/*
	 * Reassign the children to init.
	 * Don't try to assign init's children to init.
	 */
	if ((q = p->p_child) != NULL && p != proc_init) {
		struct proc	*np;
		struct proc	*initp = proc_init;
		boolean_t	setzonetop = B_FALSE;

		if (!INGLOBALZONE(curproc))
			setzonetop = B_TRUE;

		pgdetach(p);

		do {
			np = q->p_sibling;
			/*
			 * Delete it from its current parent new state
			 * list and add it to init new state list
			 */
			delete_ns(q->p_parent, q);

			q->p_ppid = 1;
			q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);
			if (setzonetop) {
				mutex_enter(&q->p_lock);
				q->p_flag |= SZONETOP;
				mutex_exit(&q->p_lock);
			}
			q->p_parent = initp;

			/*
			 * Since q will be the first child,
			 * it will not have a previous sibling.
			 */
			q->p_psibling = NULL;
			if (initp->p_child) {
				initp->p_child->p_psibling = q;
			}
			q->p_sibling = initp->p_child;
			initp->p_child = q;
			if (q->p_proc_flag & P_PR_PTRACE) {
				mutex_enter(&q->p_lock);
				sigtoproc(q, NULL, SIGKILL);
				mutex_exit(&q->p_lock);
			}
			/*
			 * sigcld() will add the child to the parent's
			 * newstate list.
			 */
			if (q->p_stat == SZOMB)
				sigcld(q, NULL);
		} while ((q = np) != NULL);

		p->p_child = NULL;
		ASSERT(p->p_child_ns == NULL);
	}

	TRACE_1(TR_FAC_PROC, TR_PROC_EXIT, "proc_exit: %p", p);

	mutex_enter(&p->p_lock);
	CL_EXIT(curthread); /* tell the scheduler that curthread is exiting */

	/*
	 * Have our task accumulate our resource usage data before they
	 * become contaminated by p_cacct etc., and before we renounce
	 * membership of the task.
	 *
	 * We do this regardless of whether or not task accounting is active.
	 * This is to avoid having nonsense data reported for this task if
	 * task accounting is subsequently enabled. The overhead is minimal;
	 * by this point, this process has accounted for the usage of all its
	 * LWPs. We nonetheless do the work here, and under the protection of
	 * pidlock, so that the movement of the process's usage to the task
	 * happens at the same time as the removal of the process from the
	 * task, from the point of view of exacct_snapshot_task_usage().
	 */
	exacct_update_task_mstate(p);

	hrutime = mstate_aggr_state(p, LMS_USER);
	hrstime = mstate_aggr_state(p, LMS_SYSTEM);
	p->p_utime = (clock_t)NSEC_TO_TICK(hrutime) + p->p_cutime;
	p->p_stime = (clock_t)NSEC_TO_TICK(hrstime) + p->p_cstime;

	p->p_acct[LMS_USER]	+= p->p_cacct[LMS_USER];
	p->p_acct[LMS_SYSTEM]	+= p->p_cacct[LMS_SYSTEM];
	p->p_acct[LMS_TRAP]	+= p->p_cacct[LMS_TRAP];
	p->p_acct[LMS_TFAULT]	+= p->p_cacct[LMS_TFAULT];
	p->p_acct[LMS_DFAULT]	+= p->p_cacct[LMS_DFAULT];
	p->p_acct[LMS_KFAULT]	+= p->p_cacct[LMS_KFAULT];
	p->p_acct[LMS_USER_LOCK] += p->p_cacct[LMS_USER_LOCK];
	p->p_acct[LMS_SLEEP]	+= p->p_cacct[LMS_SLEEP];
	p->p_acct[LMS_WAIT_CPU]	+= p->p_cacct[LMS_WAIT_CPU];
	p->p_acct[LMS_STOPPED]	+= p->p_cacct[LMS_STOPPED];

	p->p_ru.minflt	+= p->p_cru.minflt;
	p->p_ru.majflt	+= p->p_cru.majflt;
	p->p_ru.nswap	+= p->p_cru.nswap;
	p->p_ru.inblock	+= p->p_cru.inblock;
	p->p_ru.oublock	+= p->p_cru.oublock;
	p->p_ru.msgsnd	+= p->p_cru.msgsnd;
	p->p_ru.msgrcv	+= p->p_cru.msgrcv;
	p->p_ru.nsignals += p->p_cru.nsignals;
	p->p_ru.nvcsw	+= p->p_cru.nvcsw;
	p->p_ru.nivcsw	+= p->p_cru.nivcsw;
	p->p_ru.sysc	+= p->p_cru.sysc;
	p->p_ru.ioch	+= p->p_cru.ioch;

	p->p_stat = SZOMB;
	p->p_proc_flag &= ~P_PR_PTRACE;
	p->p_wdata = what;
	p->p_wcode = (char)why;

	cdir = PTOU(p)->u_cdir;
	rdir = PTOU(p)->u_rdir;
	cwd = PTOU(p)->u_cwd;

	ASSERT(cdir != NULL || p->p_parent == &p0);

	/*
	 * Release resource controls, as they are no longer enforceable.
	 */
	rctl_set_free(p->p_rctls);

	/*
	 * Decrement tk_nlwps counter for our task.max-lwps resource control.
	 * An extended accounting record, if that facility is active, is
	 * scheduled to be written.  We cannot give up task and project
	 * membership at this point because that would allow zombies to escape
	 * from the max-processes resource controls.  Zombies stay in their
	 * current task and project until the process table slot is released
	 * in freeproc().
	 */
	tk = p->p_task;

	mutex_enter(&p->p_zone->zone_nlwps_lock);
	tk->tk_nlwps--;
	tk->tk_proj->kpj_nlwps--;
	p->p_zone->zone_nlwps--;
	mutex_exit(&p->p_zone->zone_nlwps_lock);

	/*
	 * Clear the lwp directory and the lwpid hash table
	 * now that /proc can't bother us any more.
	 * We free the memory below, after dropping p->p_lock.
	 */
	lwpdir = p->p_lwpdir;
	lwpdir_sz = p->p_lwpdir_sz;
	tidhash = p->p_tidhash;
	tidhash_sz = p->p_tidhash_sz;
	ret_tidhash = p->p_ret_tidhash;
	p->p_lwpdir = NULL;
	p->p_lwpfree = NULL;
	p->p_lwpdir_sz = 0;
	p->p_tidhash = NULL;
	p->p_tidhash_sz = 0;
	p->p_ret_tidhash = NULL;

	/*
	 * If the process has context ops installed, call the exit routine
	 * on behalf of this last remaining thread. Normally exitpctx() is
	 * called during thread_exit() or lwp_exit(), but because this is the
	 * last thread in the process, we must call it here. By the time
	 * thread_exit() is called (below), the association with the relevant
	 * process has been lost.
	 *
	 * We also free the context here.
	 */
	if (p->p_pctx) {
		kpreempt_disable();
		exitpctx(p);
		kpreempt_enable();

		freepctx(p, 0);
	}

	/*
	 * curthread's proc pointer is changed to point to the 'sched'
	 * process for the corresponding zone, except in the case when
	 * the exiting process is in fact a zsched instance, in which
	 * case the proc pointer is set to p0.  We do so, so that the
	 * process still points at the right zone when we call the VN_RELE()
	 * below.
	 *
	 * This is because curthread's original proc pointer can be freed as
	 * soon as the child sends a SIGCLD to its parent.  We use zsched so
	 * that for user processes, even in the final moments of death, the
	 * process is still associated with its zone.
	 */
	if (p != t->t_procp->p_zone->zone_zsched)
		t->t_procp = t->t_procp->p_zone->zone_zsched;
	else
		t->t_procp = &p0;

	mutex_exit(&p->p_lock);
	if (!evaporate) {
		p->p_pidflag &= ~CLDPEND;
		sigcld(p, sqp);
	} else {
		/*
		 * Do what sigcld() would do if the disposition
		 * of the SIGCHLD signal were set to be ignored.
		 */
		cv_broadcast(&p->p_srwchan_cv);
		freeproc(p);
	}
	mutex_exit(&pidlock);

	/*
	 * We don't release u_cdir and u_rdir until SZOMB is set.
	 * This protects us against dofusers().
	 */
	if (cdir)
		VN_RELE(cdir);
	if (rdir)
		VN_RELE(rdir);
	if (cwd)
		refstr_rele(cwd);

	/*
	 * task_rele() may ultimately cause the zone to go away (or
	 * may cause the last user process in a zone to go away, which
	 * signals zsched to go away).  So prior to this call, we must
	 * no longer point at zsched.
	 */
	t->t_procp = &p0;

	kmem_free(lwpdir, lwpdir_sz * sizeof (lwpdir_t));
	kmem_free(tidhash, tidhash_sz * sizeof (tidhash_t));
	while (ret_tidhash != NULL) {
		ret_tidhash_t *next = ret_tidhash->rth_next;
		kmem_free(ret_tidhash->rth_tidhash,
		    ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t));
		kmem_free(ret_tidhash, sizeof (*ret_tidhash));
		ret_tidhash = next;
	}

	thread_exit();
	/* NOTREACHED */
}
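The wstat(why, what) value stored in p_wcode/p_wdata above is what the parent eventually decodes after the sigcld() notification. A short sketch of the user-level view through waitpid(2):

#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>

void
reap(pid_t pid)
{
	int status;

	/* Blocks until the child's exit status is posted via sigcld(). */
	if (waitpid(pid, &status, 0) == pid) {
		if (WIFEXITED(status))
			(void) printf("exit status %d\n", WEXITSTATUS(status));
		else if (WIFSIGNALED(status))
			(void) printf("killed by signal %d\n", WTERMSIG(status));
	}
}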
Example 22
/*
 * Allocate a new lxproc node
 *
 * This also allocates the vnode associated with it
 */
lxpr_node_t *
lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd)
{
	lxpr_node_t *lxpnp;
	vnode_t *vp;
	user_t *up;
	timestruc_t now;

	/*
	 * Allocate a new node.  It is deallocated in vop_inactive().
	 */
	lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP);

	/*
	 * Set defaults (may be overridden below)
	 */
	gethrestime(&now);
	lxpnp->lxpr_type = type;
	lxpnp->lxpr_realvp = NULL;
	lxpnp->lxpr_parent = dp;
	VN_HOLD(dp);
	if (p != NULL) {
		lxpnp->lxpr_pid = ((p->p_pid ==
		    curproc->p_zone->zone_proc_initpid) ? 1 : p->p_pid);

		lxpnp->lxpr_time = PTOU(p)->u_start;
		lxpnp->lxpr_uid = crgetruid(p->p_cred);
		lxpnp->lxpr_gid = crgetrgid(p->p_cred);
		lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd);
	} else {
		/* Pretend files without a proc belong to sched */
		lxpnp->lxpr_pid = 0;
		lxpnp->lxpr_time = now;
		lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0;
		lxpnp->lxpr_ino = lxpr_inode(type, 0, 0);
	}

	/* initialize the vnode data */
	vp = lxpnp->lxpr_vnode;
	vn_reinit(vp);
	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
	vp->v_vfsp = dp->v_vfsp;

	/*
	 * Do node specific stuff
	 */
	switch (type) {
	case LXPR_PROCDIR:
		vp->v_flag |= VROOT;
		vp->v_type = VDIR;
		lxpnp->lxpr_mode = 0555;	/* read-search by everyone */
		break;

	case LXPR_PID_CURDIR:
		ASSERT(p != NULL);

		/*
		 * Zombie check.  p_stat is officially protected by pidlock,
		 * but we can't grab pidlock here because we already hold
		 * p_lock.  Luckily if we look at the process exit code
		 * we see that p_stat only transitions from SRUN to SZOMB
		 * while p_lock is held.  Aside from this, the only other
		 * p_stat transition that we need to be aware of is
		 * SIDL to SRUN, but that's not a problem since lxpr_lock()
		 * ignores nodes in the SIDL state so we'll never get a node
		 * that isn't already in the SRUN state.
		 */
		if (p->p_stat == SZOMB) {
			lxpnp->lxpr_realvp = NULL;
		} else {
			up = PTOU(p);
			lxpnp->lxpr_realvp = up->u_cdir;
			ASSERT(lxpnp->lxpr_realvp != NULL);
			VN_HOLD(lxpnp->lxpr_realvp);
		}
		vp->v_type = VLNK;
		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
		break;

	case LXPR_PID_ROOTDIR:
		ASSERT(p != NULL);
		/* Zombie check; see locking comment above. */
		if (p->p_stat == SZOMB) {
			lxpnp->lxpr_realvp = NULL;
		} else {
			up = PTOU(p);
			lxpnp->lxpr_realvp =
			    up->u_rdir != NULL ? up->u_rdir : rootdir;
			ASSERT(lxpnp->lxpr_realvp != NULL);
			VN_HOLD(lxpnp->lxpr_realvp);
		}
		vp->v_type = VLNK;
		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
		break;

	case LXPR_PID_EXE:
		ASSERT(p != NULL);
		lxpnp->lxpr_realvp = p->p_exec;
		if (lxpnp->lxpr_realvp != NULL) {
			VN_HOLD(lxpnp->lxpr_realvp);
		}
		vp->v_type = VLNK;
		lxpnp->lxpr_mode = 0777;
		break;

	case LXPR_SELF:
		vp->v_type = VLNK;
		lxpnp->lxpr_mode = 0777;	/* anyone does anything ! */
		break;

	case LXPR_PID_FD_FD:
		ASSERT(p != NULL);
		/* lxpr_realvp is set after we return */
		vp->v_type = VLNK;
		lxpnp->lxpr_mode = 0700;	/* read-write-exe owner only */
		break;

	case LXPR_PID_FDDIR:
		ASSERT(p != NULL);
		vp->v_type = VDIR;
		lxpnp->lxpr_mode = 0500;	/* read-search by owner only */
		break;

	case LXPR_PIDDIR:
		ASSERT(p != NULL);
		vp->v_type = VDIR;
		lxpnp->lxpr_mode = 0511;
		break;

	case LXPR_SYSDIR:
	case LXPR_SYS_FSDIR:
	case LXPR_SYS_FS_INOTIFYDIR:
	case LXPR_SYS_KERNELDIR:
	case LXPR_NETDIR:
		vp->v_type = VDIR;
		lxpnp->lxpr_mode = 0555;	/* read-search by all */
		break;

	case LXPR_PID_ENV:
	case LXPR_PID_MEM:
		ASSERT(p != NULL);
		/*FALLTHRU*/
	case LXPR_KCORE:
		vp->v_type = VREG;
		lxpnp->lxpr_mode = 0400;	/* read-only by owner only */
		break;

	default:
		vp->v_type = VREG;
		lxpnp->lxpr_mode = 0444;	/* read-only by all */
		break;
	}

	return (lxpnp);
}
Example 23
void
exacct_calculate_proc_usage(proc_t *p, proc_usage_t *pu, ulong_t *mask,
    int flag, int wstat)
{
	timestruc_t ts, ts_run;

	ASSERT(MUTEX_HELD(&p->p_lock));

	/*
	 * Convert CPU and execution times to sec/nsec format.
	 */
	if (BT_TEST(mask, AC_PROC_CPU)) {
		hrt2ts(mstate_aggr_state(p, LMS_USER), &ts);
		pu->pu_utimesec = (uint64_t)(ulong_t)ts.tv_sec;
		pu->pu_utimensec = (uint64_t)(ulong_t)ts.tv_nsec;
		hrt2ts(mstate_aggr_state(p, LMS_SYSTEM), &ts);
		pu->pu_stimesec = (uint64_t)(ulong_t)ts.tv_sec;
		pu->pu_stimensec = (uint64_t)(ulong_t)ts.tv_nsec;
	}
	if (BT_TEST(mask, AC_PROC_TIME)) {
		gethrestime(&ts);
		pu->pu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
		pu->pu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
		hrt2ts(gethrtime() - p->p_mstart, &ts_run);
		ts.tv_sec -= ts_run.tv_sec;
		ts.tv_nsec -= ts_run.tv_nsec;
		if (ts.tv_nsec < 0) {
			ts.tv_sec--;
			if ((ts.tv_nsec = ts.tv_nsec + NANOSEC) >= NANOSEC) {
				ts.tv_sec++;
				ts.tv_nsec -= NANOSEC;
			}
		}
		pu->pu_startsec = (uint64_t)(ulong_t)ts.tv_sec;
		pu->pu_startnsec = (uint64_t)(ulong_t)ts.tv_nsec;
	}

	pu->pu_pid = p->p_pidp->pid_id;
	pu->pu_acflag = p->p_user.u_acflag;
	pu->pu_projid = p->p_task->tk_proj->kpj_id;
	pu->pu_taskid = p->p_task->tk_tkid;
	pu->pu_major = getmajor(p->p_sessp->s_dev);
	pu->pu_minor = getminor(p->p_sessp->s_dev);
	pu->pu_ancpid = p->p_ancpid;
	pu->pu_wstat = wstat;
	/*
	 * Compute average RSS in K.  The denominator is the number of
	 * samples:  the number of clock ticks plus the initial value.
	 */
	pu->pu_mem_rss_avg = (PTOU(p)->u_mem / (p->p_stime + p->p_utime + 1)) *
	    (PAGESIZE / 1024);
	pu->pu_mem_rss_max = PTOU(p)->u_mem_max * (PAGESIZE / 1024);

	mutex_enter(&p->p_crlock);
	pu->pu_ruid = crgetruid(p->p_cred);
	pu->pu_rgid = crgetrgid(p->p_cred);
	mutex_exit(&p->p_crlock);

	bcopy(p->p_user.u_comm, pu->pu_command, strlen(p->p_user.u_comm) + 1);
	bcopy(p->p_zone->zone_name, pu->pu_zonename,
	    strlen(p->p_zone->zone_name) + 1);
	bcopy(p->p_zone->zone_nodename, pu->pu_nodename,
	    strlen(p->p_zone->zone_nodename) + 1);

	/*
	 * Calculate microstate accounting data for a process that is still
	 * running.  Presently, we explicitly collect all of the LWP usage into
	 * the proc usage structure here.
	 */
	if (flag & EW_PARTIAL)
		exacct_calculate_proc_mstate(p, pu);
	if (flag & EW_FINAL)
		exacct_copy_proc_mstate(p, pu);
}
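The AC_PROC_TIME block above derives the start time by subtracting the elapsed run time from the current time, normalizing a negative tv_nsec by borrowing one second. A standalone sketch of the same normalization (a hypothetical helper, not part of the kernel sources):

#include <time.h>

/* Subtract b from a, keeping 0 <= tv_nsec < 1e9; assumes a >= b. */
static struct timespec
ts_sub(struct timespec a, struct timespec b)
{
	struct timespec r;

	r.tv_sec = a.tv_sec - b.tv_sec;
	r.tv_nsec = a.tv_nsec - b.tv_nsec;
	if (r.tv_nsec < 0) {		/* borrow one second */
		r.tv_sec--;
		r.tv_nsec += 1000000000L;
	}
	return (r);
}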
Example 24
/*
 * Post-syscall processing.  Perform abnormal system call completion
 * actions such as /proc tracing, profiling, signals, preemption, etc.
 *
 * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
 * Any condition requiring post-syscall handling must set one of these.
 * If the condition is persistent, this routine will repost t_post_sys.
 */
void
post_syscall(long rval1, long rval2)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	struct regs *rp = lwptoregs(lwp);
	uint_t	error;
	uint_t	code = t->t_sysnum;
	int	repost = 0;
	int	proc_stop = 0;		/* non-zero if stopping */
	int	sigprof = 0;		/* non-zero if sending SIGPROF */

	t->t_post_sys = 0;

	error = lwp->lwp_errno;

	/*
	 * Code can be zero if this is a new LWP returning after a forkall(),
 * other than the lwp that matches the one in the parent that called
	 * forkall().  In these LWPs, skip most of post-syscall activity.
	 */
	if (code == 0)
		goto sig_check;
	/*
	 * If the trace flag is set, mark the lwp to take a single-step trap
	 * on return to user level (below).  The x86 lcall interface and
	 * sysenter have already done this and turned off the flag, but the
	 * amd64 syscall interface has not.
	 */
	if (rp->r_ps & PS_T) {
		lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING;
		rp->r_ps &= ~PS_T;
		aston(curthread);
	}
#ifdef C2_AUDIT
	if (audit_active) {	/* put out audit record for this syscall */
		rval_t	rval;

		/* XX64 -- truncation of 64-bit return values? */
		rval.r_val1 = (int)rval1;
		rval.r_val2 = (int)rval2;
		audit_finish(T_SYSCALL, code, error, &rval);
		repost = 1;
	}
#endif /* C2_AUDIT */

	if (curthread->t_pdmsg != NULL) {
		char *m = curthread->t_pdmsg;

		uprintf("%s", m);
		kmem_free(m, strlen(m) + 1);
		curthread->t_pdmsg = NULL;
	}

	/*
	 * If we're going to stop for /proc tracing, set the flag and
	 * save the arguments so that the return values don't smash them.
	 */
	if (PTOU(p)->u_systrap) {
		if (prismember(&PTOU(p)->u_exitmask, code)) {
			if (lwp_getdatamodel(lwp) == DATAMODEL_LP64)
				(void) save_syscall_args();
			proc_stop = 1;
		}
		repost = 1;
	}

	/*
	 * Similarly check to see if SIGPROF might be sent.
	 */
	if (curthread->t_rprof != NULL &&
	    curthread->t_rprof->rp_anystate != 0) {
		if (lwp_getdatamodel(lwp) == DATAMODEL_LP64)
			(void) save_syscall_args();
		sigprof = 1;
	}

	if (lwp->lwp_eosys == NORMALRETURN) {
		if (error == 0) {
#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf(
				    "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
				    p->p_pid, rval1, rval2, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			rp->r_ps &= ~PS_C;
			rp->r_r0 = rval1;
			rp->r_r1 = rval2;
		} else {
			int sig;
#ifdef SYSCALLTRACE
			if (syscalltrace) {
				mutex_enter(&systrace_lock);
				printf("%d: error=%d, id 0x%p\n",
				    p->p_pid, error, curthread);
				mutex_exit(&systrace_lock);
			}
#endif /* SYSCALLTRACE */
			if (error == EINTR && t->t_activefd.a_stale)
				error = EBADF;
			if (error == EINTR &&
			    (sig = lwp->lwp_cursig) != 0 &&
			    sigismember(&PTOU(p)->u_sigrestart, sig) &&
			    PTOU(p)->u_signal[sig - 1] != SIG_DFL &&
			    PTOU(p)->u_signal[sig - 1] != SIG_IGN)
				error = ERESTART;
			rp->r_r0 = error;
			rp->r_ps |= PS_C;
		}
	}

	/*
	 * From the proc(4) manual page:
	 * When exit from a system call is being traced, the traced process
	 * stops on completion of the system call just prior to checking for
	 * signals and returning to user level.  At this point all return
	 * values have been stored into the traced process's saved registers.
	 */
	if (proc_stop) {
		mutex_enter(&p->p_lock);
		if (PTOU(p)->u_systrap &&
		    prismember(&PTOU(p)->u_exitmask, code))
			stop(PR_SYSEXIT, code);
		mutex_exit(&p->p_lock);
	}

	/*
	 * If we are the parent returning from a successful
	 * vfork, wait for the child to exec or exit.
	 * This code must be here and not in the bowels of the system
	 * so that /proc can intercept exit from vfork in a timely way.
	 */
	if (code == SYS_vfork && rp->r_r1 == 0 && error == 0)
		vfwait((pid_t)rval1);

	/*
	 * If profiling is active, bill the current PC in user-land
	 * and keep reposting until profiling is disabled.
	 */
	if (p->p_prof.pr_scale) {
		if (lwp->lwp_oweupc)
			profil_tick(rp->r_pc);
		repost = 1;
	}

sig_check:
	/*
	 * Reset flag for next time.
	 * We must do this after stopping on PR_SYSEXIT
	 * because /proc uses the information in lwp_eosys.
	 */
	lwp->lwp_eosys = NORMALRETURN;
	clear_stale_fd();
	t->t_flag &= ~T_FORKALL;

	if (t->t_astflag | t->t_sig_check) {
		/*
		 * Turn off the AST flag before checking all the conditions that
		 * may have caused an AST.  This flag is on whenever a signal or
		 * unusual condition should be handled after the next trap or
		 * syscall.
		 */
		astoff(t);
		/*
		 * If a single-step trap occurred on a syscall (see trap())
		 * recognize it now.  Do this before checking for signals
		 * because deferred_singlestep_trap() may generate a SIGTRAP to
		 * the LWP or may otherwise mark the LWP to call issig(FORREAL).
		 */
		if (lwp->lwp_pcb.pcb_flags & DEBUG_PENDING)
			deferred_singlestep_trap((caddr_t)rp->r_pc);

		t->t_sig_check = 0;

		/*
		 * The following check is legal for the following reasons:
		 *	1) The thread we are checking is ourselves, so there
		 *	   is no way the proc can go away.
		 *	2) The only time we need to be protected by the
		 *	   lock is if the binding is changed.
		 *
		 *	Note we will still take the lock and check the binding
		 *	if the condition was true without the lock held.  This
		 *	prevents lock contention among threads owned by the
		 * 	same proc.
		 */

		if (curthread->t_proc_flag & TP_CHANGEBIND) {
			mutex_enter(&p->p_lock);
			if (curthread->t_proc_flag & TP_CHANGEBIND) {
				timer_lwpbind();
				curthread->t_proc_flag &= ~TP_CHANGEBIND;
			}
			mutex_exit(&p->p_lock);
		}

		/*
		 * for kaio requests on the special kaio poll queue,
		 * copyout their results to user memory.
		 */
		if (p->p_aio)
			aio_cleanup(0);
		/*
		 * If this LWP was asked to hold, call holdlwp(), which will
		 * stop.  holdlwps() sets this up and calls pokelwps() which
		 * sets the AST flag.
		 *
		 * Also check TP_EXITLWP, since this is used by fresh new LWPs
		 * through lwp_rtt().  That flag is set if the lwp_create(2)
		 * syscall failed after creating the LWP.
		 */
		if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP))
			holdlwp();

		/*
		 * All code that sets signals and makes ISSIG_PENDING
		 * evaluate true must set t_sig_check afterwards.
		 */
		if (ISSIG_PENDING(t, lwp, p)) {
			if (issig(FORREAL))
				psig();
			t->t_sig_check = 1;	/* recheck next time */
		}

		if (sigprof) {
			realsigprof(code, error);
			t->t_sig_check = 1;	/* recheck next time */
		}

		/*
		 * If a performance counter overflow interrupt was
		 * delivered *during* the syscall, then re-enable the
		 * AST so that we take a trip through trap() to cause
		 * the SIGEMT to be delivered.
		 */
		if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW)
			aston(t);

		/*
		 * /proc can't enable/disable the trace bit itself
		 * because that could race with the call gate used by
		 * system calls via "lcall". If that happened, an
		 * invalid EFLAGS would result. prstep()/prnostep()
		 * therefore schedule an AST for the purpose.
		 */
		if (lwp->lwp_pcb.pcb_flags & REQUEST_STEP) {
			lwp->lwp_pcb.pcb_flags &= ~REQUEST_STEP;
			rp->r_ps |= PS_T;
		}
		if (lwp->lwp_pcb.pcb_flags & REQUEST_NOSTEP) {
			lwp->lwp_pcb.pcb_flags &= ~REQUEST_NOSTEP;
			rp->r_ps &= ~PS_T;
		}
	}

	lwp->lwp_errno = 0;		/* clear error for next time */

#ifndef NPROBE
	/* Kernel probe */
	if (tnf_tracing_active) {
		TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */,
			tnf_long,	rval1,		rval1,
			tnf_long,	rval2,		rval2,
			tnf_long,	errno,		(long)error);
		repost = 1;
	}
#endif /* NPROBE */

	/*
	 * Set state to LWP_USER here so preempt won't give us a kernel
	 * priority if it occurs after this point.  Call CL_TRAPRET() to
	 * restore the user-level priority.
	 *
	 * It is important that no locks (other than spinlocks) be entered
	 * after this point before returning to user mode (unless lwp_state
	 * is set back to LWP_SYS).
	 *
	 * XXX Sampled times past this point are charged to the user.
	 */
	lwp->lwp_state = LWP_USER;

	if (t->t_trapret) {
		t->t_trapret = 0;
		thread_lock(t);
		CL_TRAPRET(t);
		thread_unlock(t);
	}
	if (CPU->cpu_runrun)
		preempt();

	lwp->lwp_errno = 0;		/* clear error for next time */

	/*
	 * The thread lock must be held in order to clear sysnum and reset
	 * lwp_ap atomically with respect to other threads in the system that
	 * may be looking at the args via lwp_ap from get_syscall_args().
	 */

	thread_lock(t);
	t->t_sysnum = 0;		/* no longer in a system call */

	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
#if defined(_LP64)
		/*
		 * In case the args were copied to the lwp, reset the
		 * pointer so the next syscall will have the right
		 * lwp_ap pointer.
		 */
		lwp->lwp_ap = (long *)&rp->r_rdi;
	} else {
#endif
		lwp->lwp_ap = NULL;	/* reset on every syscall entry */
	}
	thread_unlock(t);

	lwp->lwp_argsaved = 0;

	/*
	 * If there was a continuing reason for post-syscall processing,
	 * set the t_post_sys flag for the next system call.
	 */
	if (repost)
		t->t_post_sys = 1;

	/*
	 * If there is a ustack registered for this lwp, and the stack rlimit
	 * has been altered, read in the ustack. If the saved stack rlimit
	 * matches the bounds of the ustack, update the ustack to reflect
	 * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable
	 * stack checking by setting the size to 0.
	 */
	if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) {
		rlim64_t new_size;
		caddr_t top;
		stack_t stk;
		struct rlimit64 rl;

		mutex_enter(&p->p_lock);
		new_size = p->p_stk_ctl;
		top = p->p_usrstack;
		(void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl);
		mutex_exit(&p->p_lock);

		if (rl.rlim_cur == RLIM64_INFINITY)
			new_size = 0;

		if (copyin((stack_t *)lwp->lwp_ustack, &stk,
		    sizeof (stack_t)) == 0 &&
		    (stk.ss_size == lwp->lwp_old_stk_ctl ||
			stk.ss_size == 0) &&
		    stk.ss_sp == top - stk.ss_size) {
			stk.ss_sp = (void *)((uintptr_t)stk.ss_sp +
			    stk.ss_size - (uintptr_t)new_size);
			stk.ss_size = new_size;

			(void) copyout(&stk, (stack_t *)lwp->lwp_ustack,
			    sizeof (stack_t));
		}

		lwp->lwp_old_stk_ctl = 0;
	}
}
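The EINTR-to-ERESTART conversion above is taken only when the interrupting signal's disposition has SA_RESTART set (the u_sigrestart test) and a real handler is installed. A small user-level sketch showing the observable difference for a blocking read(2):

#include <signal.h>
#include <unistd.h>

static void
alarm_handler(int sig)
{
	(void) sig;
}

ssize_t
read_with_alarm(int fd, void *buf, size_t len, unsigned int secs, int restart)
{
	struct sigaction sa;

	sa.sa_handler = alarm_handler;
	/* With SA_RESTART the kernel restarts the read instead of EINTR. */
	sa.sa_flags = restart ? SA_RESTART : 0;
	(void) sigemptyset(&sa.sa_mask);
	(void) sigaction(SIGALRM, &sa, NULL);

	(void) alarm(secs);
	return (read(fd, buf, len));	/* -1/EINTR when not restarted */
}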
Example 25
/*
 * This routine assumes that the stack grows downward.
 * Returns 0 on success, errno on failure.
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
	struct proc *p = curproc;
	size_t newsize;
	size_t oldsize;
	int    error;
	size_t pgsz;
	uint_t szc;
	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

	ASSERT(sp < p->p_usrstack);
	sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

	/*
	 * grow to growszc alignment but use current p->p_stkpageszc for
	 * the segvn_crargs szc passed to segvn_create. For memcntl to
	 * increase the szc, this allows the new extension segment to be
	 * concatenated successfully with the existing stack segment.
	 */
	if ((szc = growszc) != 0) {
		pgsz = page_get_pagesize(szc);
		ASSERT(pgsz > PAGESIZE);
		newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
		if (newsize > (size_t)p->p_stk_ctl) {
			szc = 0;
			pgsz = PAGESIZE;
			newsize = p->p_usrstack - sp;
		}
	} else {
		pgsz = PAGESIZE;
		newsize = p->p_usrstack - sp;
	}

	if (newsize > (size_t)p->p_stk_ctl) {
		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	oldsize = p->p_stksize;
	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

	if (newsize <= oldsize) {	/* prevent the stack from shrinking */
		return (0);
	}

	if (!(p->p_stkprot & PROT_EXEC)) {
		crargs.prot &= ~PROT_EXEC;
	}
	/*
	 * extend stack with the proposed new growszc, which is different
	 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
	 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
	 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
	 * if not aligned to szc's pgsz.
	 */
	if (szc > 0) {
		caddr_t oldsp = p->p_usrstack - oldsize;
		caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
		    pgsz);

		if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
			crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
			    AS_MAP_NO_LPOOB;
		} else if (oldsp == austk) {
			crargs.szc = szc;
		} else {
			crargs.szc = AS_MAP_STACK;
		}
	} else {
		crargs.szc = AS_MAP_NO_LPOOB;
	}
	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

	if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
	    segvn_create, &crargs)) != 0) {
		if (error == EAGAIN) {
			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
			    "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
		}
		return (error);
	}
	p->p_stksize = newsize;
	return (0);
}
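The p_stk_ctl ceiling enforced above is the process's RLIMIT_STACK resource control. A user-level sketch that inspects the limit grow_internal() checks against:

#include <sys/resource.h>
#include <stdio.h>

void
show_stack_limit(void)
{
	struct rlimit rl;

	/* grow_internal() refuses to extend the stack past this limit. */
	if (getrlimit(RLIMIT_STACK, &rl) == 0) {
		(void) printf("stack soft limit: %lld bytes\n",
		    (long long)rl.rlim_cur);
	}
}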