int umask(int mask) { mode_t t; t = PTOU(curproc)->u_cmask; PTOU(curproc)->u_cmask = (mode_t)(mask & PERMMASK); return ((int)t); }
/* * Kernel setup code, called from startup(). */ void kern_setup1(void) { proc_t *pp; pp = &p0; proc_sched = pp; /* * Initialize process 0 data structures */ pp->p_stat = SRUN; pp->p_flag = SSYS; pp->p_pidp = &pid0; pp->p_pgidp = &pid0; pp->p_sessp = &session0; pp->p_tlist = &t0; pid0.pid_pglink = pp; pid0.pid_pgtail = pp; /* * XXX - we assume that the u-area is zeroed out except for * ttolwp(curthread)->lwp_regs. */ PTOU(curproc)->u_cmask = (mode_t)CMASK; thread_init(); /* init thread_free list */ pid_init(); /* initialize pid (proc) table */ contract_init(); /* initialize contracts */ init_pages_pp_maximum(); }
/* * Lookup the user file name from a given vp, using a specific credential. */ int lookuppnatcred( struct pathname *pnp, /* pathname to lookup */ struct pathname *rpnp, /* if non-NULL, return resolved path */ int followlink, /* (don't) follow sym links */ vnode_t **dirvpp, /* ptr for parent vnode */ vnode_t **compvpp, /* ptr for entry vnode */ vnode_t *startvp, /* start search from this vp */ cred_t *cr) /* user credential */ { vnode_t *vp; /* current directory vp */ vnode_t *rootvp; proc_t *p = curproc; if (pnp->pn_pathlen == 0) return (ENOENT); mutex_enter(&p->p_lock); /* for u_rdir and u_cdir */ if ((rootvp = PTOU(p)->u_rdir) == NULL) rootvp = rootdir; else if (rootvp != rootdir) /* no need to VN_HOLD rootdir */ VN_HOLD(rootvp); if (pnp->pn_path[0] == '/') { vp = rootvp; } else { vp = (startvp == NULL) ? PTOU(p)->u_cdir : startvp; } VN_HOLD(vp); mutex_exit(&p->p_lock); /* * Skip over leading slashes */ if (pnp->pn_path[0] == '/') { do { pnp->pn_path++; pnp->pn_pathlen--; } while (pnp->pn_path[0] == '/'); } return (lookuppnvp(pnp, rpnp, followlink, dirvpp, compvpp, rootvp, vp, cr)); }
/* * Make a directory. */ int mkdir(char *dname, int dmode) { vnode_t *vp; struct vattr vattr; int error; vattr.va_type = VDIR; vattr.va_mode = dmode & PERMMASK; vattr.va_mask = AT_TYPE|AT_MODE; error = vn_create(dname, UIO_USERSPACE, &vattr, EXCL, 0, &vp, CRMKDIR, 0, PTOU(curproc)->u_cmask); if (error) return (set_errno(error)); VN_RELE(vp); return (0); }
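/*
 * Illustrative userland sketch (not part of the kernel source above): the
 * mkdir() handler passes PTOU(curproc)->u_cmask to vn_create(), so the bits
 * set in the process umask are cleared from the requested mode. Assuming a
 * umask of 022, a request for 0777 should yield a directory created 0755.
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>

int
main(void)
{
        mode_t old = umask(022);        /* umask() returns the previous mask */
        struct stat sb;

        if (mkdir("demo_dir", 0777) != 0) {
                perror("mkdir");
                return (1);
        }
        if (stat("demo_dir", &sb) == 0)
                printf("requested 0777, got 0%o\n", sb.st_mode & 0777);
        (void) umask(old);              /* restore the saved mask */
        return (0);
}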
/* * Native processes are started with the native ld.so.1 as the command. This * brand op is invoked by s10_npreload to fix up the command and arguments * so that apps like pgrep or ps see the expected command strings. */ int s10_native(void *cmd, void *args) { struct user *up = PTOU(curproc); char cmd_buf[MAXCOMLEN + 1]; char arg_buf[PSARGSZ]; if (copyin(cmd, &cmd_buf, sizeof (cmd_buf)) != 0) return (EFAULT); if (copyin(args, &arg_buf, sizeof (arg_buf)) != 0) return (EFAULT); /* * Make sure that the process' interpreter is the native dynamic linker. * Convention dictates that native processes executing within solaris10- * branded zones are interpreted by the native dynamic linker (the * process and its arguments are specified as arguments to the dynamic * linker). If this convention is violated (i.e., * brandsys(B_S10_NATIVE, ...) is invoked by a process that shouldn't be * native), then do nothing and silently indicate success. */ if (strcmp(up->u_comm, S10_LINKER_NAME) != 0) return (0); /* * The sizeof has an extra value for the trailing '\0' so this covers * the appended " " in the following strcmps. */ if (strncmp(up->u_psargs, BRAND_NATIVE_LINKER64 " ", sizeof (BRAND_NATIVE_LINKER64)) != 0 && strncmp(up->u_psargs, BRAND_NATIVE_LINKER32 " ", sizeof (BRAND_NATIVE_LINKER32)) != 0) return (0); mutex_enter(&curproc->p_lock); (void) strlcpy(up->u_comm, cmd_buf, sizeof (up->u_comm)); (void) strlcpy(up->u_psargs, arg_buf, sizeof (up->u_psargs)); mutex_exit(&curproc->p_lock); return (0); }
/* * Perform pre-system-call processing, including stopping for tracing, * auditing, microstate-accounting, etc. * * This routine is called only if the t_pre_sys flag is set. Any condition * requiring pre-syscall handling must set the t_pre_sys flag. If the * condition is persistent, this routine will repost t_pre_sys. */ int pre_syscall(int arg0) { unsigned int code; kthread_t *t = curthread; proc_t *p = ttoproc(t); klwp_t *lwp = ttolwp(t); struct regs *rp = lwptoregs(lwp); int repost; t->t_pre_sys = repost = 0; /* clear pre-syscall processing flag */ ASSERT(t->t_schedflag & TS_DONT_SWAP); syscall_mstate(LMS_USER, LMS_SYSTEM); /* * The syscall arguments in the out registers should be pointed to * by lwp_ap. If the args need to be copied so that the outs can * be changed without losing the ability to get the args for /proc, * they can be saved by save_syscall_args(), and lwp_ap will be * restored by post_syscall(). */ ASSERT(lwp->lwp_ap == (long *)&rp->r_o0); /* * Make sure the thread is holding the latest credentials for the * process. The credentials in the process right now apply to this * thread for the entire system call. */ if (t->t_cred != p->p_cred) { cred_t *oldcred = t->t_cred; /* * DTrace accesses t_cred in probe context. t_cred must * always be either NULL, or point to a valid, allocated cred * structure. */ t->t_cred = crgetcred(); crfree(oldcred); } /* * Undo special arrangements to single-step the lwp * so that a debugger will see valid register contents. * Also so that the pc is valid for syncfpu(). * Also so that a syscall like exec() can be stepped. */ if (lwp->lwp_pcb.pcb_step != STEP_NONE) { (void) prundostep(); repost = 1; } /* * Check for indirect system call in case we stop for tracing. * Don't allow multiple indirection. */ code = t->t_sysnum; if (code == 0 && arg0 != 0) { /* indirect syscall */ code = arg0; t->t_sysnum = arg0; } /* * From the proc(4) manual page: * When entry to a system call is being traced, the traced process * stops after having begun the call to the system but before the * system call arguments have been fetched from the process. * If proc changes the args we must refetch them after starting. */ if (PTOU(p)->u_systrap) { if (prismember(&PTOU(p)->u_entrymask, code)) { /* * Recheck stop condition, now that lock is held. */ mutex_enter(&p->p_lock); if (PTOU(p)->u_systrap && prismember(&PTOU(p)->u_entrymask, code)) { stop(PR_SYSENTRY, code); /* * Must refetch args since they were * possibly modified by /proc. Indicate * that the valid copy is in the * registers. */ lwp->lwp_argsaved = 0; lwp->lwp_ap = (long *)&rp->r_o0; } mutex_exit(&p->p_lock); } repost = 1; } if (lwp->lwp_sysabort) { /* * lwp_sysabort may have been set via /proc while the process * was stopped on PR_SYSENTRY. If so, abort the system call. * Override any error from the copyin() of the arguments. 
*/ lwp->lwp_sysabort = 0; (void) set_errno(EINTR); /* sets post-sys processing */ t->t_pre_sys = 1; /* repost anyway */ return (1); /* don't do system call, return EINTR */ } #ifdef C2_AUDIT if (audit_active) { /* begin auditing for this syscall */ int error; if (error = audit_start(T_SYSCALL, code, 0, lwp)) { t->t_pre_sys = 1; /* repost anyway */ lwp->lwp_error = 0; /* for old drivers */ return (error); } repost = 1; } #endif /* C2_AUDIT */ #ifndef NPROBE /* Kernel probe */ if (tnf_tracing_active) { TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */, tnf_sysnum, sysnum, t->t_sysnum); t->t_post_sys = 1; /* make sure post_syscall runs */ repost = 1; } #endif /* NPROBE */ #ifdef SYSCALLTRACE if (syscalltrace) { int i; long *ap; char *cp; char *sysname; struct sysent *callp; if (code >= NSYSCALL) callp = &nosys_ent; /* nosys has no args */ else callp = LWP_GETSYSENT(lwp) + code; (void) save_syscall_args(); mutex_enter(&systrace_lock); printf("%d: ", p->p_pid); if (code >= NSYSCALL) printf("0x%x", code); else { sysname = mod_getsysname(code); printf("%s[0x%x]", sysname == NULL ? "NULL" : sysname, code); } cp = "("; for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) { printf("%s%lx", cp, *ap); cp = ", "; } if (i) printf(")"); printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread); mutex_exit(&systrace_lock); } #endif /* SYSCALLTRACE */ /* * If there was a continuing reason for pre-syscall processing, * set the t_pre_sys flag for the next system call. */ if (repost) t->t_pre_sys = 1; lwp->lwp_error = 0; /* for old drivers */ lwp->lwp_badpriv = PRIV_NONE; /* for privilege tracing */ return (0); }
static int chdirec(vnode_t *vp, int ischroot, int do_traverse) { int error; vnode_t *oldvp; proc_t *pp = curproc; vnode_t **vpp; refstr_t *cwd; int newcwd = 1; if (vp->v_type != VDIR) { error = ENOTDIR; goto bad; } if (error = VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) goto bad; /* * The VOP_ACCESS() may have covered 'vp' with a new filesystem, * if 'vp' is an autoFS vnode. Traverse the mountpoint so * that we don't end up with a covered current directory. */ if (vn_mountedvfs(vp) != NULL && do_traverse) { if (error = traverse(&vp)) goto bad; } /* * Special chroot semantics: chroot is allowed if privileged * or if the target is really a loopback mount of the root (or * root of the zone) as determined by comparing dev and inode * numbers */ if (ischroot) { struct vattr tattr; struct vattr rattr; vnode_t *zonevp = curproc->p_zone->zone_rootvp; tattr.va_mask = AT_FSID|AT_NODEID; if (error = VOP_GETATTR(vp, &tattr, 0, CRED(), NULL)) goto bad; rattr.va_mask = AT_FSID|AT_NODEID; if (error = VOP_GETATTR(zonevp, &rattr, 0, CRED(), NULL)) goto bad; if ((tattr.va_fsid != rattr.va_fsid || tattr.va_nodeid != rattr.va_nodeid) && (error = secpolicy_chroot(CRED())) != 0) goto bad; vpp = &PTOU(pp)->u_rdir; } else { vpp = &PTOU(pp)->u_cdir; } if (audit_active) /* update abs cwd/root path see c2audit.c */ audit_chdirec(vp, vpp); mutex_enter(&pp->p_lock); /* * This bit of logic prevents us from overwriting u_cwd if we are * changing to the same directory. We set the cwd to NULL so that we * don't try to do the lookup on the next call to getcwd(). */ if (!ischroot && *vpp != NULL && vp != NULL && VN_CMP(*vpp, vp)) newcwd = 0; oldvp = *vpp; *vpp = vp; if ((cwd = PTOU(pp)->u_cwd) != NULL && newcwd) PTOU(pp)->u_cwd = NULL; mutex_exit(&pp->p_lock); if (cwd && newcwd) refstr_rele(cwd); if (oldvp) VN_RELE(oldvp); return (0); bad: VN_RELE(vp); return (error); }
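/*
 * Illustrative userland sketch (assumes sufficient privilege; enter_root is a
 * hypothetical helper name): chdirec() above backs both chdir(2) and
 * chroot(2), updating u_cdir or u_rdir. A typical privileged caller changes
 * the root and then the working directory so the two stay consistent.
 */
#include <unistd.h>
#include <stdio.h>

int
enter_root(const char *newroot)
{
        if (chroot(newroot) != 0) {     /* updates the per-process root (u_rdir) */
                perror("chroot");
                return (-1);
        }
        if (chdir("/") != 0) {          /* updates the working directory (u_cdir) */
                perror("chdir");
                return (-1);
        }
        return (0);
}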
/* * Perform pre-system-call processing, including stopping for tracing, * auditing, etc. * * This routine is called only if the t_pre_sys flag is set. Any condition * requiring pre-syscall handling must set the t_pre_sys flag. If the * condition is persistent, this routine will repost t_pre_sys. */ int pre_syscall() { kthread_t *t = curthread; unsigned code = t->t_sysnum; klwp_t *lwp = ttolwp(t); proc_t *p = ttoproc(t); int repost; t->t_pre_sys = repost = 0; /* clear pre-syscall processing flag */ ASSERT(t->t_schedflag & TS_DONT_SWAP); #if defined(DEBUG) /* * On the i386 kernel, lwp_ap points at the piece of the thread * stack that we copy the users arguments into. * * On the amd64 kernel, the syscall arguments in the rdi..r9 * registers should be pointed at by lwp_ap. If the args need to * be copied so that those registers can be changed without losing * the ability to get the args for /proc, they can be saved by * save_syscall_args(), and lwp_ap will be restored by post_syscall(). */ if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) { #if defined(_LP64) ASSERT(lwp->lwp_ap == (long *)&lwptoregs(lwp)->r_rdi); } else { #endif ASSERT((caddr_t)lwp->lwp_ap > t->t_stkbase && (caddr_t)lwp->lwp_ap < t->t_stk); } #endif /* DEBUG */ /* * Make sure the thread is holding the latest credentials for the * process. The credentials in the process right now apply to this * thread for the entire system call. */ if (t->t_cred != p->p_cred) { cred_t *oldcred = t->t_cred; /* * DTrace accesses t_cred in probe context. t_cred must * always be either NULL, or point to a valid, allocated cred * structure. */ t->t_cred = crgetcred(); crfree(oldcred); } /* * From the proc(4) manual page: * When entry to a system call is being traced, the traced process * stops after having begun the call to the system but before the * system call arguments have been fetched from the process. */ if (PTOU(p)->u_systrap) { if (prismember(&PTOU(p)->u_entrymask, code)) { mutex_enter(&p->p_lock); /* * Recheck stop condition, now that lock is held. */ if (PTOU(p)->u_systrap && prismember(&PTOU(p)->u_entrymask, code)) { stop(PR_SYSENTRY, code); /* * /proc may have modified syscall args, * either in regs for amd64 or on ustack * for ia32. Either way, arrange to * copy them again, both for the syscall * handler and for other consumers in * post_syscall (like audit). Here, we * only do amd64, and just set lwp_ap * back to the kernel-entry stack copy; * the syscall ml code redoes * move-from-regs to set up for the * syscall handler after we return. For * ia32, save_syscall_args() below makes * an lwp_ap-accessible copy. */ #if defined(_LP64) if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) { lwp->lwp_argsaved = 0; lwp->lwp_ap = (long *)&lwptoregs(lwp)->r_rdi; } #endif } mutex_exit(&p->p_lock); } repost = 1; } /* * ia32 kernel, or ia32 proc on amd64 kernel: keep args in * lwp_arg for post-syscall processing, regardless of whether * they might have been changed in /proc above. */ #if defined(_LP64) if (lwp_getdatamodel(lwp) != DATAMODEL_NATIVE) #endif (void) save_syscall_args(); if (lwp->lwp_sysabort) { /* * lwp_sysabort may have been set via /proc while the process * was stopped on PR_SYSENTRY. If so, abort the system call. * Override any error from the copyin() of the arguments. 
*/ lwp->lwp_sysabort = 0; (void) set_errno(EINTR); /* forces post_sys */ t->t_pre_sys = 1; /* repost anyway */ return (1); /* don't do system call, return EINTR */ } #ifdef C2_AUDIT if (audit_active) { /* begin auditing for this syscall */ int error; if (error = audit_start(T_SYSCALL, code, 0, lwp)) { t->t_pre_sys = 1; /* repost anyway */ (void) set_errno(error); return (1); } repost = 1; } #endif /* C2_AUDIT */ #ifndef NPROBE /* Kernel probe */ if (tnf_tracing_active) { TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */, tnf_sysnum, sysnum, t->t_sysnum); t->t_post_sys = 1; /* make sure post_syscall runs */ repost = 1; } #endif /* NPROBE */ #ifdef SYSCALLTRACE if (syscalltrace) { int i; long *ap; char *cp; char *sysname; struct sysent *callp; if (code >= NSYSCALL) callp = &nosys_ent; /* nosys has no args */ else callp = LWP_GETSYSENT(lwp) + code; (void) save_syscall_args(); mutex_enter(&systrace_lock); printf("%d: ", p->p_pid); if (code >= NSYSCALL) printf("0x%x", code); else { sysname = mod_getsysname(code); printf("%s[0x%x/0x%p]", sysname == NULL ? "NULL" : sysname, code, callp->sy_callc); } cp = "("; for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) { printf("%s%lx", cp, *ap); cp = ", "; } if (i) printf(")"); printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread); mutex_exit(&systrace_lock); } #endif /* SYSCALLTRACE */ /* * If there was a continuing reason for pre-syscall processing, * set the t_pre_sys flag for the next system call. */ if (repost) t->t_pre_sys = 1; lwp->lwp_error = 0; /* for old drivers */ lwp->lwp_badpriv = PRIV_NONE; return (0); }
static int copen(int startfd, char *fname, int filemode, int createmode) { struct pathname pn; vnode_t *vp, *sdvp; file_t *fp, *startfp; enum vtype type; int error; int fd, dupfd; vnode_t *startvp; proc_t *p = curproc; if (startfd == AT_FDCWD) { /* * Regular open() */ startvp = NULL; } else { /* * We're here via openat() */ char startchar; if (copyin(fname, &startchar, sizeof (char))) return (set_errno(EFAULT)); /* * if startchar is / then startfd is ignored */ if (startchar == '/') startvp = NULL; else { if ((startfp = getf(startfd)) == NULL) return (set_errno(EBADF)); startvp = startfp->f_vnode; VN_HOLD(startvp); releasef(startfd); } } if (filemode & FXATTR) { /* * Make sure we have a valid request. * We must either have a real fd or AT_FDCWD */ if (startfd != AT_FDCWD && startvp == NULL) { error = EINVAL; goto out; } if (error = pn_get(fname, UIO_USERSPACE, &pn)) { goto out; } if (startfd == AT_FDCWD) { mutex_enter(&p->p_lock); startvp = PTOU(p)->u_cdir; VN_HOLD(startvp); mutex_exit(&p->p_lock); } /* * Verify permission to put attributes on file */ if ((VOP_ACCESS(startvp, VREAD, 0, CRED()) != 0) && (VOP_ACCESS(startvp, VWRITE, 0, CRED()) != 0) && (VOP_ACCESS(startvp, VEXEC, 0, CRED()) != 0)) { error = EACCES; pn_free(&pn); goto out; } if ((startvp->v_vfsp->vfs_flag & VFS_XATTR) != 0) { error = VOP_LOOKUP(startvp, "", &sdvp, &pn, LOOKUP_XATTR|CREATE_XATTR_DIR, rootvp, CRED()); } else { error = EINVAL; } pn_free(&pn); if (error != 0) goto out; VN_RELE(startvp); startvp = sdvp; } if ((filemode & (FREAD|FWRITE)) != 0) { if ((filemode & (FNONBLOCK|FNDELAY)) == (FNONBLOCK|FNDELAY)) filemode &= ~FNDELAY; error = falloc((vnode_t *)NULL, filemode, &fp, &fd); if (error == 0) { #ifdef C2_AUDIT if (audit_active) audit_setfsat_path(1); #endif /* C2_AUDIT */ /* * Last arg is a don't-care term if * !(filemode & FCREAT). */ error = vn_openat(fname, UIO_USERSPACE, filemode, (int)(createmode & MODEMASK), &vp, CRCREAT, u.u_cmask, startvp); if (startvp != NULL) VN_RELE(startvp); if (error == 0) { #ifdef C2_AUDIT if (audit_active) audit_copen(fd, fp, vp); #endif /* C2_AUDIT */ if ((vp->v_flag & VDUP) == 0) { fp->f_vnode = vp; mutex_exit(&fp->f_tlock); /* * We must now fill in the slot * falloc reserved. */ setf(fd, fp); return (fd); } else { /* * Special handling for /dev/fd. * Give up the file pointer * and dup the indicated file descriptor * (in v_rdev). This is ugly, but I've * seen worse. */ unfalloc(fp); dupfd = getminor(vp->v_rdev); type = vp->v_type; mutex_enter(&vp->v_lock); vp->v_flag &= ~VDUP; mutex_exit(&vp->v_lock); VN_RELE(vp); if (type != VCHR) return (set_errno(EINVAL)); if ((fp = getf(dupfd)) == NULL) { setf(fd, NULL); return (set_errno(EBADF)); } mutex_enter(&fp->f_tlock); fp->f_count++; mutex_exit(&fp->f_tlock); setf(fd, fp); releasef(dupfd); } return (fd); } else { setf(fd, NULL); unfalloc(fp); return (set_errno(error)); } } } else { error = EINVAL; } out: if (startvp != NULL) VN_RELE(startvp); return (set_errno(error)); }
int sendsig(int sig, k_siginfo_t *sip, void (*hdlr)()) { volatile int minstacksz; int newstack; label_t ljb; volatile caddr_t sp; caddr_t fp; volatile struct regs *rp; volatile greg_t upc; volatile proc_t *p = ttoproc(curthread); struct as *as = p->p_as; klwp_t *lwp = ttolwp(curthread); ucontext_t *volatile tuc = NULL; ucontext_t *uc; siginfo_t *sip_addr; volatile int watched; /* * This routine is utterly dependent upon STACK_ALIGN being * 16 and STACK_ENTRY_ALIGN being 8. Let's just acknowledge * that and require it. */ #if STACK_ALIGN != 16 || STACK_ENTRY_ALIGN != 8 #error "sendsig() amd64 did not find the expected stack alignments" #endif rp = lwptoregs(lwp); upc = rp->r_pc; /* * Since we're setting up to run the signal handler we have to * arrange that the stack at entry to the handler is (only) * STACK_ENTRY_ALIGN (i.e. 8) byte aligned so that when the handler * executes its push of %rbp, the stack realigns to STACK_ALIGN * (i.e. 16) correctly. * * The new sp will point to the sigframe and the ucontext_t. The * above means that sp (and thus sigframe) will be 8-byte aligned, * but not 16-byte aligned. ucontext_t, however, contains %xmm regs * which must be 16-byte aligned. Because of this, for correct * alignment, sigframe must be a multiple of 8-bytes in length, but * not 16-bytes. This will place ucontext_t at a nice 16-byte boundary. */ /* LINTED: logical expression always true: op "||" */ ASSERT((sizeof (struct sigframe) % 16) == 8); minstacksz = sizeof (struct sigframe) + SA(sizeof (*uc)); if (sip != NULL) minstacksz += SA(sizeof (siginfo_t)); ASSERT((minstacksz & (STACK_ENTRY_ALIGN - 1ul)) == 0); /* * Figure out whether we will be handling this signal on * an alternate stack specified by the user. Then allocate * and validate the stack requirements for the signal handler * context. on_fault will catch any faults. */ newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) && !(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE)); if (newstack) { fp = (caddr_t)(SA((uintptr_t)lwp->lwp_sigaltstack.ss_sp) + SA(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN); } else { /* * Drop below the 128-byte reserved region of the stack frame * we're interrupting. */ fp = (caddr_t)rp->r_sp - STACK_RESERVE; } /* * Force proper stack pointer alignment, even in the face of a * misaligned stack pointer from user-level before the signal. */ fp = (caddr_t)((uintptr_t)fp & ~(STACK_ENTRY_ALIGN - 1ul)); /* * Most of the time during normal execution, the stack pointer * is aligned on a STACK_ALIGN (i.e. 16 byte) boundary. However, * (for example) just after a call instruction (which pushes * the return address), the callers stack misaligns until the * 'push %rbp' happens in the callee prolog. So while we should * expect the stack pointer to be always at least STACK_ENTRY_ALIGN * aligned, we should -not- expect it to always be STACK_ALIGN aligned. * We now adjust to ensure that the new sp is aligned to * STACK_ENTRY_ALIGN but not to STACK_ALIGN. 
*/ sp = fp - minstacksz; if (((uintptr_t)sp & (STACK_ALIGN - 1ul)) == 0) { sp -= STACK_ENTRY_ALIGN; minstacksz = fp - sp; } /* * Now, make sure the resulting signal frame address is sane */ if (sp >= as->a_userlimit || fp >= as->a_userlimit) { #ifdef DEBUG printf("sendsig: bad signal stack cmd=%s, pid=%d, sig=%d\n", PTOU(p)->u_comm, p->p_pid, sig); printf("sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n", (void *)sp, (void *)hdlr, (uintptr_t)upc); printf("sp above USERLIMIT\n"); #endif return (0); } watched = watch_disable_addr((caddr_t)sp, minstacksz, S_WRITE); if (on_fault(&ljb)) goto badstack; if (sip != NULL) { zoneid_t zoneid; fp -= SA(sizeof (siginfo_t)); uzero(fp, sizeof (siginfo_t)); if (SI_FROMUSER(sip) && (zoneid = p->p_zone->zone_id) != GLOBAL_ZONEID && zoneid != sip->si_zoneid) { k_siginfo_t sani_sip = *sip; sani_sip.si_pid = p->p_zone->zone_zsched->p_pid; sani_sip.si_uid = 0; sani_sip.si_ctid = -1; sani_sip.si_zoneid = zoneid; copyout_noerr(&sani_sip, fp, sizeof (sani_sip)); } else copyout_noerr(sip, fp, sizeof (*sip)); sip_addr = (siginfo_t *)fp; if (sig == SIGPROF && curthread->t_rprof != NULL && curthread->t_rprof->rp_anystate) { /* * We stand on our head to deal with * the real time profiling signal. * Fill in the stuff that doesn't fit * in a normal k_siginfo structure. */ int i = sip->si_nsysarg; while (--i >= 0) sulword_noerr( (ulong_t *)&(sip_addr->si_sysarg[i]), (ulong_t)lwp->lwp_arg[i]); copyout_noerr(curthread->t_rprof->rp_state, sip_addr->si_mstate, sizeof (curthread->t_rprof->rp_state)); } } else sip_addr = NULL; /* * save the current context on the user stack directly after the * sigframe. Since sigframe is 8-byte-but-not-16-byte aligned, * and since sizeof (struct sigframe) is 24, this guarantees * 16-byte alignment for ucontext_t and its %xmm registers. */ uc = (ucontext_t *)(sp + sizeof (struct sigframe)); tuc = kmem_alloc(sizeof (*tuc), KM_SLEEP); no_fault(); savecontext(tuc, &lwp->lwp_sigoldmask); if (on_fault(&ljb)) goto badstack; copyout_noerr(tuc, uc, sizeof (*tuc)); kmem_free(tuc, sizeof (*tuc)); tuc = NULL; lwp->lwp_oldcontext = (uintptr_t)uc; if (newstack) { lwp->lwp_sigaltstack.ss_flags |= SS_ONSTACK; if (lwp->lwp_ustack) copyout_noerr(&lwp->lwp_sigaltstack, (stack_t *)lwp->lwp_ustack, sizeof (stack_t)); } /* * Set up signal handler return and stack linkage */ { struct sigframe frame; /* * ensure we never return "normally" */ frame.retaddr = (caddr_t)(uintptr_t)-1L; frame.signo = sig; frame.sip = sip_addr; copyout_noerr(&frame, sp, sizeof (frame)); } no_fault(); if (watched) watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE); /* * Set up user registers for execution of signal handler. */ rp->r_sp = (greg_t)sp; rp->r_pc = (greg_t)hdlr; rp->r_ps = PSL_USER | (rp->r_ps & PS_IOPL); rp->r_rdi = sig; rp->r_rsi = (uintptr_t)sip_addr; rp->r_rdx = (uintptr_t)uc; if ((rp->r_cs & 0xffff) != UCS_SEL || (rp->r_ss & 0xffff) != UDS_SEL) { /* * Try our best to deliver the signal. */ rp->r_cs = UCS_SEL; rp->r_ss = UDS_SEL; } /* * Don't set lwp_eosys here. sendsig() is called via psig() after * lwp_eosys is handled, so setting it here would affect the next * system call. */ return (1); badstack: no_fault(); if (watched) watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE); if (tuc) kmem_free(tuc, sizeof (*tuc)); #ifdef DEBUG printf("sendsig: bad signal stack cmd=%s, pid=%d, sig=%d\n", PTOU(p)->u_comm, p->p_pid, sig); printf("on fault, sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n", (void *)sp, (void *)hdlr, (uintptr_t)upc); #endif return (0); }
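/*
 * Illustrative arithmetic sketch (not kernel code): with STACK_ALIGN == 16,
 * STACK_ENTRY_ALIGN == 8, and sizeof (struct sigframe) % 16 == 8 as asserted
 * above (the comment gives 24 as the size), an sp that is 8-byte but not
 * 16-byte aligned puts the ucontext_t at sp + sizeof (struct sigframe) on a
 * 16-byte boundary, which is what the %xmm save area requires.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uintptr_t sp = 0x7fffffffe018;          /* hypothetical: 8 mod 16 */
        size_t sigframe_size = 24;              /* 24 % 16 == 8 */
        uintptr_t uc = sp + sigframe_size;

        printf("sp %% 16 = %lu, uc %% 16 = %lu\n",
            (unsigned long)(sp % 16), (unsigned long)(uc % 16));
        return (0);
}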
/* * The additional flag, LOOKUP_CHECKREAD, is used to enforce artificial * constraints in order to be standards compliant. For example, if we have * the cached path of '/foo/bar', and '/foo' has permissions 100 (execute * only), then we can legitimately look up the path to the current working * directory without needing read permission. Existing standards tests, * however, assume that we are determining the path by repeatedly looking up * "..". We need to keep this behavior in order to maintain backwards * compatibility. */ static int vnodetopath_common(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, cred_t *cr, int flags) { pathname_t pn, rpn; int ret, len; vnode_t *compvp, *pvp, *realvp; proc_t *p = curproc; char path[MAXNAMELEN]; int doclose = 0; /* * If vrootp is NULL, get the root for curproc. Callers with any other * requirements should pass in a different vrootp. */ if (vrootp == NULL) { mutex_enter(&p->p_lock); if ((vrootp = PTOU(p)->u_rdir) == NULL) vrootp = rootdir; VN_HOLD(vrootp); mutex_exit(&p->p_lock); } else { VN_HOLD(vrootp); } /* * This is to get around an annoying artifact of the /proc filesystem, * which is the behavior of {cwd/root}. Trying to resolve this path * will result in /proc/pid/cwd instead of whatever the real working * directory is. We can't rely on VOP_REALVP(), since that will break * lofs. The only difference between procfs and lofs is that opening * the file will return the underlying vnode in the case of procfs. */ if (vp->v_type == VDIR && VOP_REALVP(vp, &realvp, NULL) == 0 && realvp != vp) { VN_HOLD(vp); if (VOP_OPEN(&vp, FREAD, cr, NULL) == 0) doclose = 1; else VN_RELE(vp); } pn_alloc(&pn); /* * Check to see if we have a cached path in the vnode. */ mutex_enter(&vp->v_lock); if (vp->v_path != NULL) { (void) pn_set(&pn, vp->v_path); mutex_exit(&vp->v_lock); pn_alloc(&rpn); /* We should only cache absolute paths */ ASSERT(pn.pn_buf[0] == '/'); /* * If we are in a zone or a chroot environment, then we have to * take additional steps, since the path to the root might not * be readable with the current credentials, even though the * process can legitimately access the file. In this case, we * do the following: * * lookuppnvp() with all privileges to get the resolved path. * call localpath() to get the local portion of the path, and * continue as normal. * * If the conversion to a local path fails, then we continue * as normal. This is a heuristic to make process object file * paths available from within a zone. Because lofs doesn't * support page operations, the vnode stored in the seg_t is * actually the underlying real vnode, not the lofs node itself. * Most of the time, the lofs path is the same as the underlying * vnode (for example, /usr/lib/libc.so.1). */ if (vrootp != rootdir) { char *local = NULL; VN_HOLD(rootdir); if (lookuppnvp(&pn, &rpn, FOLLOW, NULL, &compvp, rootdir, rootdir, kcred) == 0) { local = localpath(rpn.pn_path, vrootp, kcred); VN_RELE(compvp); } /* * The original pn was changed through lookuppnvp(). * Set it to local for next validation attempt. */ if (local) { (void) pn_set(&pn, local); } else { goto notcached; } } /* * We should have a local path at this point, so start the * search from the root of the current process. */ VN_HOLD(vrootp); if (vrootp != rootdir) VN_HOLD(vrootp); ret = lookuppnvp(&pn, &rpn, FOLLOW | flags, NULL, &compvp, vrootp, vrootp, cr); if (ret == 0) { /* * Check to see if the returned vnode is the same as * the one we expect. If not, give up.
*/ if (!vn_compare(vp, compvp) && !vnode_match(vp, compvp, cr)) { VN_RELE(compvp); goto notcached; } VN_RELE(compvp); /* * Return the result. */ if (buflen <= rpn.pn_pathlen) goto notcached; bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1); pn_free(&pn); pn_free(&rpn); VN_RELE(vrootp); if (doclose) { (void) VOP_CLOSE(vp, FREAD, 1, 0, cr, NULL); VN_RELE(vp); } return (0); } notcached: pn_free(&rpn); } else { mutex_exit(&vp->v_lock); } pn_free(&pn); if (vp->v_type != VDIR) { /* * If we don't have a directory, try to find it in the dnlc via * reverse lookup. Once this is found, we can use the regular * directory search to find the full path. */ if ((pvp = dnlc_reverse_lookup(vp, path, MAXNAMELEN)) != NULL) { /* * Check if we have read privilege so that * we can look up the path in the directory */ ret = 0; if ((flags & LOOKUP_CHECKREAD)) { ret = VOP_ACCESS(pvp, VREAD, 0, cr, NULL); } if (ret == 0) { ret = dirtopath(vrootp, pvp, buf, buflen, flags, cr); } if (ret == 0) { len = strlen(buf); if (len + strlen(path) + 1 >= buflen) { ret = ENAMETOOLONG; } else { if (buf[len - 1] != '/') buf[len++] = '/'; bcopy(path, buf + len, strlen(path) + 1); } } VN_RELE(pvp); } else ret = ENOENT; } else ret = dirtopath(vrootp, vp, buf, buflen, flags, cr); VN_RELE(vrootp); if (doclose) { (void) VOP_CLOSE(vp, FREAD, 1, 0, cr, NULL); VN_RELE(vp); } return (ret); }
void main(void) { proc_t *p = ttoproc(curthread); /* &p0 */ int (**initptr)(); extern void sched(); extern void fsflush(); extern int (*init_tbl[])(); extern int (*mp_init_tbl[])(); extern id_t syscid, defaultcid; extern int swaploaded; extern int netboot; extern ib_boot_prop_t *iscsiboot_prop; extern void vm_init(void); extern void cbe_init_pre(void); extern void cbe_init(void); extern void clock_tick_init_pre(void); extern void clock_tick_init_post(void); extern void clock_init(void); extern void physio_bufs_init(void); extern void pm_cfb_setup_intr(void); extern int pm_adjust_timestamps(dev_info_t *, void *); extern void start_other_cpus(int); extern void sysevent_evc_thrinit(); extern kmutex_t ualock; #if defined(__x86) extern void fastboot_post_startup(void); extern void progressbar_start(void); #endif /* * In the horrible world of x86 in-lines, you can't get symbolic * structure offsets a la genassym. This assertion is here so * that the next poor slob who innocently changes the offset of * cpu_thread doesn't waste as much time as I just did finding * out that it's hard-coded in i86/ml/i86.il. Similarly for * curcpup. You're welcome. */ ASSERT(CPU == CPU->cpu_self); ASSERT(curthread == CPU->cpu_thread); ASSERT_STACK_ALIGNED(); /* * We take the ualock until we have completed the startup * to prevent kadmin() from disrupting this work. In particular, * we don't want kadmin() to bring the system down while we are * trying to start it up. */ mutex_enter(&ualock); /* * Setup root lgroup and leaf lgroup for CPU 0 */ lgrp_init(LGRP_INIT_STAGE2); /* * Once 'startup()' completes, the thread_reaper() daemon would be * created(in thread_init()). After that, it is safe to create threads * that could exit. These exited threads will get reaped. */ startup(); segkmem_gc(); callb_init(); cbe_init_pre(); /* x86 must initialize gethrtimef before timer_init */ ddi_periodic_init(); cbe_init(); callout_init(); /* callout table MUST be init'd after cyclics */ clock_tick_init_pre(); clock_init(); #if defined(__x86) /* * The progressbar thread uses cv_reltimedwait() and hence needs to be * started after the callout mechanism has been initialized. */ progressbar_start(); #endif /* * On some platforms, clkinitf() changes the timing source that * gethrtime_unscaled() uses to generate timestamps. cbe_init() calls * clkinitf(), so re-initialize the microstate counters after the * timesource has been chosen. */ init_mstate(&t0, LMS_SYSTEM); init_cpu_mstate(CPU, CMS_SYSTEM); /* * May need to probe to determine latencies from CPU 0 after * gethrtime() comes alive in cbe_init() and before enabling interrupts * and copy and release any temporary memory allocated with BOP_ALLOC() * before release_bootstrap() frees boot memory */ lgrp_init(LGRP_INIT_STAGE3); /* * Call all system initialization functions. */ for (initptr = &init_tbl[0]; *initptr; initptr++) (**initptr)(); /* * Load iSCSI boot properties */ ld_ib_prop(); /* * initialize vm related stuff. */ vm_init(); /* * initialize buffer pool for raw I/O requests */ physio_bufs_init(); ttolwp(curthread)->lwp_error = 0; /* XXX kludge for SCSI driver */ /* * Drop the interrupt level and allow interrupts. At this point * the DDI guarantees that interrupts are enabled. 
*/ (void) spl0(); interrupts_unleashed = 1; /* * Create kmem cache for proc structures */ process_cache = kmem_cache_create("process_cache", sizeof (proc_t), 0, NULL, NULL, NULL, NULL, NULL, 0); vfs_mountroot(); /* Mount the root file system */ errorq_init(); /* after vfs_mountroot() so DDI root is ready */ cpu_kstat_init(CPU); /* after vfs_mountroot() so TOD is valid */ ddi_walk_devs(ddi_root_node(), pm_adjust_timestamps, NULL); /* after vfs_mountroot() so hrestime is valid */ post_startup(); swaploaded = 1; /* * Initialize Solaris Audit Subsystem */ audit_init(); /* * Plumb the protocol modules and drivers only if we are not * networked booted, in this case we already did it in rootconf(). */ if (netboot == 0 && iscsiboot_prop == NULL) (void) strplumb(); gethrestime(&PTOU(curproc)->u_start); curthread->t_start = PTOU(curproc)->u_start.tv_sec; p->p_mstart = gethrtime(); /* * Perform setup functions that can only be done after root * and swap have been set up. */ consconfig(); #ifndef __sparc release_bootstrap(); #endif /* * attach drivers with ddi-forceattach prop * It must be done early enough to load hotplug drivers (e.g. * pcmcia nexus) so that devices enumerated via hotplug is * available before I/O subsystem is fully initialized. */ i_ddi_forceattach_drivers(); /* * Set the scan rate and other parameters of the paging subsystem. */ setupclock(0); /* * Initialize process 0's lwp directory and lwpid hash table. */ p->p_lwpdir = p->p_lwpfree = p0_lwpdir; p->p_lwpdir->ld_next = p->p_lwpdir + 1; p->p_lwpdir_sz = 2; p->p_tidhash = p0_tidhash; p->p_tidhash_sz = 2; p0_lep.le_thread = curthread; p0_lep.le_lwpid = curthread->t_tid; p0_lep.le_start = curthread->t_start; lwp_hash_in(p, &p0_lep, p0_tidhash, 2, 0); /* * Initialize extended accounting. */ exacct_init(); /* * Initialize threads of sysevent event channels */ sysevent_evc_thrinit(); /* * This must be done after post_startup() but before * start_other_cpus() */ lgrp_init(LGRP_INIT_STAGE4); /* * Perform MP initialization, if any. */ start_other_cpus(0); #ifdef __sparc /* * Release bootstrap here since PROM interfaces are * used to start other CPUs above. */ release_bootstrap(); #endif /* * Finish lgrp initialization after all CPUS are brought online. */ lgrp_init(LGRP_INIT_STAGE5); /* * After mp_init(), number of cpus are known (this is * true for the time being, when there are actually * hot pluggable cpus then this scheme would not do). * Any per cpu initialization is done here. */ kmem_mp_init(); vmem_update(NULL); clock_tick_init_post(); for (initptr = &mp_init_tbl[0]; *initptr; initptr++) (**initptr)(); /* * These must be called after start_other_cpus */ pm_cfb_setup_intr(); #if defined(__x86) fastboot_post_startup(); #endif /* * Make init process; enter scheduling loop with system process. * * Note that we manually assign the pids for these processes, for * historical reasons. If more pre-assigned pids are needed, * FAMOUS_PIDS will have to be updated. 
*/ /* create init process */ if (newproc(start_init, NULL, defaultcid, 59, NULL, FAMOUS_PID_INIT)) panic("main: unable to fork init."); /* create pageout daemon */ if (newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL, FAMOUS_PID_PAGEOUT)) panic("main: unable to fork pageout()"); /* create fsflush daemon */ if (newproc(fsflush, NULL, syscid, minclsyspri, NULL, FAMOUS_PID_FSFLUSH)) panic("main: unable to fork fsflush()"); /* create cluster process if we're a member of one */ if (cluster_bootflags & CLUSTER_BOOTED) { if (newproc(cluster_wrapper, NULL, syscid, minclsyspri, NULL, 0)) { panic("main: unable to fork cluster()"); } } /* * Create system threads (threads are associated with p0) */ /* create module uninstall daemon */ /* BugID 1132273. If swapping over NFS need a bigger stack */ (void) thread_create(NULL, 0, (void (*)())mod_uninstall_daemon, NULL, 0, &p0, TS_RUN, minclsyspri); (void) thread_create(NULL, 0, seg_pasync_thread, NULL, 0, &p0, TS_RUN, minclsyspri); pid_setmin(); /* system is now ready */ mutex_exit(&ualock); bcopy("sched", PTOU(curproc)->u_psargs, 6); bcopy("sched", PTOU(curproc)->u_comm, 5); sched(); /* NOTREACHED */ }
static int copen(int startfd, char *fname, int filemode, int createmode) { struct pathname pn; vnode_t *vp, *sdvp; file_t *fp, *startfp; enum vtype type; int error; int fd, dupfd; vnode_t *startvp; proc_t *p = curproc; uio_seg_t seg = UIO_USERSPACE; char *open_filename = fname; uint32_t auditing = AU_AUDITING(); char startchar; if (filemode & (FSEARCH|FEXEC)) { /* * Must be one or the other and neither FREAD nor FWRITE * Must not be any of FAPPEND FCREAT FTRUNC FXATTR FXATTRDIROPEN * XXX: Should these just be silently ignored? */ if ((filemode & (FREAD|FWRITE)) || (filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) || (filemode & (FAPPEND|FCREAT|FTRUNC|FXATTR|FXATTRDIROPEN))) return (set_errno(EINVAL)); } if (startfd == AT_FDCWD) { /* * Regular open() */ startvp = NULL; } else { /* * We're here via openat() */ if (copyin(fname, &startchar, sizeof (char))) return (set_errno(EFAULT)); /* * if startchar is / then startfd is ignored */ if (startchar == '/') startvp = NULL; else { if ((startfp = getf(startfd)) == NULL) return (set_errno(EBADF)); startvp = startfp->f_vnode; VN_HOLD(startvp); releasef(startfd); } } /* * Handle __openattrdirat() requests */ if (filemode & FXATTRDIROPEN) { if (auditing && startvp != NULL) audit_setfsat_path(1); if (error = lookupnameat(fname, seg, FOLLOW, NULLVPP, &vp, startvp)) return (set_errno(error)); if (startvp != NULL) VN_RELE(startvp); startvp = vp; } /* * Do we need to go into extended attribute space? */ if (filemode & FXATTR) { if (startfd == AT_FDCWD) { if (copyin(fname, &startchar, sizeof (char))) return (set_errno(EFAULT)); /* * If startchar == '/' then no extended attributes * are looked up. */ if (startchar == '/') { startvp = NULL; } else { mutex_enter(&p->p_lock); startvp = PTOU(p)->u_cdir; VN_HOLD(startvp); mutex_exit(&p->p_lock); } } /* * Make sure we have a valid extended attribute request. * We must either have a real fd or AT_FDCWD and a relative * pathname. */ if (startvp == NULL) { goto noxattr; } } if (filemode & (FXATTR|FXATTRDIROPEN)) { vattr_t vattr; if (error = pn_get(fname, UIO_USERSPACE, &pn)) { goto out; } /* * In order to access hidden attribute directory the * user must be able to stat() the file */ vattr.va_mask = AT_ALL; if (error = VOP_GETATTR(startvp, &vattr, 0, CRED(), NULL)) { pn_free(&pn); goto out; } if ((startvp->v_vfsp->vfs_flag & VFS_XATTR) != 0 || vfs_has_feature(startvp->v_vfsp, VFSFT_SYSATTR_VIEWS)) { error = VOP_LOOKUP(startvp, "", &sdvp, &pn, (filemode & FXATTRDIROPEN) ? LOOKUP_XATTR : LOOKUP_XATTR|CREATE_XATTR_DIR, rootvp, CRED(), NULL, NULL, NULL); } else { error = EINVAL; } /* * For __openattrdirat() use "." as filename to open * as part of vn_openat() */ if (error == 0 && (filemode & FXATTRDIROPEN)) { open_filename = "."; seg = UIO_SYSSPACE; } pn_free(&pn); if (error != 0) goto out; VN_RELE(startvp); startvp = sdvp; } noxattr: if ((filemode & (FREAD|FWRITE|FSEARCH|FEXEC|FXATTRDIROPEN)) != 0) { if ((filemode & (FNONBLOCK|FNDELAY)) == (FNONBLOCK|FNDELAY)) filemode &= ~FNDELAY; error = falloc((vnode_t *)NULL, filemode, &fp, &fd); if (error == 0) { if (auditing && startvp != NULL) audit_setfsat_path(1); /* * Last arg is a don't-care term if * !(filemode & FCREAT). */ error = vn_openat(open_filename, seg, filemode, (int)(createmode & MODEMASK), &vp, CRCREAT, PTOU(curproc)->u_cmask, startvp, fd); if (startvp != NULL) VN_RELE(startvp); if (error == 0) { if ((vp->v_flag & VDUP) == 0) { fp->f_vnode = vp; mutex_exit(&fp->f_tlock); /* * We must now fill in the slot * falloc reserved. 
*/ setf(fd, fp); return (fd); } else { /* * Special handling for /dev/fd. * Give up the file pointer * and dup the indicated file descriptor * (in v_rdev). This is ugly, but I've * seen worse. */ unfalloc(fp); dupfd = getminor(vp->v_rdev); type = vp->v_type; mutex_enter(&vp->v_lock); vp->v_flag &= ~VDUP; mutex_exit(&vp->v_lock); VN_RELE(vp); if (type != VCHR) return (set_errno(EINVAL)); if ((fp = getf(dupfd)) == NULL) { setf(fd, NULL); return (set_errno(EBADF)); } mutex_enter(&fp->f_tlock); fp->f_count++; mutex_exit(&fp->f_tlock); setf(fd, fp); releasef(dupfd); } return (fd); } else { setf(fd, NULL); unfalloc(fp); return (set_errno(error)); } } } else { error = EINVAL; } out: if (startvp != NULL) VN_RELE(startvp); return (set_errno(error)); }
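/*
 * Illustrative userland sketch: copen() above treats AT_FDCWD as a plain
 * open(2), ignores the directory fd when the path begins with '/', and
 * otherwise resolves the name relative to the vnode obtained from
 * getf(startfd), which is how openat(2) reaches it.
 */
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
        int dirfd, fd;

        dirfd = open("/tmp", O_RDONLY);         /* directory used as the starting point */
        if (dirfd < 0) {
                perror("open /tmp");
                return (1);
        }
        fd = openat(dirfd, "example.txt", O_CREAT | O_WRONLY, 0644);
        if (fd < 0)
                perror("openat");
        else
                (void) close(fd);
        (void) close(dirfd);
        return (0);
}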
/* * setppriv (priv_op_t, priv_ptype_t, priv_set_t) */ static int setppriv(priv_op_t op, priv_ptype_t type, priv_set_t *in_pset) { priv_set_t pset, *target; cred_t *cr, *pcr; proc_t *p; boolean_t donocd = B_FALSE; if (!PRIV_VALIDSET(type) || !PRIV_VALIDOP(op)) return (set_errno(EINVAL)); if (copyin(in_pset, &pset, sizeof (priv_set_t))) return (set_errno(EFAULT)); p = ttoproc(curthread); cr = cralloc(); mutex_enter(&p->p_crlock); retry: pcr = p->p_cred; if (AU_AUDITING()) audit_setppriv(op, type, &pset, pcr); /* * Filter out unallowed request (bad op and bad type) */ switch (op) { case PRIV_ON: case PRIV_SET: /* * Turning on privileges; the limit set cannot grow, * other sets can but only as long as they remain subsets * of P. Only immediately after exec holds that P <= L. */ if (type == PRIV_LIMIT && !priv_issubset(&pset, &CR_LPRIV(pcr))) { mutex_exit(&p->p_crlock); crfree(cr); return (set_errno(EPERM)); } if (!priv_issubset(&pset, &CR_OPPRIV(pcr)) && !priv_issubset(&pset, priv_getset(pcr, type))) { mutex_exit(&p->p_crlock); /* Policy override should not grow beyond L either */ if (type != PRIV_INHERITABLE || !priv_issubset(&pset, &CR_LPRIV(pcr)) || secpolicy_require_privs(CRED(), &pset) != 0) { crfree(cr); return (set_errno(EPERM)); } mutex_enter(&p->p_crlock); if (pcr != p->p_cred) goto retry; donocd = B_TRUE; } break; case PRIV_OFF: /* PRIV_OFF is always allowed */ break; } /* * OK! everything is cool. * Do cred COW. */ crcopy_to(pcr, cr); /* * If we change the effective, permitted or limit set, we attain * "privilege awareness". */ if (type != PRIV_INHERITABLE) priv_set_PA(cr); target = &(CR_PRIVS(cr)->crprivs[type]); switch (op) { case PRIV_ON: priv_union(&pset, target); break; case PRIV_OFF: priv_inverse(&pset); priv_intersect(target, &pset); /* * Fall-thru to set target and change other process * privilege sets. */ /*FALLTHRU*/ case PRIV_SET: *target = pset; /* * Take privileges no longer permitted out * of other effective sets as well. * Limit set is enforced at exec() time. */ if (type == PRIV_PERMITTED) priv_intersect(&pset, &CR_EPRIV(cr)); break; } /* * When we give up privileges not in the inheritable set, * set SNOCD if not already set; first we compute the * privileges removed from P using Diff = (~P') & P * and then we check whether the removed privileges are * a subset of I. If we retain uid 0, all privileges * are required anyway so don't set SNOCD. */ if (type == PRIV_PERMITTED && (p->p_flag & SNOCD) == 0 && cr->cr_uid != 0 && cr->cr_ruid != 0 && cr->cr_suid != 0) { priv_set_t diff = CR_OPPRIV(cr); priv_inverse(&diff); priv_intersect(&CR_OPPRIV(pcr), &diff); donocd = !priv_issubset(&diff, &CR_IPRIV(cr)); } p->p_cred = cr; mutex_exit(&p->p_crlock); if (donocd) { mutex_enter(&p->p_lock); p->p_flag |= SNOCD; mutex_exit(&p->p_lock); } /* * The basic_test privilege should not be removed from E; * if that has happened, then some programmer typically set the E/P to * empty. That is not portable. */ if ((type == PRIV_EFFECTIVE || type == PRIV_PERMITTED) && priv_basic_test >= 0 && !PRIV_ISASSERT(target, priv_basic_test)) { proc_t *p = curproc; pid_t pid = p->p_pid; char *fn = PTOU(p)->u_comm; cmn_err(CE_WARN, "%s[%d]: setppriv: basic_test privilege " "removed from E/P", fn, pid); } crset(p, cr); /* broadcast to process threads */ return (0); }
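/*
 * Illustrative userland sketch (assuming the <priv.h> interfaces documented
 * in priv_addset(3C)): dropping a privilege from P and E reaches the
 * setppriv() entry point above with PRIV_OFF, which the kernel always allows.
 */
#include <priv.h>
#include <stdio.h>

int
drop_fork_privilege(void)
{
        priv_set_t *set;

        if ((set = priv_allocset()) == NULL)
                return (-1);
        priv_emptyset(set);
        (void) priv_addset(set, PRIV_PROC_FORK);        /* "proc_fork" */
        if (setppriv(PRIV_OFF, PRIV_PERMITTED, set) != 0 ||
            setppriv(PRIV_OFF, PRIV_EFFECTIVE, set) != 0) {
                perror("setppriv");
                priv_freeset(set);
                return (-1);
        }
        priv_freeset(set);
        return (0);
}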
int dogetcwd(char *buf, size_t buflen) { int ret; vnode_t *vp; vnode_t *compvp; refstr_t *cwd, *oldcwd; const char *value; pathname_t rpnp, pnp; proc_t *p = curproc; /* * Check to see if there is a cached version of the cwd. If so, lookup * the cached value and make sure it is the same vnode. */ mutex_enter(&p->p_lock); if ((cwd = PTOU(p)->u_cwd) != NULL) refstr_hold(cwd); vp = PTOU(p)->u_cdir; VN_HOLD(vp); mutex_exit(&p->p_lock); /* * Make sure we have permission to access the current directory. */ if ((ret = VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) != 0) { if (cwd != NULL) refstr_rele(cwd); VN_RELE(vp); return (ret); } if (cwd) { value = refstr_value(cwd); if ((ret = pn_get((char *)value, UIO_SYSSPACE, &pnp)) != 0) { refstr_rele(cwd); VN_RELE(vp); return (ret); } pn_alloc(&rpnp); if (lookuppn(&pnp, &rpnp, NO_FOLLOW, NULL, &compvp) == 0) { if (VN_CMP(vp, compvp) && strcmp(value, rpnp.pn_path) == 0) { VN_RELE(compvp); VN_RELE(vp); pn_free(&pnp); pn_free(&rpnp); if (strlen(value) + 1 > buflen) { refstr_rele(cwd); return (ENAMETOOLONG); } bcopy(value, buf, strlen(value) + 1); refstr_rele(cwd); return (0); } VN_RELE(compvp); } pn_free(&rpnp); pn_free(&pnp); refstr_rele(cwd); } ret = vnodetopath_common(NULL, vp, buf, buflen, CRED(), LOOKUP_CHECKREAD); VN_RELE(vp); /* * Store the new cwd and replace the existing cached copy. */ if (ret == 0) cwd = refstr_alloc(buf); else cwd = NULL; mutex_enter(&p->p_lock); oldcwd = PTOU(p)->u_cwd; PTOU(p)->u_cwd = cwd; mutex_exit(&p->p_lock); if (oldcwd) refstr_rele(oldcwd); return (ret); }
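/*
 * Illustrative userland sketch: dogetcwd() above backs getcwd(3C) and caches
 * the result in u_cwd so later calls can skip the repeated ".." lookups when
 * the working directory has not changed.
 */
#include <unistd.h>
#include <limits.h>
#include <stdio.h>

int
main(void)
{
        char buf[PATH_MAX];

        if (getcwd(buf, sizeof (buf)) == NULL) {
                perror("getcwd");       /* e.g. ERANGE when buf is too small */
                return (1);
        }
        printf("cwd: %s\n", buf);
        return (0);
}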
int kadmin(int cmd, int fcn, void *mdep, cred_t *credp) { int error = 0; char *buf; size_t buflen = 0; boolean_t invoke_cb = B_FALSE; /* * We might be called directly by the kernel's fault-handling code, so * we can't assert that the caller is in the global zone. */ /* * Make sure that cmd is one of the valid <sys/uadmin.h> command codes * and that we have appropriate privileges for this action. */ switch (cmd) { case A_FTRACE: case A_SHUTDOWN: case A_REBOOT: case A_REMOUNT: case A_FREEZE: case A_DUMP: case A_SDTTEST: case A_CONFIG: if (secpolicy_sys_config(credp, B_FALSE) != 0) return (EPERM); break; default: return (EINVAL); } /* * Serialize these operations on ualock. If it is held, the * system should shut down, reboot, or remount shortly, unless there is * an error. We need a cv rather than just a mutex because proper * functioning of A_REBOOT relies on being able to interrupt blocked * userland callers. * * We only clear ua_shutdown_thread after A_REMOUNT or A_CONFIG. * Other commands should never return. */ if (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_REMOUNT || cmd == A_CONFIG) { mutex_enter(&ualock); while (ua_shutdown_thread != NULL) { if (cv_wait_sig(&uacond, &ualock) == 0) { /* * If we were interrupted, leave, and handle * the signal (or exit, depending on what * happened) */ mutex_exit(&ualock); return (EINTR); } } ua_shutdown_thread = curthread; mutex_exit(&ualock); } switch (cmd) { case A_SHUTDOWN: { proc_t *p = ttoproc(curthread); /* * Release (almost) all of our own resources if we are called * from a user context. However, if we are calling kadmin() from * a kernel context, we do not release these resources. */ if (p != &p0) { proc_is_exiting(p); if ((error = exitlwps(0)) != 0) { /* * Another thread in this process also called * exitlwps(). */ mutex_enter(&ualock); ua_shutdown_thread = NULL; cv_signal(&uacond); mutex_exit(&ualock); return (error); } mutex_enter(&p->p_lock); p->p_flag |= SNOWAIT; sigfillset(&p->p_ignore); curthread->t_lwp->lwp_cursig = 0; curthread->t_lwp->lwp_extsig = 0; if (p->p_exec) { vnode_t *exec_vp = p->p_exec; p->p_exec = NULLVP; mutex_exit(&p->p_lock); VN_RELE(exec_vp); } else { mutex_exit(&p->p_lock); } pollcleanup(); closeall(P_FINFO(curproc)); relvm(); } else { /* * Reset t_cred if not set because much of the * filesystem code depends on CRED() being valid. */ if (curthread->t_cred == NULL) curthread->t_cred = kcred; } /* indicate shutdown in progress */ sys_shutdown = 1; /* * Communicate that init shouldn't be restarted. */ zone_shutdown_global(); killall(ALL_ZONES); /* * If we are calling kadmin() from a kernel context then we * do not release these resources. */ if (ttoproc(curthread) != &p0) { VN_RELE(PTOU(curproc)->u_cdir); if (PTOU(curproc)->u_rdir) VN_RELE(PTOU(curproc)->u_rdir); if (PTOU(curproc)->u_cwd) refstr_rele(PTOU(curproc)->u_cwd); PTOU(curproc)->u_cdir = rootdir; PTOU(curproc)->u_rdir = NULL; PTOU(curproc)->u_cwd = NULL; } /* * Allow the reboot/halt/poweroff code a chance to do * anything it needs to whilst we still have filesystems * mounted, like loading any modules necessary for later * performing the actual poweroff. */ if ((mdep != NULL) && (*(char *)mdep == '/')) { buf = i_convert_boot_device_name(mdep, NULL, &buflen); mdpreboot(cmd, fcn, buf); } else mdpreboot(cmd, fcn, mdep); /* * Allow fsflush to finish running and then prevent it * from ever running again so that vfs_unmountall() and * vfs_syncall() can acquire the vfs locks they need.
*/ sema_p(&fsflush_sema); (void) callb_execute_class(CB_CL_UADMIN_PRE_VFS, NULL); vfs_unmountall(); (void) VFS_MOUNTROOT(rootvfs, ROOT_UNMOUNT); vfs_syncall(); dump_ereports(); dump_messages(); invoke_cb = B_TRUE; /* FALLTHROUGH */ } case A_REBOOT: if ((mdep != NULL) && (*(char *)mdep == '/')) { buf = i_convert_boot_device_name(mdep, NULL, &buflen); mdboot(cmd, fcn, buf, invoke_cb); } else mdboot(cmd, fcn, mdep, invoke_cb); /* no return expected */ break; case A_CONFIG: switch (fcn) { case AD_UPDATE_BOOT_CONFIG: #ifndef __sparc { extern void fastboot_update_config(const char *); fastboot_update_config(mdep); } #endif break; } /* Let other threads enter the shutdown path now */ mutex_enter(&ualock); ua_shutdown_thread = NULL; cv_signal(&uacond); mutex_exit(&ualock); break; case A_REMOUNT: (void) VFS_MOUNTROOT(rootvfs, ROOT_REMOUNT); /* Let other threads enter the shutdown path now */ mutex_enter(&ualock); ua_shutdown_thread = NULL; cv_signal(&uacond); mutex_exit(&ualock); break; case A_FREEZE: { /* * This is the entrypoint for all suspend/resume actions. */ extern int cpr(int, void *); if (modload("misc", "cpr") == -1) return (ENOTSUP); /* Let the CPR module decide what to do with mdep */ error = cpr(fcn, mdep); break; } case A_FTRACE: { switch (fcn) { case AD_FTRACE_START: (void) FTRACE_START(); break; case AD_FTRACE_STOP: (void) FTRACE_STOP(); break; default: error = EINVAL; } break; } case A_DUMP: { if (fcn == AD_NOSYNC) { in_sync = 1; break; } panic_bootfcn = fcn; panic_forced = 1; if ((mdep != NULL) && (*(char *)mdep == '/')) { panic_bootstr = i_convert_boot_device_name(mdep, NULL, &buflen); } else panic_bootstr = mdep; #ifndef __sparc extern void fastboot_update_and_load(int, char *); fastboot_update_and_load(fcn, mdep); #endif panic("forced crash dump initiated at user request"); /*NOTREACHED*/ } case A_SDTTEST: { DTRACE_PROBE7(test, int, 1, int, 2, int, 3, int, 4, int, 5, int, 6, int, 7); break; } default: error = EINVAL; } return (error); }
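/*
 * Illustrative userland sketch (requires the privilege checked by
 * secpolicy_sys_config() above): kadmin() is reached through uadmin(2),
 * for example to request an orderly shutdown followed by power-off.
 */
#include <sys/uadmin.h>
#include <stdio.h>

int
main(void)
{
        if (uadmin(A_SHUTDOWN, AD_POWEROFF, 0) < 0) {
                perror("uadmin");       /* EPERM without sys_config privilege */
                return (1);
        }
        /* not reached on success */
        return (0);
}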
/* * Remove zombie children from the process table. */ void freeproc(proc_t *p) { proc_t *q; task_t *tk; ASSERT(p->p_stat == SZOMB); ASSERT(p->p_tlist == NULL); ASSERT(MUTEX_HELD(&pidlock)); sigdelq(p, NULL, 0); if (p->p_killsqp) { siginfofree(p->p_killsqp); p->p_killsqp = NULL; } prfree(p); /* inform /proc */ /* * Don't free the init processes. * Other dying processes will access it. */ if (p == proc_init) return; /* * We wait until now to free the cred structure because a * zombie process's credentials may be examined by /proc. * No cred locking needed because there are no threads at this point. */ upcount_dec(crgetruid(p->p_cred), crgetzoneid(p->p_cred)); crfree(p->p_cred); if (p->p_corefile != NULL) { corectl_path_rele(p->p_corefile); p->p_corefile = NULL; } if (p->p_content != NULL) { corectl_content_rele(p->p_content); p->p_content = NULL; } if (p->p_nextofkin && !((p->p_nextofkin->p_flag & SNOWAIT) || (PTOU(p->p_nextofkin)->u_signal[SIGCLD - 1] == SIG_IGN))) { /* * This should still do the right thing since p_utime/stime * get set to the correct value on process exit, so it * should get properly updated */ p->p_nextofkin->p_cutime += p->p_utime; p->p_nextofkin->p_cstime += p->p_stime; p->p_nextofkin->p_cacct[LMS_USER] += p->p_acct[LMS_USER]; p->p_nextofkin->p_cacct[LMS_SYSTEM] += p->p_acct[LMS_SYSTEM]; p->p_nextofkin->p_cacct[LMS_TRAP] += p->p_acct[LMS_TRAP]; p->p_nextofkin->p_cacct[LMS_TFAULT] += p->p_acct[LMS_TFAULT]; p->p_nextofkin->p_cacct[LMS_DFAULT] += p->p_acct[LMS_DFAULT]; p->p_nextofkin->p_cacct[LMS_KFAULT] += p->p_acct[LMS_KFAULT]; p->p_nextofkin->p_cacct[LMS_USER_LOCK] += p->p_acct[LMS_USER_LOCK]; p->p_nextofkin->p_cacct[LMS_SLEEP] += p->p_acct[LMS_SLEEP]; p->p_nextofkin->p_cacct[LMS_WAIT_CPU] += p->p_acct[LMS_WAIT_CPU]; p->p_nextofkin->p_cacct[LMS_STOPPED] += p->p_acct[LMS_STOPPED]; p->p_nextofkin->p_cru.minflt += p->p_ru.minflt; p->p_nextofkin->p_cru.majflt += p->p_ru.majflt; p->p_nextofkin->p_cru.nswap += p->p_ru.nswap; p->p_nextofkin->p_cru.inblock += p->p_ru.inblock; p->p_nextofkin->p_cru.oublock += p->p_ru.oublock; p->p_nextofkin->p_cru.msgsnd += p->p_ru.msgsnd; p->p_nextofkin->p_cru.msgrcv += p->p_ru.msgrcv; p->p_nextofkin->p_cru.nsignals += p->p_ru.nsignals; p->p_nextofkin->p_cru.nvcsw += p->p_ru.nvcsw; p->p_nextofkin->p_cru.nivcsw += p->p_ru.nivcsw; p->p_nextofkin->p_cru.sysc += p->p_ru.sysc; p->p_nextofkin->p_cru.ioch += p->p_ru.ioch; } q = p->p_nextofkin; if (q && q->p_orphan == p) q->p_orphan = p->p_nextorph; else if (q) { for (q = q->p_orphan; q; q = q->p_nextorph) if (q->p_nextorph == p) break; ASSERT(q && q->p_nextorph == p); q->p_nextorph = p->p_nextorph; } /* * The process table slot is being freed, so it is now safe to give up * task and project membership. */ mutex_enter(&p->p_lock); tk = p->p_task; task_detach(p); mutex_exit(&p->p_lock); proc_detach(p); pid_exit(p, tk); /* frees pid and proc structure */ task_rele(tk); }
int sigaction(int sig, struct sigaction *actp, struct sigaction *oactp) { struct sigaction act; struct sigaction oact; k_sigset_t set; proc_t *p; user_t *ua; int sigcld_look = 0; if (sig <= 0 || sig >= NSIG || (actp != NULL && sigismember(&cantmask, sig))) return (set_errno(EINVAL)); /* * act and oact might be the same address, so copyin act first. */ if (actp) { #if defined(__sparc) void (*handler)(); #endif if (copyin(actp, &act, sizeof (act))) return (set_errno(EFAULT)); #if defined(__sparc) /* * Check alignment of handler */ handler = act.sa_handler; if (handler != SIG_IGN && handler != SIG_DFL && ((uintptr_t)handler & 0x3) != 0) return (set_errno(EINVAL)); #endif } p = curproc; ua = PTOU(p); mutex_enter(&p->p_lock); if (oactp) { int flags; void (*disp)(); disp = ua->u_signal[sig - 1]; flags = 0; if (disp != SIG_DFL && disp != SIG_IGN) { set = ua->u_sigmask[sig-1]; if (sigismember(&p->p_siginfo, sig)) flags |= SA_SIGINFO; if (sigismember(&ua->u_sigrestart, sig)) flags |= SA_RESTART; if (sigismember(&ua->u_sigonstack, sig)) flags |= SA_ONSTACK; if (sigismember(&ua->u_sigresethand, sig)) flags |= SA_RESETHAND; if (sigismember(&ua->u_signodefer, sig)) flags |= SA_NODEFER; } else sigemptyset(&set); if (sig == SIGCLD) { if (p->p_flag & SNOWAIT) flags |= SA_NOCLDWAIT; if (!(p->p_flag & SJCTL)) flags |= SA_NOCLDSTOP; } oact.sa_handler = disp; oact.sa_flags = flags; sigktou(&set, &oact.sa_mask); } if (actp) { if (sig == SIGCLD) sigcld_look = 1; sigutok(&act.sa_mask, &set); setsigact(sig, act.sa_handler, &set, act.sa_flags); } mutex_exit(&p->p_lock); if (sigcld_look) sigcld_repost(); if (oactp && copyout(&oact, oactp, sizeof (oact))) return (set_errno(EFAULT)); return (0); }
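/*
 * Illustrative userland sketch: the flags assembled by sigaction() above
 * (SA_SIGINFO, SA_RESTART, SA_ONSTACK, ...) correspond to the per-signal
 * sets kept in the u-area (p_siginfo, u_sigrestart, u_sigonstack, ...).
 */
#include <signal.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

static void
handler(int sig, siginfo_t *sip, void *ucp)
{
        (void) ucp;
        /* only async-signal-safe work belongs here; printf is for illustration */
        printf("got signal %d from pid %ld\n", sig, (long)sip->si_pid);
}

int
main(void)
{
        struct sigaction sa;

        (void) memset(&sa, 0, sizeof (sa));
        sa.sa_sigaction = handler;
        sa.sa_flags = SA_SIGINFO | SA_RESTART;
        (void) sigemptyset(&sa.sa_mask);
        if (sigaction(SIGUSR1, &sa, NULL) != 0) {
                perror("sigaction");
                return (1);
        }
        (void) pause();
        return (0);
}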
/* * Called by proc_exit() when a zone's init exits, presumably because * it failed. As long as the given zone is still in the "running" * state, we will re-exec() init, but first we need to reset things * which are usually inherited across exec() but will break init's * assumption that it is being exec()'d from a virgin process. Most * importantly this includes closing all file descriptors (exec only * closes those marked close-on-exec) and resetting signals (exec only * resets handled signals, and we need to clear any signals which * killed init). Anything else that exec(2) says would be inherited, * but would affect the execution of init, needs to be reset. */ static int restart_init(int what, int why) { kthread_t *t = curthread; klwp_t *lwp = ttolwp(t); proc_t *p = ttoproc(t); user_t *up = PTOU(p); vnode_t *oldcd, *oldrd; int i, err; char reason_buf[64]; /* * Let zone admin (and global zone admin if this is for a non-global * zone) know that init has failed and will be restarted. */ zcmn_err(p->p_zone->zone_id, CE_WARN, "init(1M) %s: restarting automatically", exit_reason(reason_buf, sizeof (reason_buf), what, why)); if (!INGLOBALZONE(p)) { cmn_err(CE_WARN, "init(1M) for zone %s (pid %d) %s: " "restarting automatically", p->p_zone->zone_name, p->p_pid, reason_buf); } /* * Remove any fpollinfo_t's for this (last) thread from our file * descriptors so closeall() can ASSERT() that they're all gone. * Then close all open file descriptors in the process. */ pollcleanup(); closeall(P_FINFO(p)); /* * Grab p_lock and begin clearing miscellaneous global process * state that needs to be reset before we exec the new init(1M). */ mutex_enter(&p->p_lock); prbarrier(p); p->p_flag &= ~(SKILLED | SEXTKILLED | SEXITING | SDOCORE); up->u_cmask = CMASK; sigemptyset(&t->t_hold); sigemptyset(&t->t_sig); sigemptyset(&t->t_extsig); sigemptyset(&p->p_sig); sigemptyset(&p->p_extsig); sigdelq(p, t, 0); sigdelq(p, NULL, 0); if (p->p_killsqp) { siginfofree(p->p_killsqp); p->p_killsqp = NULL; } /* * Reset any signals that are ignored back to the default disposition. * Other u_signal members will be cleared when exec calls sigdefault(). */ for (i = 1; i < NSIG; i++) { if (up->u_signal[i - 1] == SIG_IGN) { up->u_signal[i - 1] = SIG_DFL; sigemptyset(&up->u_sigmask[i - 1]); } } /* * Clear the current signal, any signal info associated with it, and * any signal information from contracts and/or contract templates. */ lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; if (lwp->lwp_curinfo != NULL) { siginfofree(lwp->lwp_curinfo); lwp->lwp_curinfo = NULL; } lwp_ctmpl_clear(lwp); /* * Reset both the process root directory and the current working * directory to the root of the zone just as we do during boot. */ VN_HOLD(p->p_zone->zone_rootvp); oldrd = up->u_rdir; up->u_rdir = p->p_zone->zone_rootvp; VN_HOLD(p->p_zone->zone_rootvp); oldcd = up->u_cdir; up->u_cdir = p->p_zone->zone_rootvp; if (up->u_cwd != NULL) { refstr_rele(up->u_cwd); up->u_cwd = NULL; } mutex_exit(&p->p_lock); if (oldrd != NULL) VN_RELE(oldrd); if (oldcd != NULL) VN_RELE(oldcd); /* Free the controlling tty. (freectty() always assumes curproc.) */ ASSERT(p == curproc); (void) freectty(B_TRUE); /* * Now exec() the new init(1M) on top of the current process. If we * succeed, the caller will treat this like a successful system call. * If we fail, we issue messages and the caller will proceed with exit. 
*/ err = exec_init(p->p_zone->zone_initname, NULL); if (err == 0) return (0); zcmn_err(p->p_zone->zone_id, CE_WARN, "failed to restart init(1M) (err=%d): system reboot required", err); if (!INGLOBALZONE(p)) { cmn_err(CE_WARN, "failed to restart init(1M) for zone %s " "(pid %d, err=%d): zoneadm(1M) boot required", p->p_zone->zone_name, p->p_pid, err); } return (-1); }
int sendsig(int sig, k_siginfo_t *sip, void (*hdlr)()) { volatile int minstacksz; int newstack; label_t ljb; volatile caddr_t sp; caddr_t fp; struct regs *rp; volatile greg_t upc; volatile proc_t *p = ttoproc(curthread); klwp_t *lwp = ttolwp(curthread); ucontext_t *volatile tuc = NULL; ucontext_t *uc; siginfo_t *sip_addr; volatile int watched; rp = lwptoregs(lwp); upc = rp->r_pc; minstacksz = SA(sizeof (struct sigframe)) + SA(sizeof (*uc)); if (sip != NULL) minstacksz += SA(sizeof (siginfo_t)); ASSERT((minstacksz & (STACK_ALIGN - 1ul)) == 0); /* * Figure out whether we will be handling this signal on * an alternate stack specified by the user. Then allocate * and validate the stack requirements for the signal handler * context. on_fault will catch any faults. */ newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) && !(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE)); if (newstack) { fp = (caddr_t)(SA((uintptr_t)lwp->lwp_sigaltstack.ss_sp) + SA(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN); } else if ((rp->r_ss & 0xffff) != UDS_SEL) { user_desc_t *ldt; /* * If the stack segment selector is -not- pointing at * the UDS_SEL descriptor and we have an LDT entry for * it instead, add the base address to find the effective va. */ if ((ldt = p->p_ldt) != NULL) fp = (caddr_t)rp->r_sp + USEGD_GETBASE(&ldt[SELTOIDX(rp->r_ss)]); else fp = (caddr_t)rp->r_sp; } else fp = (caddr_t)rp->r_sp; /* * Force proper stack pointer alignment, even in the face of a * misaligned stack pointer from user-level before the signal. * Don't use the SA() macro because that rounds up, not down. */ fp = (caddr_t)((uintptr_t)fp & ~(STACK_ALIGN - 1ul)); sp = fp - minstacksz; /* * Make sure lwp hasn't trashed its stack. */ if (sp >= (caddr_t)USERLIMIT || fp >= (caddr_t)USERLIMIT) { #ifdef DEBUG printf("sendsig: bad signal stack cmd=%s, pid=%d, sig=%d\n", PTOU(p)->u_comm, p->p_pid, sig); printf("sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n", (void *)sp, (void *)hdlr, (uintptr_t)upc); printf("sp above USERLIMIT\n"); #endif return (0); } watched = watch_disable_addr((caddr_t)sp, minstacksz, S_WRITE); if (on_fault(&ljb)) goto badstack; if (sip != NULL) { zoneid_t zoneid; fp -= SA(sizeof (siginfo_t)); uzero(fp, sizeof (siginfo_t)); if (SI_FROMUSER(sip) && (zoneid = p->p_zone->zone_id) != GLOBAL_ZONEID && zoneid != sip->si_zoneid) { k_siginfo_t sani_sip = *sip; sani_sip.si_pid = p->p_zone->zone_zsched->p_pid; sani_sip.si_uid = 0; sani_sip.si_ctid = -1; sani_sip.si_zoneid = zoneid; copyout_noerr(&sani_sip, fp, sizeof (sani_sip)); } else copyout_noerr(sip, fp, sizeof (*sip)); sip_addr = (siginfo_t *)fp; if (sig == SIGPROF && curthread->t_rprof != NULL && curthread->t_rprof->rp_anystate) { /* * We stand on our head to deal with * the real time profiling signal. * Fill in the stuff that doesn't fit * in a normal k_siginfo structure. 
*/ int i = sip->si_nsysarg; while (--i >= 0) suword32_noerr(&(sip_addr->si_sysarg[i]), (uint32_t)lwp->lwp_arg[i]); copyout_noerr(curthread->t_rprof->rp_state, sip_addr->si_mstate, sizeof (curthread->t_rprof->rp_state)); } } else sip_addr = NULL; /* save the current context on the user stack */ fp -= SA(sizeof (*tuc)); uc = (ucontext_t *)fp; tuc = kmem_alloc(sizeof (*tuc), KM_SLEEP); savecontext(tuc, &lwp->lwp_sigoldmask); copyout_noerr(tuc, uc, sizeof (*tuc)); kmem_free(tuc, sizeof (*tuc)); tuc = NULL; lwp->lwp_oldcontext = (uintptr_t)uc; if (newstack) { lwp->lwp_sigaltstack.ss_flags |= SS_ONSTACK; if (lwp->lwp_ustack) copyout_noerr(&lwp->lwp_sigaltstack, (stack_t *)lwp->lwp_ustack, sizeof (stack_t)); } /* * Set up signal handler arguments */ { struct sigframe frame; frame.sip = sip_addr; frame.ucp = uc; frame.signo = sig; frame.retaddr = (void (*)())0xffffffff; /* never return! */ copyout_noerr(&frame, sp, sizeof (frame)); } no_fault(); if (watched) watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE); rp->r_sp = (greg_t)sp; rp->r_pc = (greg_t)hdlr; rp->r_ps = PSL_USER | (rp->r_ps & PS_IOPL); if ((rp->r_cs & 0xffff) != UCS_SEL || (rp->r_ss & 0xffff) != UDS_SEL) { rp->r_cs = UCS_SEL; rp->r_ss = UDS_SEL; } /* * Don't set lwp_eosys here. sendsig() is called via psig() after * lwp_eosys is handled, so setting it here would affect the next * system call. */ return (1); badstack: no_fault(); if (watched) watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE); if (tuc) kmem_free(tuc, sizeof (*tuc)); #ifdef DEBUG printf("sendsig: bad signal stack cmd=%s, pid=%d, sig=%d\n", PTOU(p)->u_comm, p->p_pid, sig); printf("on fault, sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n", (void *)sp, (void *)hdlr, (uintptr_t)upc); #endif return (0); }
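/*
 * A rough sketch, derived from the copyout sequence above (exact offsets
 * depend on SA() rounding and the struct sigframe definition), of the user
 * stack sendsig() hands to the handler:
 *
 *	higher addresses
 *	+----------------------------+
 *	| siginfo_t copy (if sip)    |  <- sip_addr, passed to the handler
 *	+----------------------------+
 *	| ucontext_t copy            |  <- uc, recorded in lwp_oldcontext
 *	+----------------------------+
 *	| struct sigframe            |  <- sp at handler entry: signo, sip,
 *	| (signo, sip, ucp, retaddr) |     ucp and a 0xffffffff return
 *	+----------------------------+     address the handler must not use
 *	lower addresses
 */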
/* * Return value: * 1 - exitlwps() failed, call (or continue) lwp_exit() * 0 - restarting init. Return through system call path */ int proc_exit(int why, int what) { kthread_t *t = curthread; klwp_t *lwp = ttolwp(t); proc_t *p = ttoproc(t); zone_t *z = p->p_zone; timeout_id_t tmp_id; int rv; proc_t *q; task_t *tk; vnode_t *exec_vp, *execdir_vp, *cdir, *rdir; sigqueue_t *sqp; lwpdir_t *lwpdir; uint_t lwpdir_sz; tidhash_t *tidhash; uint_t tidhash_sz; ret_tidhash_t *ret_tidhash; refstr_t *cwd; hrtime_t hrutime, hrstime; int evaporate; /* * Stop and discard the process's lwps except for the current one, * unless some other lwp beat us to it. If exitlwps() fails then * return and the calling lwp will call (or continue in) lwp_exit(). */ proc_is_exiting(p); if (exitlwps(0) != 0) return (1); mutex_enter(&p->p_lock); if (p->p_ttime > 0) { /* * Account any remaining ticks charged to this process * on its way out. */ (void) task_cpu_time_incr(p->p_task, p->p_ttime); p->p_ttime = 0; } mutex_exit(&p->p_lock); DTRACE_PROC(lwp__exit); DTRACE_PROC1(exit, int, why); /* * Will perform any brand specific proc exit processing, since this * is always the last lwp, will also perform lwp_exit and free brand * data */ if (PROC_IS_BRANDED(p)) { lwp_detach_brand_hdlrs(lwp); brand_clearbrand(p, B_FALSE); } /* * Don't let init exit unless zone_start_init() failed its exec, or * we are shutting down the zone or the machine. * * Since we are single threaded, we don't need to lock the * following accesses to zone_proc_initpid. */ if (p->p_pid == z->zone_proc_initpid) { if (z->zone_boot_err == 0 && zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN && z->zone_restart_init == B_TRUE && restart_init(what, why) == 0) return (0); /* * Since we didn't or couldn't restart init, we clear * the zone's init state and proceed with exit * processing. */ z->zone_proc_initpid = -1; } lwp_pcb_exit(); /* * Allocate a sigqueue now, before we grab locks. * It will be given to sigcld(), below. * Special case: If we will be making the process disappear * without a trace because it is either: * * an exiting SSYS process, or * * a posix_spawn() vfork child who requests it, * we don't bother to allocate a useless sigqueue. */ evaporate = (p->p_flag & SSYS) || ((p->p_flag & SVFORK) && why == CLD_EXITED && what == _EVAPORATE); if (!evaporate) sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); /* * revoke any doors created by the process. */ if (p->p_door_list) door_exit(); /* * Release schedctl data structures. */ if (p->p_pagep) schedctl_proc_cleanup(); /* * make sure all pending kaio has completed. */ if (p->p_aio) aio_cleanup_exit(); /* * discard the lwpchan cache. */ if (p->p_lcp != NULL) lwpchan_destroy_cache(0); /* * Clean up any DTrace helper actions or probes for the process. */ if (p->p_dtrace_helpers != NULL) { ASSERT(dtrace_helpers_cleanup != NULL); (*dtrace_helpers_cleanup)(); } /* untimeout the realtime timers */ if (p->p_itimer != NULL) timer_exit(); if ((tmp_id = p->p_alarmid) != 0) { p->p_alarmid = 0; (void) untimeout(tmp_id); } /* * Remove any fpollinfo_t's for this (last) thread from our file * descriptors so closeall() can ASSERT() that they're all gone. */ pollcleanup(); if (p->p_rprof_cyclic != CYCLIC_NONE) { mutex_enter(&cpu_lock); cyclic_remove(p->p_rprof_cyclic); mutex_exit(&cpu_lock); } mutex_enter(&p->p_lock); /* * Clean up any DTrace probes associated with this process. 
*/ if (p->p_dtrace_probes) { ASSERT(dtrace_fasttrap_exit_ptr != NULL); dtrace_fasttrap_exit_ptr(p); } while ((tmp_id = p->p_itimerid) != 0) { p->p_itimerid = 0; mutex_exit(&p->p_lock); (void) untimeout(tmp_id); mutex_enter(&p->p_lock); } lwp_cleanup(); /* * We are about to exit; prevent our resource associations from * being changed. */ pool_barrier_enter(); /* * Block the process against /proc now that we have really * acquired p->p_lock (to manipulate p_tlist at least). */ prbarrier(p); sigfillset(&p->p_ignore); sigemptyset(&p->p_siginfo); sigemptyset(&p->p_sig); sigemptyset(&p->p_extsig); sigemptyset(&t->t_sig); sigemptyset(&t->t_extsig); sigemptyset(&p->p_sigmask); sigdelq(p, t, 0); lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; p->p_flag &= ~(SKILLED | SEXTKILLED); if (lwp->lwp_curinfo) { siginfofree(lwp->lwp_curinfo); lwp->lwp_curinfo = NULL; } t->t_proc_flag |= TP_LWPEXIT; ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0); prlwpexit(t); /* notify /proc */ lwp_hash_out(p, t->t_tid); prexit(p); p->p_lwpcnt = 0; p->p_tlist = NULL; sigqfree(p); term_mstate(t); p->p_mterm = gethrtime(); exec_vp = p->p_exec; execdir_vp = p->p_execdir; p->p_exec = NULLVP; p->p_execdir = NULLVP; mutex_exit(&p->p_lock); pr_free_watched_pages(p); closeall(P_FINFO(p)); /* Free the controlling tty. (freectty() always assumes curproc.) */ ASSERT(p == curproc); (void) freectty(B_TRUE); #if defined(__sparc) if (p->p_utraps != NULL) utrap_free(p); #endif if (p->p_semacct) /* IPC semaphore exit */ semexit(p); rv = wstat(why, what); acct(rv & 0xff); exacct_commit_proc(p, rv); /* * Release any resources associated with C2 auditing */ if (AU_AUDITING()) { /* * audit exit system call */ audit_exit(why, what); } /* * Free address space. */ relvm(); if (exec_vp) { /* * Close this executable which has been opened when the process * was created by getproc(). */ (void) VOP_CLOSE(exec_vp, FREAD, 1, (offset_t)0, CRED(), NULL); VN_RELE(exec_vp); } if (execdir_vp) VN_RELE(execdir_vp); /* * Release held contracts. */ contract_exit(p); /* * Depart our encapsulating process contract. */ if ((p->p_flag & SSYS) == 0) { ASSERT(p->p_ct_process); contract_process_exit(p->p_ct_process, p, rv); } /* * Remove pool association, and block if requested by pool_do_bind. */ mutex_enter(&p->p_lock); ASSERT(p->p_pool->pool_ref > 0); atomic_add_32(&p->p_pool->pool_ref, -1); p->p_pool = pool_default; /* * Now that our address space has been freed and all other threads * in this process have exited, set the PEXITED pool flag. This * tells the pools subsystems to ignore this process if it was * requested to rebind this process to a new pool. */ p->p_poolflag |= PEXITED; pool_barrier_exit(); mutex_exit(&p->p_lock); mutex_enter(&pidlock); /* * Delete this process from the newstate list of its parent. We * will put it in the right place in the sigcld in the end. */ delete_ns(p->p_parent, p); /* * Reassign the orphans to the next of kin. * Don't rearrange init's orphanage. */ if ((q = p->p_orphan) != NULL && p != proc_init) { proc_t *nokp = p->p_nextofkin; for (;;) { q->p_nextofkin = nokp; if (q->p_nextorph == NULL) break; q = q->p_nextorph; } q->p_nextorph = nokp->p_orphan; nokp->p_orphan = p->p_orphan; p->p_orphan = NULL; } /* * Reassign the children to init. * Don't try to assign init's children to init. 
*/ if ((q = p->p_child) != NULL && p != proc_init) { struct proc *np; struct proc *initp = proc_init; boolean_t setzonetop = B_FALSE; if (!INGLOBALZONE(curproc)) setzonetop = B_TRUE; pgdetach(p); do { np = q->p_sibling; /* * Delete it from its current parent new state * list and add it to init new state list */ delete_ns(q->p_parent, q); q->p_ppid = 1; q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID); if (setzonetop) { mutex_enter(&q->p_lock); q->p_flag |= SZONETOP; mutex_exit(&q->p_lock); } q->p_parent = initp; /* * Since q will be the first child, * it will not have a previous sibling. */ q->p_psibling = NULL; if (initp->p_child) { initp->p_child->p_psibling = q; } q->p_sibling = initp->p_child; initp->p_child = q; if (q->p_proc_flag & P_PR_PTRACE) { mutex_enter(&q->p_lock); sigtoproc(q, NULL, SIGKILL); mutex_exit(&q->p_lock); } /* * sigcld() will add the child to parents * newstate list. */ if (q->p_stat == SZOMB) sigcld(q, NULL); } while ((q = np) != NULL); p->p_child = NULL; ASSERT(p->p_child_ns == NULL); } TRACE_1(TR_FAC_PROC, TR_PROC_EXIT, "proc_exit: %p", p); mutex_enter(&p->p_lock); CL_EXIT(curthread); /* tell the scheduler that curthread is exiting */ /* * Have our task accumulate our resource usage data before they * become contaminated by p_cacct etc., and before we renounce * membership of the task. * * We do this regardless of whether or not task accounting is active. * This is to avoid having nonsense data reported for this task if * task accounting is subsequently enabled. The overhead is minimal; * by this point, this process has accounted for the usage of all its * LWPs. We nonetheless do the work here, and under the protection of * pidlock, so that the movement of the process's usage to the task * happens at the same time as the removal of the process from the * task, from the point of view of exacct_snapshot_task_usage(). */ exacct_update_task_mstate(p); hrutime = mstate_aggr_state(p, LMS_USER); hrstime = mstate_aggr_state(p, LMS_SYSTEM); p->p_utime = (clock_t)NSEC_TO_TICK(hrutime) + p->p_cutime; p->p_stime = (clock_t)NSEC_TO_TICK(hrstime) + p->p_cstime; p->p_acct[LMS_USER] += p->p_cacct[LMS_USER]; p->p_acct[LMS_SYSTEM] += p->p_cacct[LMS_SYSTEM]; p->p_acct[LMS_TRAP] += p->p_cacct[LMS_TRAP]; p->p_acct[LMS_TFAULT] += p->p_cacct[LMS_TFAULT]; p->p_acct[LMS_DFAULT] += p->p_cacct[LMS_DFAULT]; p->p_acct[LMS_KFAULT] += p->p_cacct[LMS_KFAULT]; p->p_acct[LMS_USER_LOCK] += p->p_cacct[LMS_USER_LOCK]; p->p_acct[LMS_SLEEP] += p->p_cacct[LMS_SLEEP]; p->p_acct[LMS_WAIT_CPU] += p->p_cacct[LMS_WAIT_CPU]; p->p_acct[LMS_STOPPED] += p->p_cacct[LMS_STOPPED]; p->p_ru.minflt += p->p_cru.minflt; p->p_ru.majflt += p->p_cru.majflt; p->p_ru.nswap += p->p_cru.nswap; p->p_ru.inblock += p->p_cru.inblock; p->p_ru.oublock += p->p_cru.oublock; p->p_ru.msgsnd += p->p_cru.msgsnd; p->p_ru.msgrcv += p->p_cru.msgrcv; p->p_ru.nsignals += p->p_cru.nsignals; p->p_ru.nvcsw += p->p_cru.nvcsw; p->p_ru.nivcsw += p->p_cru.nivcsw; p->p_ru.sysc += p->p_cru.sysc; p->p_ru.ioch += p->p_cru.ioch; p->p_stat = SZOMB; p->p_proc_flag &= ~P_PR_PTRACE; p->p_wdata = what; p->p_wcode = (char)why; cdir = PTOU(p)->u_cdir; rdir = PTOU(p)->u_rdir; cwd = PTOU(p)->u_cwd; ASSERT(cdir != NULL || p->p_parent == &p0); /* * Release resource controls, as they are no longer enforceable. */ rctl_set_free(p->p_rctls); /* * Decrement tk_nlwps counter for our task.max-lwps resource control. * An extended accounting record, if that facility is active, is * scheduled to be written.
We cannot give up task and project * membership at this point because that would allow zombies to escape * from the max-processes resource controls. Zombies stay in their * current task and project until the process table slot is released * in freeproc(). */ tk = p->p_task; mutex_enter(&p->p_zone->zone_nlwps_lock); tk->tk_nlwps--; tk->tk_proj->kpj_nlwps--; p->p_zone->zone_nlwps--; mutex_exit(&p->p_zone->zone_nlwps_lock); /* * Clear the lwp directory and the lwpid hash table * now that /proc can't bother us any more. * We free the memory below, after dropping p->p_lock. */ lwpdir = p->p_lwpdir; lwpdir_sz = p->p_lwpdir_sz; tidhash = p->p_tidhash; tidhash_sz = p->p_tidhash_sz; ret_tidhash = p->p_ret_tidhash; p->p_lwpdir = NULL; p->p_lwpfree = NULL; p->p_lwpdir_sz = 0; p->p_tidhash = NULL; p->p_tidhash_sz = 0; p->p_ret_tidhash = NULL; /* * If the process has context ops installed, call the exit routine * on behalf of this last remaining thread. Normally exitpctx() is * called during thread_exit() or lwp_exit(), but because this is the * last thread in the process, we must call it here. By the time * thread_exit() is called (below), the association with the relevant * process has been lost. * * We also free the context here. */ if (p->p_pctx) { kpreempt_disable(); exitpctx(p); kpreempt_enable(); freepctx(p, 0); } /* * curthread's proc pointer is changed to point to the 'sched' * process for the corresponding zone, except in the case when * the exiting process is in fact a zsched instance, in which * case the proc pointer is set to p0. We do so, so that the * process still points at the right zone when we call the VN_RELE() * below. * * This is because curthread's original proc pointer can be freed as * soon as the child sends a SIGCLD to its parent. We use zsched so * that for user processes, even in the final moments of death, the * process is still associated with its zone. */ if (p != t->t_procp->p_zone->zone_zsched) t->t_procp = t->t_procp->p_zone->zone_zsched; else t->t_procp = &p0; mutex_exit(&p->p_lock); if (!evaporate) { p->p_pidflag &= ~CLDPEND; sigcld(p, sqp); } else { /* * Do what sigcld() would do if the disposition * of the SIGCHLD signal were set to be ignored. */ cv_broadcast(&p->p_srwchan_cv); freeproc(p); } mutex_exit(&pidlock); /* * We don't release u_cdir and u_rdir until SZOMB is set. * This protects us against dofusers(). */ if (cdir) VN_RELE(cdir); if (rdir) VN_RELE(rdir); if (cwd) refstr_rele(cwd); /* * task_rele() may ultimately cause the zone to go away (or * may cause the last user process in a zone to go away, which * signals zsched to go away). So prior to this call, we must * no longer point at zsched. */ t->t_procp = &p0; kmem_free(lwpdir, lwpdir_sz * sizeof (lwpdir_t)); kmem_free(tidhash, tidhash_sz * sizeof (tidhash_t)); while (ret_tidhash != NULL) { ret_tidhash_t *next = ret_tidhash->rth_next; kmem_free(ret_tidhash->rth_tidhash, ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t)); kmem_free(ret_tidhash, sizeof (*ret_tidhash)); ret_tidhash = next; } thread_exit(); /* NOTREACHED */ }
/* * Allocate a new lxproc node * * This also allocates the vnode associated with it */ lxpr_node_t * lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd) { lxpr_node_t *lxpnp; vnode_t *vp; user_t *up; timestruc_t now; /* * Allocate a new node. It is deallocated in vop_inactive */ lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP); /* * Set defaults (may be overridden below) */ gethrestime(&now); lxpnp->lxpr_type = type; lxpnp->lxpr_realvp = NULL; lxpnp->lxpr_parent = dp; VN_HOLD(dp); if (p != NULL) { lxpnp->lxpr_pid = ((p->p_pid == curproc->p_zone->zone_proc_initpid) ? 1 : p->p_pid); lxpnp->lxpr_time = PTOU(p)->u_start; lxpnp->lxpr_uid = crgetruid(p->p_cred); lxpnp->lxpr_gid = crgetrgid(p->p_cred); lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd); } else { /* Pretend files without a proc belong to sched */ lxpnp->lxpr_pid = 0; lxpnp->lxpr_time = now; lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0; lxpnp->lxpr_ino = lxpr_inode(type, 0, 0); } /* initialize the vnode data */ vp = lxpnp->lxpr_vnode; vn_reinit(vp); vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; vp->v_vfsp = dp->v_vfsp; /* * Do node specific stuff */ switch (type) { case LXPR_PROCDIR: vp->v_flag |= VROOT; vp->v_type = VDIR; lxpnp->lxpr_mode = 0555; /* read-search by everyone */ break; case LXPR_PID_CURDIR: ASSERT(p != NULL); /* * Zombie check. p_stat is officially protected by pidlock, * but we can't grab pidlock here because we already hold * p_lock. Luckily if we look at the process exit code * we see that p_stat only transitions from SRUN to SZOMB * while p_lock is held. Aside from this, the only other * p_stat transition that we need to be aware of is * SIDL to SRUN, but that's not a problem since lxpr_lock() * ignores nodes in the SIDL state so we'll never get a node * that isn't already in the SRUN state. */ if (p->p_stat == SZOMB) { lxpnp->lxpr_realvp = NULL; } else { up = PTOU(p); lxpnp->lxpr_realvp = up->u_cdir; ASSERT(lxpnp->lxpr_realvp != NULL); VN_HOLD(lxpnp->lxpr_realvp); } vp->v_type = VLNK; lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ break; case LXPR_PID_ROOTDIR: ASSERT(p != NULL); /* Zombie check. See locking comment above */ if (p->p_stat == SZOMB) { lxpnp->lxpr_realvp = NULL; } else { up = PTOU(p); lxpnp->lxpr_realvp = up->u_rdir != NULL ? up->u_rdir : rootdir; ASSERT(lxpnp->lxpr_realvp != NULL); VN_HOLD(lxpnp->lxpr_realvp); } vp->v_type = VLNK; lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ break; case LXPR_PID_EXE: ASSERT(p != NULL); lxpnp->lxpr_realvp = p->p_exec; if (lxpnp->lxpr_realvp != NULL) { VN_HOLD(lxpnp->lxpr_realvp); } vp->v_type = VLNK; lxpnp->lxpr_mode = 0777; break; case LXPR_SELF: vp->v_type = VLNK; lxpnp->lxpr_mode = 0777; /* anyone does anything !
*/ break; case LXPR_PID_FD_FD: ASSERT(p != NULL); /* lxpr_realvp is set after we return */ vp->v_type = VLNK; lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */ break; case LXPR_PID_FDDIR: ASSERT(p != NULL); vp->v_type = VDIR; lxpnp->lxpr_mode = 0500; /* read-search by owner only */ break; case LXPR_PIDDIR: ASSERT(p != NULL); vp->v_type = VDIR; lxpnp->lxpr_mode = 0511; break; case LXPR_SYSDIR: case LXPR_SYS_FSDIR: case LXPR_SYS_FS_INOTIFYDIR: case LXPR_SYS_KERNELDIR: case LXPR_NETDIR: vp->v_type = VDIR; lxpnp->lxpr_mode = 0555; /* read-search by all */ break; case LXPR_PID_ENV: case LXPR_PID_MEM: ASSERT(p != NULL); /*FALLTHRU*/ case LXPR_KCORE: vp->v_type = VREG; lxpnp->lxpr_mode = 0400; /* read-only by owner only */ break; default: vp->v_type = VREG; lxpnp->lxpr_mode = 0444; /* read-only by all */ break; } return (lxpnp); }
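/*
 * A minimal caller sketch (hypothetical helper; assumes the parent vnode is
 * the per-process directory and that the fd argument is ignored for non-fd
 * node types): building the node that backs the "cwd" symlink.  The process
 * is assumed to be held and locked by the caller, per the lxpr_lock()
 * comment above.
 */
static lxpr_node_t *
lxpr_make_cwd_node(vnode_t *piddirvp, proc_t *p)
{
	return (lxpr_getnode(piddirvp, LXPR_PID_CURDIR, p, 0));
}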
void exacct_calculate_proc_usage(proc_t *p, proc_usage_t *pu, ulong_t *mask, int flag, int wstat) { timestruc_t ts, ts_run; ASSERT(MUTEX_HELD(&p->p_lock)); /* * Convert CPU and execution times to sec/nsec format. */ if (BT_TEST(mask, AC_PROC_CPU)) { hrt2ts(mstate_aggr_state(p, LMS_USER), &ts); pu->pu_utimesec = (uint64_t)(ulong_t)ts.tv_sec; pu->pu_utimensec = (uint64_t)(ulong_t)ts.tv_nsec; hrt2ts(mstate_aggr_state(p, LMS_SYSTEM), &ts); pu->pu_stimesec = (uint64_t)(ulong_t)ts.tv_sec; pu->pu_stimensec = (uint64_t)(ulong_t)ts.tv_nsec; } if (BT_TEST(mask, AC_PROC_TIME)) { gethrestime(&ts); pu->pu_finishsec = (uint64_t)(ulong_t)ts.tv_sec; pu->pu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec; hrt2ts(gethrtime() - p->p_mstart, &ts_run); ts.tv_sec -= ts_run.tv_sec; ts.tv_nsec -= ts_run.tv_nsec; if (ts.tv_nsec < 0) { ts.tv_sec--; if ((ts.tv_nsec = ts.tv_nsec + NANOSEC) >= NANOSEC) { ts.tv_sec++; ts.tv_nsec -= NANOSEC; } } pu->pu_startsec = (uint64_t)(ulong_t)ts.tv_sec; pu->pu_startnsec = (uint64_t)(ulong_t)ts.tv_nsec; } pu->pu_pid = p->p_pidp->pid_id; pu->pu_acflag = p->p_user.u_acflag; pu->pu_projid = p->p_task->tk_proj->kpj_id; pu->pu_taskid = p->p_task->tk_tkid; pu->pu_major = getmajor(p->p_sessp->s_dev); pu->pu_minor = getminor(p->p_sessp->s_dev); pu->pu_ancpid = p->p_ancpid; pu->pu_wstat = wstat; /* * Compute average RSS in K. The denominator is the number of * samples: the number of clock ticks plus the initial value. */ pu->pu_mem_rss_avg = (PTOU(p)->u_mem / (p->p_stime + p->p_utime + 1)) * (PAGESIZE / 1024); pu->pu_mem_rss_max = PTOU(p)->u_mem_max * (PAGESIZE / 1024); mutex_enter(&p->p_crlock); pu->pu_ruid = crgetruid(p->p_cred); pu->pu_rgid = crgetrgid(p->p_cred); mutex_exit(&p->p_crlock); bcopy(p->p_user.u_comm, pu->pu_command, strlen(p->p_user.u_comm) + 1); bcopy(p->p_zone->zone_name, pu->pu_zonename, strlen(p->p_zone->zone_name) + 1); bcopy(p->p_zone->zone_nodename, pu->pu_nodename, strlen(p->p_zone->zone_nodename) + 1); /* * Calculate microstate accounting data for a process that is still * running. Presently, we explicitly collect all of the LWP usage into * the proc usage structure here. */ if (flag & EW_PARTIAL) exacct_calculate_proc_mstate(p, pu); if (flag & EW_FINAL) exacct_copy_proc_mstate(p, pu); }
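/*
 * A standalone sketch (illustrative numbers, not kernel code) of the
 * average-RSS arithmetic above: u_mem accumulates one RSS sample, in pages,
 * per clock tick, so dividing by the tick count plus one and scaling by the
 * page size in Kbytes yields the average RSS in K.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long u_mem = 50000;	/* accumulated RSS samples, in pages */
	unsigned long ticks = 99;	/* p_stime + p_utime */
	unsigned long pagesize = 4096;	/* assumed PAGESIZE */
	unsigned long rss_avg_k;

	rss_avg_k = (u_mem / (ticks + 1)) * (pagesize / 1024);
	(void) printf("average RSS: %luK\n", rss_avg_k);	/* 500 * 4 = 2000K */
	return (0);
}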
/* * Post-syscall processing. Perform abnormal system call completion * actions such as /proc tracing, profiling, signals, preemption, etc. * * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set. * Any condition requiring post-syscall handling must set one of these. * If the condition is persistent, this routine will repost t_post_sys. */ void post_syscall(long rval1, long rval2) { kthread_t *t = curthread; klwp_t *lwp = ttolwp(t); proc_t *p = ttoproc(t); struct regs *rp = lwptoregs(lwp); uint_t error; uint_t code = t->t_sysnum; int repost = 0; int proc_stop = 0; /* non-zero if stopping */ int sigprof = 0; /* non-zero if sending SIGPROF */ t->t_post_sys = 0; error = lwp->lwp_errno; /* * Code can be zero if this is a new LWP returning after a forkall(), * other than the one which matches the one in the parent which called * forkall(). In these LWPs, skip most of post-syscall activity. */ if (code == 0) goto sig_check; /* * If the trace flag is set, mark the lwp to take a single-step trap * on return to user level (below). The x86 lcall interface and * sysenter have already done this, and turned off the flag, but * the amd64 syscall interface has not. */ if (rp->r_ps & PS_T) { lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING; rp->r_ps &= ~PS_T; aston(curthread); } #ifdef C2_AUDIT if (audit_active) { /* put out audit record for this syscall */ rval_t rval; /* XX64 -- truncation of 64-bit return values? */ rval.r_val1 = (int)rval1; rval.r_val2 = (int)rval2; audit_finish(T_SYSCALL, code, error, &rval); repost = 1; } #endif /* C2_AUDIT */ if (curthread->t_pdmsg != NULL) { char *m = curthread->t_pdmsg; uprintf("%s", m); kmem_free(m, strlen(m) + 1); curthread->t_pdmsg = NULL; } /* * If we're going to stop for /proc tracing, set the flag and * save the arguments so that the return values don't smash them. */ if (PTOU(p)->u_systrap) { if (prismember(&PTOU(p)->u_exitmask, code)) { if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) (void) save_syscall_args(); proc_stop = 1; } repost = 1; } /* * Similarly check to see if SIGPROF might be sent. */ if (curthread->t_rprof != NULL && curthread->t_rprof->rp_anystate != 0) { if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) (void) save_syscall_args(); sigprof = 1; } if (lwp->lwp_eosys == NORMALRETURN) { if (error == 0) { #ifdef SYSCALLTRACE if (syscalltrace) { mutex_enter(&systrace_lock); printf( "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n", p->p_pid, rval1, rval2, curthread); mutex_exit(&systrace_lock); } #endif /* SYSCALLTRACE */ rp->r_ps &= ~PS_C; rp->r_r0 = rval1; rp->r_r1 = rval2; } else { int sig; #ifdef SYSCALLTRACE if (syscalltrace) { mutex_enter(&systrace_lock); printf("%d: error=%d, id 0x%p\n", p->p_pid, error, curthread); mutex_exit(&systrace_lock); } #endif /* SYSCALLTRACE */ if (error == EINTR && t->t_activefd.a_stale) error = EBADF; if (error == EINTR && (sig = lwp->lwp_cursig) != 0 && sigismember(&PTOU(p)->u_sigrestart, sig) && PTOU(p)->u_signal[sig - 1] != SIG_DFL && PTOU(p)->u_signal[sig - 1] != SIG_IGN) error = ERESTART; rp->r_r0 = error; rp->r_ps |= PS_C; } } /* * From the proc(4) manual page: * When exit from a system call is being traced, the traced process * stops on completion of the system call just prior to checking for * signals and returning to user level. At this point all return * values have been stored into the traced process's saved registers.
*/ if (proc_stop) { mutex_enter(&p->p_lock); if (PTOU(p)->u_systrap && prismember(&PTOU(p)->u_exitmask, code)) stop(PR_SYSEXIT, code); mutex_exit(&p->p_lock); } /* * If we are the parent returning from a successful * vfork, wait for the child to exec or exit. * This code must be here and not in the bowels of the system * so that /proc can intercept exit from vfork in a timely way. */ if (code == SYS_vfork && rp->r_r1 == 0 && error == 0) vfwait((pid_t)rval1); /* * If profiling is active, bill the current PC in user-land * and keep reposting until profiling is disabled. */ if (p->p_prof.pr_scale) { if (lwp->lwp_oweupc) profil_tick(rp->r_pc); repost = 1; } sig_check: /* * Reset flag for next time. * We must do this after stopping on PR_SYSEXIT * because /proc uses the information in lwp_eosys. */ lwp->lwp_eosys = NORMALRETURN; clear_stale_fd(); t->t_flag &= ~T_FORKALL; if (t->t_astflag | t->t_sig_check) { /* * Turn off the AST flag before checking all the conditions that * may have caused an AST. This flag is on whenever a signal or * unusual condition should be handled after the next trap or * syscall. */ astoff(t); /* * If a single-step trap occurred on a syscall (see trap()) * recognize it now. Do this before checking for signals * because deferred_singlestep_trap() may generate a SIGTRAP to * the LWP or may otherwise mark the LWP to call issig(FORREAL). */ if (lwp->lwp_pcb.pcb_flags & DEBUG_PENDING) deferred_singlestep_trap((caddr_t)rp->r_pc); t->t_sig_check = 0; /* * The following check is legal for the following reasons: * 1) The thread we are checking, is ourselves, so there is * no way the proc can go away. * 2) The only time we need to be protected by the * lock is if the binding is changed. * * Note we will still take the lock and check the binding * if the condition was true without the lock held. This * prevents lock contention among threads owned by the * same proc. */ if (curthread->t_proc_flag & TP_CHANGEBIND) { mutex_enter(&p->p_lock); if (curthread->t_proc_flag & TP_CHANGEBIND) { timer_lwpbind(); curthread->t_proc_flag &= ~TP_CHANGEBIND; } mutex_exit(&p->p_lock); } /* * for kaio requests on the special kaio poll queue, * copyout their results to user memory. */ if (p->p_aio) aio_cleanup(0); /* * If this LWP was asked to hold, call holdlwp(), which will * stop. holdlwps() sets this up and calls pokelwps() which * sets the AST flag. * * Also check TP_EXITLWP, since this is used by fresh new LWPs * through lwp_rtt(). That flag is set if the lwp_create(2) * syscall failed after creating the LWP. */ if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP)) holdlwp(); /* * All code that sets signals and makes ISSIG_PENDING * evaluate true must set t_sig_check afterwards. */ if (ISSIG_PENDING(t, lwp, p)) { if (issig(FORREAL)) psig(); t->t_sig_check = 1; /* recheck next time */ } if (sigprof) { realsigprof(code, error); t->t_sig_check = 1; /* recheck next time */ } /* * If a performance counter overflow interrupt was * delivered *during* the syscall, then re-enable the * AST so that we take a trip through trap() to cause * the SIGEMT to be delivered. */ if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW) aston(t); /* * /proc can't enable/disable the trace bit itself * because that could race with the call gate used by * system calls via "lcall". If that happened, an * invalid EFLAGS would result. prstep()/prnostep() * therefore schedule an AST for the purpose. 
*/ if (lwp->lwp_pcb.pcb_flags & REQUEST_STEP) { lwp->lwp_pcb.pcb_flags &= ~REQUEST_STEP; rp->r_ps |= PS_T; } if (lwp->lwp_pcb.pcb_flags & REQUEST_NOSTEP) { lwp->lwp_pcb.pcb_flags &= ~REQUEST_NOSTEP; rp->r_ps &= ~PS_T; } } lwp->lwp_errno = 0; /* clear error for next time */ #ifndef NPROBE /* Kernel probe */ if (tnf_tracing_active) { TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */, tnf_long, rval1, rval1, tnf_long, rval2, rval2, tnf_long, errno, (long)error); repost = 1; } #endif /* NPROBE */ /* * Set state to LWP_USER here so preempt won't give us a kernel * priority if it occurs after this point. Call CL_TRAPRET() to * restore the user-level priority. * * It is important that no locks (other than spinlocks) be entered * after this point before returning to user mode (unless lwp_state * is set back to LWP_SYS). * * XXX Sampled times past this point are charged to the user. */ lwp->lwp_state = LWP_USER; if (t->t_trapret) { t->t_trapret = 0; thread_lock(t); CL_TRAPRET(t); thread_unlock(t); } if (CPU->cpu_runrun) preempt(); lwp->lwp_errno = 0; /* clear error for next time */ /* * The thread lock must be held in order to clear sysnum and reset * lwp_ap atomically with respect to other threads in the system that * may be looking at the args via lwp_ap from get_syscall_args(). */ thread_lock(t); t->t_sysnum = 0; /* no longer in a system call */ if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) { #if defined(_LP64) /* * In case the args were copied to the lwp, reset the * pointer so the next syscall will have the right * lwp_ap pointer. */ lwp->lwp_ap = (long *)&rp->r_rdi; } else { #endif lwp->lwp_ap = NULL; /* reset on every syscall entry */ } thread_unlock(t); lwp->lwp_argsaved = 0; /* * If there was a continuing reason for post-syscall processing, * set the t_post_sys flag for the next system call. */ if (repost) t->t_post_sys = 1; /* * If there is a ustack registered for this lwp, and the stack rlimit * has been altered, read in the ustack. If the saved stack rlimit * matches the bounds of the ustack, update the ustack to reflect * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable * stack checking by setting the size to 0. */ if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) { rlim64_t new_size; caddr_t top; stack_t stk; struct rlimit64 rl; mutex_enter(&p->p_lock); new_size = p->p_stk_ctl; top = p->p_usrstack; (void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl); mutex_exit(&p->p_lock); if (rl.rlim_cur == RLIM64_INFINITY) new_size = 0; if (copyin((stack_t *)lwp->lwp_ustack, &stk, sizeof (stack_t)) == 0 && (stk.ss_size == lwp->lwp_old_stk_ctl || stk.ss_size == 0) && stk.ss_sp == top - stk.ss_size) { stk.ss_sp = (void *)((uintptr_t)stk.ss_sp + stk.ss_size - (uintptr_t)new_size); stk.ss_size = new_size; (void) copyout(&stk, (stack_t *)lwp->lwp_ustack, sizeof (stack_t)); } lwp->lwp_old_stk_ctl = 0; } }
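/*
 * Illustrative user-level effect (a sketch, not part of this file) of the
 * EINTR-to-ERESTART conversion above: when SA_RESTART is set for the
 * interrupting signal, a read(2) cut short by that signal is transparently
 * restarted instead of failing with EINTR.  The helper names are arbitrary.
 */
#include <signal.h>
#include <string.h>
#include <unistd.h>

static void
demo_sigalrm(int sig)
{
	(void) sig;	/* the signal's arrival is the whole point */
}

static ssize_t
demo_restarted_read(int fd, void *buf, size_t len)
{
	struct sigaction sa;

	(void) memset(&sa, 0, sizeof (sa));
	(void) sigemptyset(&sa.sa_mask);
	sa.sa_handler = demo_sigalrm;
	sa.sa_flags = SA_RESTART;	/* request transparent restart */
	(void) sigaction(SIGALRM, &sa, NULL);

	(void) alarm(1);		/* fires while read(2) is blocked */
	return (read(fd, buf, len));	/* restarted rather than EINTR */
}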
/* * This routine assumes that the stack grows downward. * Returns 0 on success, errno on failure. */ int grow_internal(caddr_t sp, uint_t growszc) { struct proc *p = curproc; size_t newsize; size_t oldsize; int error; size_t pgsz; uint_t szc; struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); ASSERT(sp < p->p_usrstack); sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE); /* * grow to growszc alignment but use current p->p_stkpageszc for * the segvn_crargs szc passed to segvn_create. For memcntl to * increase the szc, this allows the new extension segment to be * concatenated successfully with the existing stack segment. */ if ((szc = growszc) != 0) { pgsz = page_get_pagesize(szc); ASSERT(pgsz > PAGESIZE); newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz); if (newsize > (size_t)p->p_stk_ctl) { szc = 0; pgsz = PAGESIZE; newsize = p->p_usrstack - sp; } } else { pgsz = PAGESIZE; newsize = p->p_usrstack - sp; } if (newsize > (size_t)p->p_stk_ctl) { (void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p, RCA_UNSAFE_ALL); return (ENOMEM); } oldsize = p->p_stksize; ASSERT(P2PHASE(oldsize, PAGESIZE) == 0); if (newsize <= oldsize) { /* prevent the stack from shrinking */ return (0); } if (!(p->p_stkprot & PROT_EXEC)) { crargs.prot &= ~PROT_EXEC; } /* * extend stack with the proposed new growszc, which is different * than p_stkpageszc only on a memcntl to increase the stack pagesize. * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes * if not aligned to szc's pgsz. */ if (szc > 0) { caddr_t oldsp = p->p_usrstack - oldsize; caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz); if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) { crargs.szc = p->p_stkpageszc ? p->p_stkpageszc : AS_MAP_NO_LPOOB; } else if (oldsp == austk) { crargs.szc = szc; } else { crargs.szc = AS_MAP_STACK; } } else { crargs.szc = AS_MAP_NO_LPOOB; } crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN; if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize, segvn_create, &crargs)) != 0) { if (error == EAGAIN) { cmn_err(CE_WARN, "Sorry, no swap space to grow stack " "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm); } return (error); } p->p_stksize = newsize; return (0); }
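/*
 * Worked example (illustrative addresses; assumes PAGESIZE = 0x1000, a 4M
 * large page so pgsz = 0x400000, and a 32-bit p_usrstack of 0x8048000) of
 * the size arithmetic above when growszc is non-zero:
 *
 *	sp (faulting address)           = 0x7f3c123
 *	P2ALIGN(sp, PAGESIZE)           = 0x7f3c000
 *	P2ALIGN(sp, pgsz)               = 0x7c00000
 *	newsize = 0x8048000 - 0x7c00000 = 0x448000
 *
 * so the stack is extended down to the large-page boundary below the fault,
 * provided that stays within p_stk_ctl; otherwise szc is dropped to 0 and
 * the plain PAGESIZE-aligned newsize is used instead.
 */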