static void ia32_osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct ia32_sigframe3 sf, *fp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct ia32_sigframe3 *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(sf)); td->td_sigstk.ss_flags |= SS_ONSTACK; } else fp = (struct ia32_sigframe3 *)regs->tf_rsp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = ksi->ksi_code; sf.sf_ah = (uintptr_t)catcher; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ah = (uintptr_t)catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_rax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_rbx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_rcx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_rdx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_rsi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_rdi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = regs->tf_gs; sf.sf_siginfo.si_sc.sc_isp = regs->tf_rsp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_esp = regs->tf_rsp; sf.sf_siginfo.si_sc.sc_ebp = regs->tf_rbp; sf.sf_siginfo.si_sc.sc_eip = regs->tf_rip; sf.sf_siginfo.si_sc.sc_eflags = regs->tf_rflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (uintptr_t)fp; regs->tf_rip = p->p_sysent->sv_psstrings - sz_ia32_osigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); }
static int linux_common_open(struct thread *td, int dirfd, char *path, int l_flags, int mode) { cap_rights_t rights; struct proc *p = td->td_proc; struct file *fp; int fd; int bsd_flags, error; bsd_flags = 0; switch (l_flags & LINUX_O_ACCMODE) { case LINUX_O_WRONLY: bsd_flags |= O_WRONLY; break; case LINUX_O_RDWR: bsd_flags |= O_RDWR; break; default: bsd_flags |= O_RDONLY; } if (l_flags & LINUX_O_NDELAY) bsd_flags |= O_NONBLOCK; if (l_flags & LINUX_O_APPEND) bsd_flags |= O_APPEND; if (l_flags & LINUX_O_SYNC) bsd_flags |= O_FSYNC; if (l_flags & LINUX_O_NONBLOCK) bsd_flags |= O_NONBLOCK; if (l_flags & LINUX_FASYNC) bsd_flags |= O_ASYNC; if (l_flags & LINUX_O_CREAT) bsd_flags |= O_CREAT; if (l_flags & LINUX_O_TRUNC) bsd_flags |= O_TRUNC; if (l_flags & LINUX_O_EXCL) bsd_flags |= O_EXCL; if (l_flags & LINUX_O_NOCTTY) bsd_flags |= O_NOCTTY; if (l_flags & LINUX_O_DIRECT) bsd_flags |= O_DIRECT; if (l_flags & LINUX_O_NOFOLLOW) bsd_flags |= O_NOFOLLOW; if (l_flags & LINUX_O_DIRECTORY) bsd_flags |= O_DIRECTORY; /* XXX LINUX_O_NOATIME: unable to be easily implemented. */ error = kern_openat(td, dirfd, path, UIO_SYSSPACE, bsd_flags, mode); if (error != 0) goto done; if (bsd_flags & O_NOCTTY) goto done; /* * XXX In between kern_open() and fget(), another process * having the same filedesc could use that fd without * checking below. */ fd = td->td_retval[0]; if (fget(td, fd, cap_rights_init(&rights, CAP_IOCTL), &fp) == 0) { if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); goto done; } sx_slock(&proctree_lock); PROC_LOCK(p); if (SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); /* XXXPJD: Verify if TIOCSCTTY is allowed. */ (void) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td->td_ucred, td); } else { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); } fdrop(fp, td); } done: #ifdef DEBUG if (ldebug(open)) printf(LMSG("open returns error %d"), error); #endif LFREEPATH(path); return (error); }
static int trap_pfault(struct trapframe *frame, int user) { vm_offset_t eva, va; struct thread *td; struct proc *p; vm_map_t map; vm_prot_t ftype; int rv; td = curthread; p = td->td_proc; if (frame->exc == EXC_ISI) { eva = frame->srr0; ftype = VM_PROT_READ | VM_PROT_EXECUTE; } else { eva = frame->cpu.booke.dear; if (frame->cpu.booke.esr & ESR_ST) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; } if (user) { KASSERT(p->p_vmspace != NULL, ("trap_pfault: vmspace NULL")); map = &p->p_vmspace->vm_map; } else { if (eva < VM_MAXUSER_ADDRESS) { if (p->p_vmspace == NULL) return (SIGSEGV); map = &p->p_vmspace->vm_map; } else { map = kernel_map; } } va = trunc_page(eva); if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); if (!user && handle_onfault(frame)) return (0); return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); }
void data_abort_handler(trapframe_t *tf) { struct vm_map *map; struct pcb *pcb; struct thread *td; u_int user, far, fsr; vm_prot_t ftype; void *onfault; vm_offset_t va; int error = 0; struct ksig ksig; struct proc *p; /* Grab FAR/FSR before enabling interrupts */ far = cpu_faultaddress(); fsr = cpu_faultstatus(); #if 0 printf("data abort: fault address=%p (from pc=%p lr=%p)\n", (void*)far, (void*)tf->tf_pc, (void*)tf->tf_svc_lr); #endif /* Update vmmeter statistics */ #if 0 vmexp.traps++; #endif td = curthread; p = td->td_proc; PCPU_INC(cnt.v_trap); /* Data abort came from user mode? */ user = TRAP_USERMODE(tf); if (user) { td->td_pticks = 0; td->td_frame = tf; if (td->td_ucred != td->td_proc->p_ucred) cred_update_thread(td); } /* Grab the current pcb */ pcb = td->td_pcb; /* Re-enable interrupts if they were enabled previously */ if (td->td_md.md_spinlock_count == 0) { if (__predict_true(tf->tf_spsr & I32_bit) == 0) enable_interrupts(I32_bit); if (__predict_true(tf->tf_spsr & F32_bit) == 0) enable_interrupts(F32_bit); } /* Invoke the appropriate handler, if necessary */ if (__predict_false(data_aborts[fsr & FAULT_TYPE_MASK].func != NULL)) { if ((data_aborts[fsr & FAULT_TYPE_MASK].func)(tf, fsr, far, td, &ksig)) { goto do_trapsignal; } goto out; } /* * At this point, we're dealing with one of the following data aborts: * * FAULT_TRANS_S - Translation -- Section * FAULT_TRANS_P - Translation -- Page * FAULT_DOMAIN_S - Domain -- Section * FAULT_DOMAIN_P - Domain -- Page * FAULT_PERM_S - Permission -- Section * FAULT_PERM_P - Permission -- Page * * These are the main virtual memory-related faults signalled by * the MMU. */ /* fusubailout is used by [fs]uswintr to avoid page faulting */ if (__predict_false(pcb->pcb_onfault == fusubailout)) { tf->tf_r0 = EFAULT; tf->tf_pc = (register_t)(intptr_t) pcb->pcb_onfault; return; } /* * Make sure the Program Counter is sane. We could fall foul of * someone executing Thumb code, in which case the PC might not * be word-aligned. This would cause a kernel alignment fault * further down if we have to decode the current instruction. * XXX: It would be nice to be able to support Thumb at some point. */ if (__predict_false((tf->tf_pc & 3) != 0)) { if (user) { /* * Give the user an illegal instruction signal. */ /* Deliver a SIGILL to the process */ ksig.signb = SIGILL; ksig.code = 0; goto do_trapsignal; } /* * The kernel never executes Thumb code. */ printf("\ndata_abort_fault: Misaligned Kernel-mode " "Program Counter\n"); dab_fatal(tf, fsr, far, td, &ksig); } /* See if the cpu state needs to be fixed up */ switch (data_abort_fixup(tf, fsr, far, td, &ksig)) { case ABORT_FIXUP_RETURN: return; case ABORT_FIXUP_FAILED: /* Deliver a SIGILL to the process */ ksig.signb = SIGILL; ksig.code = 0; goto do_trapsignal; default: break; } va = trunc_page((vm_offset_t)far); /* * It is only a kernel address space fault iff: * 1. user == 0 and * 2. pcb_onfault not set or * 3. pcb_onfault set and not LDRT/LDRBT/STRT/STRBT instruction. */ if (user == 0 && (va >= VM_MIN_KERNEL_ADDRESS || (va < VM_MIN_ADDRESS && vector_page == ARM_VECTORS_LOW)) && __predict_true((pcb->pcb_onfault == NULL || (ReadWord(tf->tf_pc) & 0x05200000) != 0x04200000))) { map = kernel_map; /* Was the fault due to the FPE/IPKDB ? */ if (__predict_false((tf->tf_spsr & PSR_MODE)==PSR_UND32_MODE)) { /* * Force exit via userret() * This is necessary as the FPE is an extension to * userland that actually runs in a priveledged mode * but uses USR mode permissions for its accesses. */ user = 1; ksig.signb = SIGSEGV; ksig.code = 0; goto do_trapsignal; } } else { map = &td->td_proc->p_vmspace->vm_map; } /* * We need to know whether the page should be mapped * as R or R/W. The MMU does not give us the info as * to whether the fault was caused by a read or a write. * * However, we know that a permission fault can only be * the result of a write to a read-only location, so * we can deal with those quickly. * * Otherwise we need to disassemble the instruction * responsible to determine if it was a write. */ if (IS_PERMISSION_FAULT(fsr)) ftype = VM_PROT_WRITE; else { u_int insn = ReadWord(tf->tf_pc); if (((insn & 0x0c100000) == 0x04000000) || /* STR/STRB */ ((insn & 0x0e1000b0) == 0x000000b0) || /* STRH/STRD */ ((insn & 0x0a100000) == 0x08000000)) { /* STM/CDT */ ftype = VM_PROT_WRITE; } else { if ((insn & 0x0fb00ff0) == 0x01000090) /* SWP */ ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; } } /* * See if the fault is as a result of ref/mod emulation, * or domain mismatch. */ #ifdef DEBUG last_fault_code = fsr; #endif if (pmap_fault_fixup(vmspace_pmap(td->td_proc->p_vmspace), va, ftype, user)) { goto out; } onfault = pcb->pcb_onfault; pcb->pcb_onfault = NULL; if (map != kernel_map) { PROC_LOCK(p); p->p_lock++; PROC_UNLOCK(p); } error = vm_fault(map, va, ftype, VM_FAULT_NORMAL); pcb->pcb_onfault = onfault; if (map != kernel_map) { PROC_LOCK(p); p->p_lock--; PROC_UNLOCK(p); } if (__predict_true(error == 0)) goto out; if (user == 0) { if (pcb->pcb_onfault) { tf->tf_r0 = error; tf->tf_pc = (register_t)(intptr_t) pcb->pcb_onfault; return; } printf("\nvm_fault(%p, %x, %x, 0) -> %x\n", map, va, ftype, error); dab_fatal(tf, fsr, far, td, &ksig); } if (error == ENOMEM) { printf("VM: pid %d (%s), uid %d killed: " "out of swap\n", td->td_proc->p_pid, td->td_name, (td->td_proc->p_ucred) ? td->td_proc->p_ucred->cr_uid : -1); ksig.signb = SIGKILL; } else { ksig.signb = SIGSEGV; } ksig.code = 0; do_trapsignal: call_trapsignal(td, ksig.signb, ksig.code); out: /* If returning to user mode, make sure to invoke userret() */ if (user) userret(td, tf); }
static void pmclog_loop(void *arg) { int error; struct pmc_owner *po; struct pmclog_buffer *lb; struct proc *p; struct ucred *ownercred; struct ucred *mycred; struct thread *td; struct uio auio; struct iovec aiov; size_t nbytes; po = (struct pmc_owner *) arg; p = po->po_owner; td = curthread; mycred = td->td_ucred; PROC_LOCK(p); ownercred = crhold(p->p_ucred); PROC_UNLOCK(p); PMCDBG(LOG,INI,1, "po=%p kt=%p", po, po->po_kthread); KASSERT(po->po_kthread == curthread->td_proc, ("[pmclog,%d] proc mismatch po=%p po/kt=%p curproc=%p", __LINE__, po, po->po_kthread, curthread->td_proc)); lb = NULL; /* * Loop waiting for I/O requests to be added to the owner * struct's queue. The loop is exited when the log file * is deconfigured. */ mtx_lock(&pmc_kthread_mtx); for (;;) { /* check if we've been asked to exit */ if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) break; if (lb == NULL) { /* look for a fresh buffer to write */ mtx_lock_spin(&po->po_mtx); if ((lb = TAILQ_FIRST(&po->po_logbuffers)) == NULL) { mtx_unlock_spin(&po->po_mtx); /* No more buffers and shutdown required. */ if (po->po_flags & PMC_PO_SHUTDOWN) { mtx_unlock(&pmc_kthread_mtx); /* * Close the file to get PMCLOG_EOF * error in pmclog(3). */ fo_close(po->po_file, curthread); mtx_lock(&pmc_kthread_mtx); break; } (void) msleep(po, &pmc_kthread_mtx, PWAIT, "pmcloop", 0); continue; } TAILQ_REMOVE(&po->po_logbuffers, lb, plb_next); mtx_unlock_spin(&po->po_mtx); } mtx_unlock(&pmc_kthread_mtx); /* process the request */ PMCDBG(LOG,WRI,2, "po=%p base=%p ptr=%p", po, lb->plb_base, lb->plb_ptr); /* change our thread's credentials before issuing the I/O */ aiov.iov_base = lb->plb_base; aiov.iov_len = nbytes = lb->plb_ptr - lb->plb_base; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = -1; auio.uio_resid = nbytes; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; /* switch thread credentials -- see kern_ktrace.c */ td->td_ucred = ownercred; error = fo_write(po->po_file, &auio, ownercred, 0, td); td->td_ucred = mycred; if (error) { /* XXX some errors are recoverable */ /* send a SIGIO to the owner and exit */ PROC_LOCK(p); kern_psignal(p, SIGIO); PROC_UNLOCK(p); mtx_lock(&pmc_kthread_mtx); po->po_error = error; /* save for flush log */ PMCDBG(LOG,WRI,2, "po=%p error=%d", po, error); break; } mtx_lock(&pmc_kthread_mtx); /* put the used buffer back into the global pool */ PMCLOG_INIT_BUFFER_DESCRIPTOR(lb); mtx_lock_spin(&pmc_bufferlist_mtx); TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next); mtx_unlock_spin(&pmc_bufferlist_mtx); lb = NULL; } wakeup_one(po->po_kthread); po->po_kthread = NULL; mtx_unlock(&pmc_kthread_mtx); /* return the current I/O buffer to the global pool */ if (lb) { PMCLOG_INIT_BUFFER_DESCRIPTOR(lb); mtx_lock_spin(&pmc_bufferlist_mtx); TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next); mtx_unlock_spin(&pmc_bufferlist_mtx); } /* * Exit this thread, signalling the waiter */ crfree(ownercred); kproc_exit(0); }
/* * Exit: deallocate address space and other resources, change proc state to * zombie, and unlink proc from allproc and parent's lists. Save exit status * and rusage for wait(). Check for child processes and orphan them. */ void exit1(struct thread *td, int rv) { struct proc *p, *nq, *q; struct vnode *vtmp; struct vnode *ttyvp = NULL; struct plimit *plim; mtx_assert(&Giant, MA_NOTOWNED); p = td->td_proc; /* * XXX in case we're rebooting we just let init die in order to * work around an unsolved stack overflow seen very late during * shutdown on sparc64 when the gmirror worker process exists. */ if (p == initproc && rebooting == 0) { printf("init died (signal %d, exit %d)\n", WTERMSIG(rv), WEXITSTATUS(rv)); panic("Going nowhere without my init!"); } /* * MUST abort all other threads before proceeding past here. */ PROC_LOCK(p); while (p->p_flag & P_HADTHREADS) { /* * First check if some other thread got here before us. * If so, act appropriately: exit or suspend. */ thread_suspend_check(0); /* * Kill off the other threads. This requires * some co-operation from other parts of the kernel * so it may not be instantaneous. With this state set * any thread entering the kernel from userspace will * thread_exit() in trap(). Any thread attempting to * sleep will return immediately with EINTR or EWOULDBLOCK * which will hopefully force them to back out to userland * freeing resources as they go. Any thread attempting * to return to userland will thread_exit() from userret(). * thread_exit() will unsuspend us when the last of the * other threads exits. * If there is already a thread singler after resumption, * calling thread_single will fail; in that case, we just * re-check all suspension request, the thread should * either be suspended there or exit. */ if (!thread_single(SINGLE_EXIT)) break; /* * All other activity in this process is now stopped. * Threading support has been turned off. */ } KASSERT(p->p_numthreads == 1, ("exit1: proc %p exiting with %d threads", p, p->p_numthreads)); racct_sub(p, RACCT_NTHR, 1); /* * Wakeup anyone in procfs' PIOCWAIT. They should have a hold * on our vmspace, so we should block below until they have * released their reference to us. Note that if they have * requested S_EXIT stops we will block here until they ack * via PIOCCONT. */ _STOPEVENT(p, S_EXIT, rv); /* * Ignore any pending request to stop due to a stop signal. * Once P_WEXIT is set, future requests will be ignored as * well. */ p->p_flag &= ~P_STOPPED_SIG; KASSERT(!P_SHOULDSTOP(p), ("exiting process is stopped")); /* * Note that we are exiting and do another wakeup of anyone in * PIOCWAIT in case they aren't listening for S_EXIT stops or * decided to wait again after we told them we are exiting. */ p->p_flag |= P_WEXIT; wakeup(&p->p_stype); /* * Wait for any processes that have a hold on our vmspace to * release their reference. */ while (p->p_lock > 0) msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0); p->p_xstat = rv; /* Let event handler change exit status */ PROC_UNLOCK(p); /* Drain the limit callout while we don't have the proc locked */ callout_drain(&p->p_limco); #ifdef AUDIT /* * The Sun BSM exit token contains two components: an exit status as * passed to exit(), and a return value to indicate what sort of exit * it was. The exit status is WEXITSTATUS(rv), but it's not clear * what the return value is. */ AUDIT_ARG_EXIT(WEXITSTATUS(rv), 0); AUDIT_SYSCALL_EXIT(0, td); #endif /* Are we a task leader? */ if (p == p->p_leader) { mtx_lock(&ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); kern_psignal(q, SIGKILL); PROC_UNLOCK(q); q = q->p_peers; } while (p->p_peers != NULL) msleep(p, &ppeers_lock, PWAIT, "exit1", 0); mtx_unlock(&ppeers_lock); } /* * Check if any loadable modules need anything done at process exit. * E.g. SYSV IPC stuff * XXX what if one of these generates an error? */ EVENTHANDLER_INVOKE(process_exit, p); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ PROC_LOCK(p); rv = p->p_xstat; /* Event handler could change exit status */ stopprofclock(p); p->p_flag &= ~(P_TRACED | P_PPWAIT | P_PPTRACE); /* * Stop the real interval timer. If the handler is currently * executing, prevent it from rearming itself and let it finish. */ if (timevalisset(&p->p_realtimer.it_value) && callout_stop(&p->p_itcallout) == 0) { timevalclear(&p->p_realtimer.it_interval); msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0); KASSERT(!timevalisset(&p->p_realtimer.it_value), ("realtime timer is still armed")); } PROC_UNLOCK(p); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pid. */ funsetownlst(&p->p_sigiolst); /* * If this process has an nlminfo data area (for lockd), release it */ if (nlminfo_release_p != NULL && p->p_nlminfo != NULL) (*nlminfo_release_p)(p); /* * Close open files and release open-file table. * This may block! */ fdescfree(td); /* * If this thread tickled GEOM, we need to wait for the giggling to * stop before we return to userland */ if (td->td_pflags & TDP_GEOM) g_waitidle(); /* * Remove ourself from our leader's peer list and wake our leader. */ mtx_lock(&ppeers_lock); if (p->p_leader->p_peers) { q = p->p_leader; while (q->p_peers != p) q = q->p_peers; q->p_peers = p->p_peers; wakeup(p->p_leader); } mtx_unlock(&ppeers_lock); vmspace_exit(td); sx_xlock(&proctree_lock); if (SESS_LEADER(p)) { struct session *sp = p->p_session; struct tty *tp; /* * s_ttyp is not zero'd; we use this to indicate that * the session once had a controlling terminal. (for * logging and informational purposes) */ SESS_LOCK(sp); ttyvp = sp->s_ttyvp; tp = sp->s_ttyp; sp->s_ttyvp = NULL; sp->s_ttydp = NULL; sp->s_leader = NULL; SESS_UNLOCK(sp); /* * Signal foreground pgrp and revoke access to * controlling terminal if it has not been revoked * already. * * Because the TTY may have been revoked in the mean * time and could already have a new session associated * with it, make sure we don't send a SIGHUP to a * foreground process group that does not belong to this * session. */ if (tp != NULL) { tty_lock(tp); if (tp->t_session == sp) tty_signal_pgrp(tp, SIGHUP); tty_unlock(tp); } if (ttyvp != NULL) { sx_xunlock(&proctree_lock); if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) { VOP_REVOKE(ttyvp, REVOKEALL); VOP_UNLOCK(ttyvp, 0); } sx_xlock(&proctree_lock); } } fixjobc(p, p->p_pgrp, 0); sx_xunlock(&proctree_lock); (void)acct_process(td); /* Release the TTY now we've unlocked everything. */ if (ttyvp != NULL) vrele(ttyvp); #ifdef KTRACE ktrprocexit(td); #endif /* * Release reference to text vnode */ if ((vtmp = p->p_textvp) != NULL) { p->p_textvp = NULL; vrele(vtmp); } /* * Release our limits structure. */ plim = p->p_limit; p->p_limit = NULL; lim_free(plim); tidhash_remove(td); /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); LIST_REMOVE(p, p_hash); sx_xunlock(&allproc_lock); /* * Call machine-dependent code to release any * machine-dependent resources other than the address space. * The address space is released by "vmspace_exitfree(p)" in * vm_waitproc(). */ cpu_exit(td); WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid); /* * Reparent all of our children to init. */ sx_xlock(&proctree_lock); q = LIST_FIRST(&p->p_children); if (q != NULL) /* only need this if any child is S_ZOMB */ wakeup(initproc); for (; q != NULL; q = nq) { nq = LIST_NEXT(q, p_sibling); PROC_LOCK(q); proc_reparent(q, initproc); q->p_sigparent = SIGCHLD; /* * Traced processes are killed * since their existence means someone is screwing up. */ if (q->p_flag & P_TRACED) { struct thread *temp; /* * Since q was found on our children list, the * proc_reparent() call moved q to the orphan * list due to present P_TRACED flag. Clear * orphan link for q now while q is locked. */ clear_orphan(q); q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE); FOREACH_THREAD_IN_PROC(q, temp) temp->td_dbgflags &= ~TDB_SUSPEND; kern_psignal(q, SIGKILL); } PROC_UNLOCK(q); } /* * Also get rid of our orphans. */ while ((q = LIST_FIRST(&p->p_orphans)) != NULL) { PROC_LOCK(q); clear_orphan(q); PROC_UNLOCK(q); } /* Save exit status. */ PROC_LOCK(p); p->p_xthread = td; /* Tell the prison that we are gone. */ prison_proc_free(p->p_ucred->cr_prison); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exit if it * has declared an interest. */ if (dtrace_fasttrap_exit) dtrace_fasttrap_exit(p); #endif /* * Notify interested parties of our demise. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXIT); #ifdef KDTRACE_HOOKS int reason = CLD_EXITED; if (WCOREDUMP(rv)) reason = CLD_DUMPED; else if (WIFSIGNALED(rv)) reason = CLD_KILLED; SDT_PROBE(proc, kernel, , exit, reason, 0, 0, 0, 0); #endif /* * Just delete all entries in the p_klist. At this point we won't * report any more events, and there are nasty race conditions that * can beat us if we don't. */ knlist_clear(&p->p_klist, 1); /* * If this is a process with a descriptor, we may not need to deliver * a signal to the parent. proctree_lock is held over * procdesc_exit() to serialize concurrent calls to close() and * exit(). */ if (p->p_procdesc == NULL || procdesc_exit(p)) { /* * Notify parent that we're gone. If parent has the * PS_NOCLDWAIT flag set, or if the handler is set to SIG_IGN, * notify process 1 instead (and hope it will handle this * situation). */ PROC_LOCK(p->p_pptr); mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { struct proc *pp; mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); pp = p->p_pptr; PROC_UNLOCK(pp); proc_reparent(p, initproc); p->p_sigparent = SIGCHLD; PROC_LOCK(p->p_pptr); /* * Notify parent, so in case he was wait(2)ing or * executing waitpid(2) with our pid, he will * continue. */ wakeup(pp); } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr == initproc) kern_psignal(p->p_pptr, SIGCHLD); else if (p->p_sigparent != 0) { if (p->p_sigparent == SIGCHLD) childproc_exited(p); else /* LINUX thread */ kern_psignal(p->p_pptr, p->p_sigparent); } } else PROC_LOCK(p->p_pptr); sx_xunlock(&proctree_lock); /* * The state PRS_ZOMBIE prevents other proesses from sending * signal to the process, to avoid memory leak, we free memory * for signal queue at the time when the state is set. */ sigqueue_flush(&p->p_sigqueue); sigqueue_flush(&td->td_sigqueue); /* * We have to wait until after acquiring all locks before * changing p_state. We need to avoid all possible context * switches (including ones from blocking on a mutex) while * marked as a zombie. We also have to set the zombie state * before we release the parent process' proc lock to avoid * a lost wakeup. So, we first call wakeup, then we grab the * sched lock, update the state, and release the parent process' * proc lock. */ wakeup(p->p_pptr); cv_broadcast(&p->p_pwait); sched_exit(p->p_pptr, td); PROC_SLOCK(p); p->p_state = PRS_ZOMBIE; PROC_UNLOCK(p->p_pptr); /* * Hopefully no one will try to deliver a signal to the process this * late in the game. */ knlist_destroy(&p->p_klist); /* * Save our children's rusage information in our exit rusage. */ ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux); /* * Make sure the scheduler takes this thread out of its tables etc. * This will also release this thread's reference to the ucred. * Other thread parts to release include pcb bits and such. */ thread_exit(); }
/* * Copied from amd64/amd64/machdep.c * * XXX fpu state need? don't think so */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct proc *p; struct l_ucontext uc; struct l_sigcontext *context; struct trapframe *regs; unsigned long rflags; int error; ksiginfo_t ksi; regs = td->td_frame; error = copyin((void *)regs->tf_rbx, &uc, sizeof(uc)); if (error != 0) return (error); p = td->td_proc; context = &uc.uc_mcontext; rflags = context->sc_rflags; /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_rflags for faults. Debuggers * should sometimes set it there too. tf_rflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ #define RFLAG_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) if (!RFLAG_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { printf("linux_rt_sigreturn: rflags = 0x%lx\n", rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { printf("linux_rt_sigreturn: cs = 0x%x\n", context->sc_cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } PROC_LOCK(p); linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask); SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); regs->tf_rdi = context->sc_rdi; regs->tf_rsi = context->sc_rsi; regs->tf_rdx = context->sc_rdx; regs->tf_rbp = context->sc_rbp; regs->tf_rbx = context->sc_rbx; regs->tf_rcx = context->sc_rcx; regs->tf_rax = context->sc_rax; regs->tf_rip = context->sc_rip; regs->tf_rsp = context->sc_rsp; regs->tf_r8 = context->sc_r8; regs->tf_r9 = context->sc_r9; regs->tf_r10 = context->sc_r10; regs->tf_r11 = context->sc_r11; regs->tf_r12 = context->sc_r12; regs->tf_r13 = context->sc_r13; regs->tf_r14 = context->sc_r14; regs->tf_r15 = context->sc_r15; regs->tf_cs = context->sc_cs; regs->tf_err = context->sc_err; regs->tf_rflags = rflags; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (EJUSTRETURN); }
static int procfs_control(struct thread *td, struct proc *p, int op) { int error = 0; struct thread *temp; /* * Attach - attaches the target process for debugging * by the calling process. */ if (op == PROCFS_CTL_ATTACH) { sx_xlock(&proctree_lock); PROC_LOCK(p); if ((error = p_candebug(td, p)) != 0) goto out; if (p->p_flag & P_TRACED) { error = EBUSY; goto out; } /* Can't trace yourself! */ if (p->p_pid == td->td_proc->p_pid) { error = EINVAL; goto out; } /* * Go ahead and set the trace flag. * Save the old parent (it's reset in * _DETACH, and also in kern_exit.c:wait4() * Reparent the process so that the tracing * proc gets to see all the action. * Stop the target. */ p->p_flag |= P_TRACED; faultin(p); p->p_xstat = 0; /* XXX ? */ if (p->p_pptr != td->td_proc) { p->p_oppid = p->p_pptr->p_pid; proc_reparent(p, td->td_proc); } psignal(p, SIGSTOP); out: PROC_UNLOCK(p); sx_xunlock(&proctree_lock); return (error); } /* * Authorization check: rely on normal debugging protection, except * allow processes to disengage debugging on a process onto which * they have previously attached, but no longer have permission to * debug. */ PROC_LOCK(p); if (op != PROCFS_CTL_DETACH && ((error = p_candebug(td, p)))) { PROC_UNLOCK(p); return (error); } /* * Target process must be stopped, owned by (td) and * be set up for tracing (P_TRACED flag set). * Allow DETACH to take place at any time for sanity. * Allow WAIT any time, of course. */ switch (op) { case PROCFS_CTL_DETACH: case PROCFS_CTL_WAIT: break; default: if (!TRACE_WAIT_P(td->td_proc, p)) { PROC_UNLOCK(p); return (EBUSY); } } #ifdef FIX_SSTEP /* * do single-step fixup if needed */ FIX_SSTEP(FIRST_THREAD_IN_PROC(p)); #endif /* * Don't deliver any signal by default. * To continue with a signal, just send * the signal name to the ctl file */ p->p_xstat = 0; switch (op) { /* * Detach. Cleans up the target process, reparent it if possible * and set it running once more. */ case PROCFS_CTL_DETACH: /* if not being traced, then this is a painless no-op */ if ((p->p_flag & P_TRACED) == 0) { PROC_UNLOCK(p); return (0); } /* not being traced any more */ p->p_flag &= ~(P_TRACED | P_STOPPED_TRACE); /* remove pending SIGTRAP, else the process will die */ sigqueue_delete_proc(p, SIGTRAP); FOREACH_THREAD_IN_PROC(p, temp) temp->td_dbgflags &= ~TDB_SUSPEND; PROC_UNLOCK(p); /* give process back to original parent */ sx_xlock(&proctree_lock); if (p->p_oppid != p->p_pptr->p_pid) { struct proc *pp; pp = pfind(p->p_oppid); PROC_LOCK(p); if (pp) { PROC_UNLOCK(pp); proc_reparent(p, pp); } } else PROC_LOCK(p); p->p_oppid = 0; p->p_flag &= ~P_WAITED; /* XXX ? */ sx_xunlock(&proctree_lock); wakeup(td->td_proc); /* XXX for CTL_WAIT below ? */ break; /* * Step. Let the target process execute a single instruction. * What does it mean to single step a threaded program? */ case PROCFS_CTL_STEP: error = proc_sstep(FIRST_THREAD_IN_PROC(p)); if (error) { PROC_UNLOCK(p); return (error); } break; /* * Run. Let the target process continue running until a breakpoint * or some other trap. */ case PROCFS_CTL_RUN: p->p_flag &= ~P_STOPPED_SIG; /* this uses SIGSTOP */ break; /* * Wait for the target process to stop. * If the target is not being traced then just wait * to enter */ case PROCFS_CTL_WAIT: if (p->p_flag & P_TRACED) { while (error == 0 && (P_SHOULDSTOP(p)) && (p->p_flag & P_TRACED) && (p->p_pptr == td->td_proc)) error = msleep(p, &p->p_mtx, PWAIT|PCATCH, "procfsx", 0); if (error == 0 && !TRACE_WAIT_P(td->td_proc, p)) error = EBUSY; } else { while (error == 0 && P_SHOULDSTOP(p)) error = msleep(p, &p->p_mtx, PWAIT|PCATCH, "procfs", 0); } PROC_UNLOCK(p); return (error); default: panic("procfs_control"); } PROC_SLOCK(p); thread_unsuspend(p); /* If it can run, let it do so. */ PROC_SUNLOCK(p); PROC_UNLOCK(p); return (0); }
/* * The CheriABI version of sendsig(9) largely borrows from the MIPS version, * and it is important to keep them in sync. It differs primarily in that it * must also be aware of user stack-handling ABIs, so is also sensitive to our * (fluctuating) design choices in how $stc and $sp interact. The current * design uses ($stc + $sp) for stack-relative references, so early on we have * to calculate a 'relocated' version of $sp that we can then use for * MIPS-style access. * * This code, as with the CHERI-aware MIPS code, makes a privilege * determination in order to decide whether to trust the stack exposed by the * user code for the purposes of signal handling. We must use the alternative * stack if there is any indication that using the user thread's stack state * might violate the userspace compartmentalisation model. */ static void cheriabi_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct proc *p; struct thread *td; struct trapframe *regs; struct sigacts *psp; struct sigframe_c sf, *sfp; uintptr_t stackbase; vm_offset_t sp; int cheri_is_sandboxed; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; /* * In CheriABI, $sp is $stc relative, so calculate a relocation base * that must be combined with regs->sp from this point onwards. * Unfortunately, we won't retain bounds and permissions information * (as is the case elsewhere in CheriABI). While 'stackbase' * suggests that $stc's offset isn't included, in practice it will be, * although we may reasonably assume that it will be zero. * * If it turns out we will be delivering to the alternative signal * stack, we'll recalculate stackbase later. */ CHERI_CLC(CHERI_CR_CTEMP0, CHERI_CR_KDC, &td->td_pcb->pcb_regs.stc, 0); CHERI_CTOPTR(stackbase, CHERI_CR_CTEMP0, CHERI_CR_KDC); oonstack = sigonstack(stackbase + regs->sp); /* * CHERI affects signal delivery in the following ways: * * (1) Additional capability-coprocessor state is exposed via * extensions to the context frame placed on the stack. * * (2) If the user $pcc doesn't include CHERI_PERM_SYSCALL, then we * consider user state to be 'sandboxed' and therefore to require * special delivery handling which includes a domain-switch to the * thread's context-switch domain. (This is done by * cheri_sendsig()). * * (3) If an alternative signal stack is not defined, and we are in a * 'sandboxed' state, then we have two choices: (a) if the signal * is of type SA_SANDBOX_UNWIND, we will automatically unwind the * trusted stack by one frame; (b) otherwise, we will terminate * the process unconditionally. */ cheri_is_sandboxed = cheri_signal_sandboxed(td); /* * We provide the ability to drop into the debugger in two different * circumstances: (1) if the code running is sandboxed; and (2) if the * fault is a CHERI protection fault. Handle both here for the * non-unwind case. Do this before we rewrite any general-purpose or * capability register state for the thread. */ #if DDB if (cheri_is_sandboxed && security_cheri_debugger_on_sandbox_signal) kdb_enter(KDB_WHY_CHERI, "Signal delivery to CHERI sandbox"); else if (sig == SIGPROT && security_cheri_debugger_on_sigprot) kdb_enter(KDB_WHY_CHERI, "SIGPROT delivered outside sandbox"); #endif /* * If a thread is running sandboxed, we can't rely on $sp which may * not point at a valid stack in the ambient context, or even be * maliciously manipulated. We must therefore always use the * alternative stack. We are also therefore unable to tell whether we * are on the alternative stack, so must clear 'oonstack' here. * * XXXRW: This requires significant further thinking; however, the net * upshot is that it is not a good idea to do an object-capability * invoke() from a signal handler, as with so many other things in * life. */ if (cheri_is_sandboxed != 0) oonstack = 0; /* save user context */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; #if 0 /* * XXX-BD: stack_t type differs and we can't just fake a capabilty. * We don't restore the value so what purpose does it serve? */ sf.sf_uc.uc_stack = td->td_sigstk; #endif sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_pc = regs->pc; sf.sf_uc.uc_mcontext.mullo = regs->mullo; sf.sf_uc.uc_mcontext.mulhi = regs->mulhi; cheri_capability_copy(&sf.sf_uc.uc_mcontext.mc_tls, &td->td_md.md_tls_cap); sf.sf_uc.uc_mcontext.mc_regs[0] = UCONTEXT_MAGIC; /* magic number */ bcopy((void *)®s->ast, (void *)&sf.sf_uc.uc_mcontext.mc_regs[1], sizeof(sf.sf_uc.uc_mcontext.mc_regs) - sizeof(register_t)); sf.sf_uc.uc_mcontext.mc_fpused = td->td_md.md_flags & MDTD_FPUSED; #if defined(CPU_HAVEFPU) if (sf.sf_uc.uc_mcontext.mc_fpused) { /* if FPU has current state, save it first */ if (td == PCPU_GET(fpcurthread)) MipsSaveCurFPState(td); bcopy((void *)&td->td_frame->f0, (void *)sf.sf_uc.uc_mcontext.mc_fpregs, sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); } #endif /* XXXRW: sf.sf_uc.uc_mcontext.sr seems never to be set? */ sf.sf_uc.uc_mcontext.cause = regs->cause; cheri_trapframe_to_cheriframe(&td->td_pcb->pcb_regs, &sf.sf_uc.uc_mcontext.mc_cheriframe); /* * Allocate and validate space for the signal handler context. For * CheriABI purposes, 'sp' from this point forward is relocated * relative to any pertinent stack capability. For an alternative * signal context, we need to recalculate stackbase for later use in * calculating a new $sp for the signal-handling context. * * XXXRW: It seems like it would be nice to both the regular and * alternative stack calculations in the same place. However, we need * oonstack sooner. We should clean this up later. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { stackbase = (vm_offset_t)td->td_sigstk.ss_sp; sp = (vm_offset_t)(stackbase + td->td_sigstk.ss_size); } else { /* * Signals delivered when a CHERI sandbox is present must be * delivered on the alternative stack rather than a local one. * If an alternative stack isn't present, then terminate or * risk leaking capabilities (and control) to the sandbox (or * just crashing the sandbox). */ if (cheri_is_sandboxed) { mtx_unlock(&psp->ps_mtx); printf("pid %d, tid %d: signal in sandbox without " "alternative stack defined\n", td->td_proc->p_pid, td->td_tid); sigexit(td, SIGILL); /* NOTREACHED */ } sp = (vm_offset_t)(stackbase + regs->sp); } sp -= sizeof(struct sigframe_c); /* For CHERI, keep the stack pointer capability aligned. */ sp &= ~(CHERICAP_SIZE - 1); sfp = (void *)sp; /* Build the argument list for the signal handler. */ regs->a0 = sig; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* * Signal handler installed with SA_SIGINFO. * * XXXRW: We would ideally synthesise these from the * user-originated stack capability, rather than $kdc, to be * on the safe side. */ cheri_capability_set(®s->c3, CHERI_CAP_USER_DATA_PERMS, (void *)(intptr_t)&sfp->sf_si, sizeof(sfp->sf_si), 0); cheri_capability_set(®s->c4, CHERI_CAP_USER_DATA_PERMS, (void *)(intptr_t)&sfp->sf_uc, sizeof(sfp->sf_uc), 0); /* sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; */ /* fill siginfo structure */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = ksi->ksi_code; /* * Write out badvaddr, but don't create a valid capability * since that might allow privilege amplification. * * XXX-BD: This probably isn't the right method. * XXX-BD: Do we want to set base or offset? * * XXXRW: I think there's some argument that anything * receiving this signal is fairly privileged. But we could * generate a $ddc-relative (or $pcc-relative) capability, if * possible. (Using versions if $ddc and $pcc for the * signal-handling context rather than that which caused the * signal). I'd be tempted to deliver badvaddr as the offset * of that capability. If badvaddr is not in range, then we * should just deliver an untagged NULL-derived version * (perhaps)? */ *((uintptr_t *)&sf.sf_si.si_addr) = (uintptr_t)(void *)regs->badvaddr; } /* * XXX: No support for undocumented arguments to old style handlers. */ mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Copy the sigframe out to the user's stack. */ if (copyoutcap(&sf, (void *)sfp, sizeof(sf)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ PROC_LOCK(p); printf("pid %d, tid %d: could not copy out sigframe\n", td->td_proc->p_pid, td->td_tid); sigexit(td, SIGILL); /* NOTREACHED */ } /* * Re-acquire process locks necessary to access suitable pcb fields. * However, arguably, these operations should be atomic with the * initial inspection of 'psp'. */ PROC_LOCK(p); mtx_lock(&psp->ps_mtx); /* * Install CHERI signal-delivery register state for handler to run * in. As we don't install this in the CHERI frame on the user stack, * it will be (generally) be removed automatically on sigreturn(). */ /* XXX-BD: this isn't quite right */ cheri_sendsig(td); /* * Note that $sp must be installed relative to $stc, so re-subtract * the stack base here. */ regs->pc = (register_t)(intptr_t)catcher; regs->sp = (register_t)((intptr_t)sfp - stackbase); cheri_capability_copy(®s->c12, &psp->ps_sigcap[_SIG_IDX(sig)]); cheri_capability_copy(®s->c17, &td->td_pcb->pcb_cherisignal.csig_sigcode); }
int ext2_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, int *runp, int *runb) { struct inode *ip; struct buf *bp; struct ext2mount *ump; struct mount *mp; struct indir a[NIADDR+1], *ap; daddr_t daddr; e2fs_lbn_t metalbn; int error, num, maxrun = 0, bsize; int *nump; ap = NULL; ip = VTOI(vp); mp = vp->v_mount; ump = VFSTOEXT2(mp); bsize = EXT2_BLOCK_SIZE(ump->um_e2fs); if (runp) { maxrun = mp->mnt_iosize_max / bsize - 1; *runp = 0; } if (runb) { *runb = 0; } ap = a; nump = # error = ext2_getlbns(vp, bn, ap, nump); if (error) return (error); num = *nump; if (num == 0) { *bnp = blkptrtodb(ump, ip->i_db[bn]); if (*bnp == 0) { *bnp = -1; } else if (runp) { daddr_t bnb = bn; for (++bn; bn < NDADDR && *runp < maxrun && is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); ++bn, ++*runp); bn = bnb; if (runb && (bn > 0)) { for (--bn; (bn >= 0) && (*runb < maxrun) && is_sequential(ump, ip->i_db[bn], ip->i_db[bn + 1]); --bn, ++*runb); } } return (0); } /* Get disk address out of indirect block array */ daddr = ip->i_ib[ap->in_off]; for (bp = NULL, ++ap; --num; ++ap) { /* * Exit the loop if there is no disk address assigned yet and * the indirect block isn't in the cache, or if we were * looking for an indirect block and we've found it. */ metalbn = ap->in_lbn; if ((daddr == 0 && !incore(&vp->v_bufobj, metalbn)) || metalbn == bn) break; /* * If we get here, we've either got the block in the cache * or we have a disk address for it, go fetch it. */ if (bp) bqrelse(bp); bp = getblk(vp, metalbn, bsize, 0, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { #ifdef INVARIANTS if (!daddr) panic("ext2_bmaparray: indirect block not in cache"); #endif bp->b_blkno = blkptrtodb(ump, daddr); bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif curthread->td_ru.ru_inblock++; error = bufwait(bp); if (error) { brelse(bp); return (error); } } daddr = ((e2fs_daddr_t *)bp->b_data)[ap->in_off]; if (num == 1 && daddr && runp) { for (bn = ap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ((e2fs_daddr_t *)bp->b_data)[bn - 1], ((e2fs_daddr_t *)bp->b_data)[bn]); ++bn, ++*runp); bn = ap->in_off; if (runb && bn) { for (--bn; bn >= 0 && *runb < maxrun && is_sequential(ump, ((e2fs_daddr_t *)bp->b_data)[bn], ((e2fs_daddr_t *)bp->b_data)[bn + 1]); --bn, ++*runb); } } } if (bp) bqrelse(bp); /* * Since this is FFS independent code, we are out of scope for the * definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they * will fall in the range 1..um_seqinc, so we use that test and * return a request for a zeroed out buffer if attempts are made * to read a BLK_NOCOPY or BLK_SNAP block. */ if ((ip->i_flags & SF_SNAPSHOT) && daddr > 0 && daddr < ump->um_seqinc){ *bnp = -1; return (0); } *bnp = blkptrtodb(ump, daddr); if (*bnp == 0) { *bnp = -1; } return (0); }
static void deadlkres(void) { struct proc *p; struct thread *td; void *wchan; int blkticks, i, slpticks, slptype, tryl, tticks; tryl = 0; for (;;) { blkticks = blktime_threshold * hz; slpticks = slptime_threshold * hz; /* * Avoid to sleep on the sx_lock in order to avoid a possible * priority inversion problem leading to starvation. * If the lock can't be held after 100 tries, panic. */ if (!sx_try_slock(&allproc_lock)) { if (tryl > 100) panic("%s: possible deadlock detected on allproc_lock\n", __func__); tryl++; pause("allproc", sleepfreq * hz); continue; } tryl = 0; FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { /* * Once a thread is found in "interesting" * state a possible ticks wrap-up needs to be * checked. */ thread_lock(td); if (TD_ON_LOCK(td) && ticks < td->td_blktick) { /* * The thread should be blocked on a * turnstile, simply check if the * turnstile channel is in good state. */ MPASS(td->td_blocked != NULL); tticks = ticks - td->td_blktick; thread_unlock(td); if (tticks > blkticks) { /* * Accordingly with provided * thresholds, this thread is * stuck for too long on a * turnstile. */ PROC_UNLOCK(p); sx_sunlock(&allproc_lock); panic("%s: possible deadlock detected for %p, blocked for %d ticks\n", __func__, td, tticks); } } else if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td) && ticks < td->td_blktick) { /* * Check if the thread is sleeping on a * lock, otherwise skip the check. * Drop the thread lock in order to * avoid a LOR with the sleepqueue * spinlock. */ wchan = td->td_wchan; tticks = ticks - td->td_slptick; thread_unlock(td); slptype = sleepq_type(wchan); if ((slptype == SLEEPQ_SX || slptype == SLEEPQ_LK) && tticks > slpticks) { /* * Accordingly with provided * thresholds, this thread is * stuck for too long on a * sleepqueue. * However, being on a * sleepqueue, we might still * check for the blessed * list. */ tryl = 0; for (i = 0; blessed[i] != NULL; i++) { if (!strcmp(blessed[i], td->td_wmesg)) { tryl = 1; break; } } if (tryl != 0) { tryl = 0; continue; } PROC_UNLOCK(p); sx_sunlock(&allproc_lock); panic("%s: possible deadlock detected for %p, blocked for %d ticks\n", __func__, td, tticks); } } else thread_unlock(td); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* Sleep for sleepfreq seconds. */ pause("-", sleepfreq * hz); }
int linux_clone(struct thread *td, struct linux_clone_args *args) { int error, ff = RFPROC | RFSTOPPED; struct proc *p2; struct thread *td2; int exit_signal; struct linux_emuldata *em; #ifdef DEBUG if (ldebug(clone)) { printf(ARGS(clone, "flags %x, stack %p, parent tid: %p, " "child tid: %p"), (unsigned)args->flags, args->stack, args->parent_tidptr, args->child_tidptr); } #endif exit_signal = args->flags & 0x000000ff; if (LINUX_SIG_VALID(exit_signal)) { if (exit_signal <= LINUX_SIGTBLSZ) exit_signal = linux_to_bsd_signal[_SIG_IDX(exit_signal)]; } else if (exit_signal != 0) return (EINVAL); if (args->flags & LINUX_CLONE_VM) ff |= RFMEM; if (args->flags & LINUX_CLONE_SIGHAND) ff |= RFSIGSHARE; /* * XXX: In Linux, sharing of fs info (chroot/cwd/umask) * and open files is independant. In FreeBSD, its in one * structure but in reality it does not cause any problems * because both of these flags are usually set together. */ if (!(args->flags & (LINUX_CLONE_FILES | LINUX_CLONE_FS))) ff |= RFFDG; /* * Attempt to detect when linux_clone(2) is used for creating * kernel threads. Unfortunately despite the existence of the * CLONE_THREAD flag, version of linuxthreads package used in * most popular distros as of beginning of 2005 doesn't make * any use of it. Therefore, this detection relies on * empirical observation that linuxthreads sets certain * combination of flags, so that we can make more or less * precise detection and notify the FreeBSD kernel that several * processes are in fact part of the same threading group, so * that special treatment is necessary for signal delivery * between those processes and fd locking. */ if ((args->flags & 0xffffff00) == LINUX_THREADING_FLAGS) ff |= RFTHREAD; if (args->flags & LINUX_CLONE_PARENT_SETTID) if (args->parent_tidptr == NULL) return (EINVAL); error = fork1(td, ff, 0, &p2, NULL, 0); if (error) return (error); if (args->flags & (LINUX_CLONE_PARENT | LINUX_CLONE_THREAD)) { sx_xlock(&proctree_lock); PROC_LOCK(p2); proc_reparent(p2, td->td_proc->p_pptr); PROC_UNLOCK(p2); sx_xunlock(&proctree_lock); } /* create the emuldata */ error = linux_proc_init(td, p2->p_pid, args->flags); /* reference it - no need to check this */ em = em_find(p2, EMUL_DOLOCK); KASSERT(em != NULL, ("clone: emuldata not found.")); /* and adjust it */ if (args->flags & LINUX_CLONE_THREAD) { #ifdef notyet PROC_LOCK(p2); p2->p_pgrp = td->td_proc->p_pgrp; PROC_UNLOCK(p2); #endif exit_signal = 0; } if (args->flags & LINUX_CLONE_CHILD_SETTID) em->child_set_tid = args->child_tidptr; else em->child_set_tid = NULL; if (args->flags & LINUX_CLONE_CHILD_CLEARTID) em->child_clear_tid = args->child_tidptr; else em->child_clear_tid = NULL; EMUL_UNLOCK(&emul_lock); if (args->flags & LINUX_CLONE_PARENT_SETTID) { error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid)); if (error) printf(LMSG("copyout failed!")); } PROC_LOCK(p2); p2->p_sigparent = exit_signal; PROC_UNLOCK(p2); td2 = FIRST_THREAD_IN_PROC(p2); /* * In a case of stack = NULL, we are supposed to COW calling process * stack. This is what normal fork() does, so we just keep tf_rsp arg * intact. */ if (args->stack) linux_set_upcall_kse(td2, PTROUT(args->stack)); if (args->flags & LINUX_CLONE_SETTLS) linux_set_cloned_tls(td2, args->tls); #ifdef DEBUG if (ldebug(clone)) printf(LMSG("clone: successful rfork to %d, " "stack %p sig = %d"), (int)p2->p_pid, args->stack, exit_signal); #endif if (args->flags & LINUX_CLONE_VFORK) { PROC_LOCK(p2); p2->p_flag |= P_PPWAIT; PROC_UNLOCK(p2); } /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; if (args->flags & LINUX_CLONE_VFORK) { /* wait for the children to exit, ie. emulate vfork */ PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) cv_wait(&p2->p_pwait, &p2->p_mtx); PROC_UNLOCK(p2); } return (0); }
void ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct ia32_sigframe sf, *sfp; struct siginfo32 siginfo; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; char *xfpusave; size_t xfpusave_len; int oonstack; int sig; siginfo_to_siginfo32(&ksi->ksi_info, &siginfo); td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = siginfo.si_signo; psp = p->p_sigacts; #ifdef COMPAT_FREEBSD4 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { freebsd4_ia32_sendsig(catcher, ksi, mask); return; } #endif #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { ia32_osendsig(catcher, ksi, mask); return; } #endif mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) { xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu); xfpusave = __builtin_alloca(xfpusave_len); } else { xfpusave_len = 0; xfpusave = NULL; } /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp; sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_edi = regs->tf_rdi; sf.sf_uc.uc_mcontext.mc_esi = regs->tf_rsi; sf.sf_uc.uc_mcontext.mc_ebp = regs->tf_rbp; sf.sf_uc.uc_mcontext.mc_isp = regs->tf_rsp; /* XXX */ sf.sf_uc.uc_mcontext.mc_ebx = regs->tf_rbx; sf.sf_uc.uc_mcontext.mc_edx = regs->tf_rdx; sf.sf_uc.uc_mcontext.mc_ecx = regs->tf_rcx; sf.sf_uc.uc_mcontext.mc_eax = regs->tf_rax; sf.sf_uc.uc_mcontext.mc_trapno = regs->tf_trapno; sf.sf_uc.uc_mcontext.mc_err = regs->tf_err; sf.sf_uc.uc_mcontext.mc_eip = regs->tf_rip; sf.sf_uc.uc_mcontext.mc_cs = regs->tf_cs; sf.sf_uc.uc_mcontext.mc_eflags = regs->tf_rflags; sf.sf_uc.uc_mcontext.mc_esp = regs->tf_rsp; sf.sf_uc.uc_mcontext.mc_ss = regs->tf_ss; sf.sf_uc.uc_mcontext.mc_ds = regs->tf_ds; sf.sf_uc.uc_mcontext.mc_es = regs->tf_es; sf.sf_uc.uc_mcontext.mc_fs = regs->tf_fs; sf.sf_uc.uc_mcontext.mc_gs = regs->tf_gs; sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ ia32_get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); fpstate_drop(td); sf.sf_uc.uc_mcontext.mc_fsbase = td->td_pcb->pcb_fsbase; sf.sf_uc.uc_mcontext.mc_gsbase = td->td_pcb->pcb_gsbase; bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size; else sp = (char *)regs->tf_rsp; if (xfpusave != NULL) { sp -= xfpusave_len; sp = (char *)((unsigned long)sp & ~0x3Ful); sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; } sp -= sizeof(sf); /* Align to 16 bytes. */ sfp = (struct ia32_sigframe *)((uintptr_t)sp & ~0xF); PROC_UNLOCK(p); /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (u_int32_t)(uintptr_t)&sfp->sf_si; sf.sf_ah = (u_int32_t)(uintptr_t)catcher; /* Fill in POSIX parts */ sf.sf_si = siginfo; sf.sf_si.si_signo = sig; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = siginfo.si_code; sf.sf_addr = (u_int32_t)siginfo.si_addr; sf.sf_ah = (u_int32_t)(uintptr_t)catcher; } mtx_unlock(&psp->ps_mtx); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || (xfpusave != NULL && copyout(xfpusave, PTRIN(sf.sf_uc.uc_mcontext.mc_xfpustate), xfpusave_len) != 0)) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (uintptr_t)sfp; regs->tf_rip = p->p_sysent->sv_sigcode_base; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); /* XXXKIB leave user %fs and %gs untouched */ PROC_LOCK(p); mtx_lock(&psp->ps_mtx); }
static void freebsd4_ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct ia32_sigframe4 sf, *sfp; struct siginfo32 siginfo; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int oonstack; int sig; td = curthread; p = td->td_proc; siginfo_to_siginfo32(&ksi->ksi_info, &siginfo); PROC_LOCK_ASSERT(p, MA_OWNED); sig = siginfo.si_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp; sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_edi = regs->tf_rdi; sf.sf_uc.uc_mcontext.mc_esi = regs->tf_rsi; sf.sf_uc.uc_mcontext.mc_ebp = regs->tf_rbp; sf.sf_uc.uc_mcontext.mc_isp = regs->tf_rsp; /* XXX */ sf.sf_uc.uc_mcontext.mc_ebx = regs->tf_rbx; sf.sf_uc.uc_mcontext.mc_edx = regs->tf_rdx; sf.sf_uc.uc_mcontext.mc_ecx = regs->tf_rcx; sf.sf_uc.uc_mcontext.mc_eax = regs->tf_rax; sf.sf_uc.uc_mcontext.mc_trapno = regs->tf_trapno; sf.sf_uc.uc_mcontext.mc_err = regs->tf_err; sf.sf_uc.uc_mcontext.mc_eip = regs->tf_rip; sf.sf_uc.uc_mcontext.mc_cs = regs->tf_cs; sf.sf_uc.uc_mcontext.mc_eflags = regs->tf_rflags; sf.sf_uc.uc_mcontext.mc_esp = regs->tf_rsp; sf.sf_uc.uc_mcontext.mc_ss = regs->tf_ss; sf.sf_uc.uc_mcontext.mc_ds = regs->tf_ds; sf.sf_uc.uc_mcontext.mc_es = regs->tf_es; sf.sf_uc.uc_mcontext.mc_fs = regs->tf_fs; sf.sf_uc.uc_mcontext.mc_gs = regs->tf_gs; bzero(sf.sf_uc.uc_mcontext.mc_fpregs, sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); bzero(sf.sf_uc.uc_mcontext.__spare__, sizeof(sf.sf_uc.uc_mcontext.__spare__)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct ia32_sigframe4 *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(sf)); } else sfp = (struct ia32_sigframe4 *)regs->tf_rsp - 1; PROC_UNLOCK(p); /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (u_int32_t)(uintptr_t)&sfp->sf_si; sf.sf_ah = (u_int32_t)(uintptr_t)catcher; /* Fill in POSIX parts */ sf.sf_si = siginfo; sf.sf_si.si_signo = sig; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = siginfo.si_code; sf.sf_addr = (u_int32_t)siginfo.si_addr; sf.sf_ah = (u_int32_t)(uintptr_t)catcher; } mtx_unlock(&psp->ps_mtx); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (uintptr_t)sfp; regs->tf_rip = p->p_sysent->sv_sigcode_base + sz_ia32_sigcode - sz_freebsd4_ia32_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); /* leave user %fs and %gs untouched */ PROC_LOCK(p); mtx_lock(&psp->ps_mtx); }
int thread_create(struct thread *td, struct rtprio *rtp, int (*initialize_thread)(struct thread *, void *), void *thunk) { struct thread *newtd; struct proc *p; int error; p = td->td_proc; if (rtp != NULL) { switch(rtp->type) { case RTP_PRIO_REALTIME: case RTP_PRIO_FIFO: /* Only root can set scheduler policy */ if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0) return (EPERM); if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); break; case RTP_PRIO_NORMAL: rtp->prio = 0; break; default: return (EINVAL); } } #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_add(p, RACCT_NTHR, 1); PROC_UNLOCK(p); if (error != 0) return (EPROCLIM); } #endif /* Initialize our td */ error = kern_thr_alloc(p, 0, &newtd); if (error) goto fail; cpu_set_upcall(newtd, td); bzero(&newtd->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); newtd->td_proc = td->td_proc; thread_cow_get(newtd, td); error = initialize_thread(newtd, thunk); if (error != 0) { thread_cow_free(newtd); thread_free(newtd); goto fail; } PROC_LOCK(p); p->p_flag |= P_HADTHREADS; thread_link(newtd, p); bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name)); newtd->td_pax = p->p_pax; thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); thread_unlock(td); if (P_SHOULDSTOP(p)) newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (p->p_flag2 & P2_LWP_EVENTS) newtd->td_dbgflags |= TDB_BORN; /* * Copy the existing thread VM policy into the new thread. */ vm_domain_policy_localcopy(&newtd->td_vm_dom_policy, &td->td_vm_dom_policy); PROC_UNLOCK(p); tidhash_add(newtd); thread_lock(newtd); if (rtp != NULL) { if (!(td->td_pri_class == PRI_TIMESHARE && rtp->type == RTP_PRIO_NORMAL)) { rtp_to_pri(rtp, newtd); sched_prio(newtd, newtd->td_user_pri); } /* ignore timesharing class */ } TD_SET_CAN_RUN(newtd); sched_add(newtd, SRQ_BORING); thread_unlock(newtd); return (0); fail: #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_sub(p, RACCT_NTHR, 1); PROC_UNLOCK(p); } #endif return (error); }
int cheriabi_sysarch(struct thread *td, struct cheriabi_sysarch_args *uap) { struct trapframe *regs = &td->td_pcb->pcb_regs; int error; int parms_from_cap = 1; size_t reqsize; register_t reqperms; /* * The sysarch() fill_uap function is machine-independent so can not * check the validity of the capabilty which becomes uap->parms. As * such, it makes no attempt to convert the result. We need to * perform those checks here. */ switch (uap->op) { case MIPS_SET_TLS: reqsize = 0; reqperms = 0; break; case MIPS_GET_TLS: case CHERI_GET_STACK: case CHERI_GET_TYPECAP: reqsize = sizeof(struct chericap); reqperms = CHERI_PERM_STORE|CHERI_PERM_STORE_CAP; break; case CHERI_SET_STACK: reqsize = sizeof(struct chericap); reqperms = CHERI_PERM_LOAD|CHERI_PERM_LOAD_CAP; break; case CHERI_MMAP_GETBASE: case CHERI_MMAP_GETLEN: case CHERI_MMAP_GETOFFSET: case CHERI_MMAP_GETPERM: case CHERI_MMAP_SETOFFSET: case CHERI_MMAP_SETBOUNDS: reqsize = sizeof(uint64_t); reqperms = CHERI_PERM_STORE; break; case CHERI_MMAP_ANDPERM: reqsize = sizeof(uint64_t); reqperms = CHERI_PERM_LOAD|CHERI_PERM_STORE; break; case MIPS_GET_COUNT: parms_from_cap = 0; break; #ifdef CPU_QEMU_MALTA case QEMU_GET_QTRACE: reqsize = sizeof(int); reqperms = CHERI_PERM_STORE; break; case QEMU_SET_QTRACE: reqsize = sizeof(int); reqperms = CHERI_PERM_LOAD; break; #endif default: return (EINVAL); } if (parms_from_cap) { error = cheriabi_cap_to_ptr(&uap->parms, ®s->c3, reqsize, reqperms, 0); if (error != 0) return (error); } switch (uap->op) { case MIPS_SET_TLS: return (cheriabi_set_user_tls(td, ®s->c3)); case MIPS_GET_TLS: error = copyoutcap(&td->td_md.md_tls_cap, uap->parms, sizeof(struct chericap)); return (error); case CHERI_MMAP_GETBASE: { size_t base; PROC_LOCK(td->td_proc); CHERI_CLC(CHERI_CR_CTEMP0, CHERI_CR_KDC, &td->td_proc->p_md.md_cheri_mmap_cap, 0); CHERI_CGETBASE(base, CHERI_CR_CTEMP0); PROC_UNLOCK(td->td_proc); if (suword64(uap->parms, base) != 0) return (EFAULT); return (0); } case CHERI_MMAP_GETLEN: { size_t len; PROC_LOCK(td->td_proc); CHERI_CLC(CHERI_CR_CTEMP0, CHERI_CR_KDC, &td->td_proc->p_md.md_cheri_mmap_cap, 0); CHERI_CGETLEN(len, CHERI_CR_CTEMP0); PROC_UNLOCK(td->td_proc); if (suword64(uap->parms, len) != 0) return (EFAULT); return (0); } case CHERI_MMAP_GETOFFSET: { ssize_t offset; PROC_LOCK(td->td_proc); CHERI_CLC(CHERI_CR_CTEMP0, CHERI_CR_KDC, &td->td_proc->p_md.md_cheri_mmap_cap, 0); CHERI_CGETOFFSET(offset, CHERI_CR_CTEMP0); PROC_UNLOCK(td->td_proc); if (suword64(uap->parms, offset) != 0) return (EFAULT); return (0); } case CHERI_MMAP_GETPERM: { uint64_t perms; PROC_LOCK(td->td_proc); CHERI_CLC(CHERI_CR_CTEMP0, CHERI_CR_KDC, &td->td_proc->p_md.md_cheri_mmap_cap, 0); CHERI_CGETPERM(perms, CHERI_CR_CTEMP0); PROC_UNLOCK(td->td_proc); if (suword64(uap->parms, perms) != 0) return (EFAULT); return (0); } case CHERI_MMAP_ANDPERM: { uint64_t perms; perms = fuword64(uap->parms); if (perms == -1) return (EINVAL); PROC_LOCK(td->td_proc); CHERI_CLC(CHERI_CR_CTEMP0, CHERI_CR_KDC, &td->td_proc->p_md.md_cheri_mmap_cap, 0); CHERI_CANDPERM(CHERI_CR_CTEMP0, CHERI_CR_CTEMP0, (register_t)perms); CHERI_CSC(CHERI_CR_CTEMP0, CHERI_CR_KDC, &td->td_proc->p_md.md_cheri_mmap_cap, 0); CHERI_CGETPERM(perms, CHERI_CR_CTEMP0); PROC_UNLOCK(td->td_proc); if (suword64(uap->parms, perms) != 0) return (EFAULT); return (0); } case CHERI_MMAP_SETOFFSET: { size_t len; ssize_t offset; offset = fuword64(uap->parms); /* Reject errors and misaligned offsets */ if (offset == -1 || (offset & PAGE_MASK) != 0) return (EINVAL); PROC_LOCK(td->td_proc); CHERI_CLC(CHERI_CR_CTEMP0, CHERI_CR_KDC, &td->td_proc->p_md.md_cheri_mmap_cap, 0); CHERI_CGETLEN(len, CHERI_CR_CTEMP0); /* Don't allow out of bounds offsets, they aren't useful */ if (offset < 0 || offset > len) { PROC_UNLOCK(td->td_proc); return (EINVAL); } CHERI_CSETOFFSET(CHERI_CR_CTEMP0, CHERI_CR_CTEMP0, (register_t)offset); CHERI_CSC(CHERI_CR_CTEMP0, CHERI_CR_KDC, &td->td_proc->p_md.md_cheri_mmap_cap, 0); PROC_UNLOCK(td->td_proc); return (0); } case CHERI_MMAP_SETBOUNDS: { size_t len, olen; ssize_t offset; len = fuword64(uap->parms); /* Reject errors or misaligned lengths */ if (len == (size_t)-1 || (len & PAGE_MASK) != 0) return (EINVAL); PROC_LOCK(td->td_proc); CHERI_CLC(CHERI_CR_CTEMP0, CHERI_CR_KDC, &td->td_proc->p_md.md_cheri_mmap_cap, 0); CHERI_CGETLEN(olen, CHERI_CR_CTEMP0); CHERI_CGETOFFSET(offset, CHERI_CR_CTEMP0); /* Don't try to set out of bounds lengths */ if (offset > olen || len > olen - offset) { PROC_UNLOCK(td->td_proc); return (EINVAL); } CHERI_CSETBOUNDS(CHERI_CR_CTEMP0, CHERI_CR_CTEMP0, (register_t)len); CHERI_CSC(CHERI_CR_CTEMP0, CHERI_CR_KDC, &td->td_proc->p_md.md_cheri_mmap_cap, 0); PROC_UNLOCK(td->td_proc); return (0); } default: return (sysarch(td, (struct sysarch_args*)uap)); } }
void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td; struct proc *p; struct trapframe *tf; struct sigframe *fp, frame; struct sigacts *psp; int code, onstack, sig; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; onstack = sigonstack(tf->tf_sp); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !onstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else { fp = (struct sigframe *)td->td_frame->tf_sp; } /* Make room, keeping the stack aligned */ fp--; fp = (struct sigframe *)STACKALIGN(fp); /* Fill in the frame to copy out */ get_mcontext(td, &frame.sf_uc.uc_mcontext, 0); get_fpcontext(td, &frame.sf_uc.uc_mcontext); frame.sf_si = ksi->ksi_info; frame.sf_uc.uc_sigmask = *mask; frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((onstack) ? SS_ONSTACK : 0) : SS_DISABLE; frame.sf_uc.uc_stack = td->td_sigstk; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(td->td_proc); /* Copy the sigframe out to the user's stack. */ if (copyout(&frame, fp, sizeof(*fp)) != 0) { /* Process has trashed its stack. Kill it. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp); PROC_LOCK(p); sigexit(td, SIGILL); } tf->tf_x[0]= sig; tf->tf_x[1] = (register_t)&fp->sf_si; tf->tf_x[2] = (register_t)&fp->sf_uc; tf->tf_elr = (register_t)catcher; tf->tf_sp = (register_t)fp; tf->tf_lr = (register_t)(PS_STRINGS - *(p->p_sysent->sv_szsigcode)); CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_elr, tf->tf_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); }
static int linux_clone_proc(struct thread *td, struct linux_clone_args *args) { struct fork_req fr; int error, ff = RFPROC | RFSTOPPED; struct proc *p2; struct thread *td2; int exit_signal; struct linux_emuldata *em; #ifdef DEBUG if (ldebug(clone)) { printf(ARGS(clone, "flags %x, stack %p, parent tid: %p, " "child tid: %p"), (unsigned)args->flags, args->stack, args->parent_tidptr, args->child_tidptr); } #endif exit_signal = args->flags & 0x000000ff; if (LINUX_SIG_VALID(exit_signal)) { exit_signal = linux_to_bsd_signal(exit_signal); } else if (exit_signal != 0) return (EINVAL); if (args->flags & LINUX_CLONE_VM) ff |= RFMEM; if (args->flags & LINUX_CLONE_SIGHAND) ff |= RFSIGSHARE; /* * XXX: In Linux, sharing of fs info (chroot/cwd/umask) * and open files is independent. In FreeBSD, its in one * structure but in reality it does not cause any problems * because both of these flags are usually set together. */ if (!(args->flags & (LINUX_CLONE_FILES | LINUX_CLONE_FS))) ff |= RFFDG; if (args->flags & LINUX_CLONE_PARENT_SETTID) if (args->parent_tidptr == NULL) return (EINVAL); if (args->flags & LINUX_CLONE_VFORK) ff |= RFPPWAIT; bzero(&fr, sizeof(fr)); fr.fr_flags = ff; fr.fr_procp = &p2; error = fork1(td, &fr); if (error) return (error); td2 = FIRST_THREAD_IN_PROC(p2); /* create the emuldata */ linux_proc_init(td, td2, args->flags); em = em_find(td2); KASSERT(em != NULL, ("clone_proc: emuldata not found.\n")); if (args->flags & LINUX_CLONE_CHILD_SETTID) em->child_set_tid = args->child_tidptr; else em->child_set_tid = NULL; if (args->flags & LINUX_CLONE_CHILD_CLEARTID) em->child_clear_tid = args->child_tidptr; else em->child_clear_tid = NULL; if (args->flags & LINUX_CLONE_PARENT_SETTID) { error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid)); if (error) printf(LMSG("copyout failed!")); } PROC_LOCK(p2); p2->p_sigparent = exit_signal; PROC_UNLOCK(p2); /* * In a case of stack = NULL, we are supposed to COW calling process * stack. This is what normal fork() does, so we just keep tf_rsp arg * intact. */ linux_set_upcall_kse(td2, PTROUT(args->stack)); if (args->flags & LINUX_CLONE_SETTLS) linux_set_cloned_tls(td2, args->tls); /* * If CLONE_PARENT is set, then the parent of the new process will be * the same as that of the calling process. */ if (args->flags & LINUX_CLONE_PARENT) { sx_xlock(&proctree_lock); PROC_LOCK(p2); proc_reparent(p2, td->td_proc->p_pptr); PROC_UNLOCK(p2); sx_xunlock(&proctree_lock); } #ifdef DEBUG if (ldebug(clone)) printf(LMSG("clone: successful rfork to %d, " "stack %p sig = %d"), (int)p2->p_pid, args->stack, exit_signal); #endif /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); td->td_retval[0] = p2->p_pid; return (0); }
/* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void freebsd32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct proc *p; struct thread *td; struct fpreg32 fpregs; struct reg32 regs; struct sigacts *psp; struct sigframe32 sf, *sfp; int sig; int oonstack; unsigned i; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); fill_regs32(td, ®s); oonstack = sigonstack(td->td_frame->sp); /* save user context */ bzero(&sf, sizeof sf); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack.ss_sp = (int32_t)(intptr_t)td->td_sigstk.ss_sp; sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size; sf.sf_uc.uc_stack.ss_flags = td->td_sigstk.ss_flags; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_pc = regs.r_regs[PC]; sf.sf_uc.uc_mcontext.mullo = regs.r_regs[MULLO]; sf.sf_uc.uc_mcontext.mulhi = regs.r_regs[MULHI]; sf.sf_uc.uc_mcontext.mc_tls = (int32_t)(intptr_t)td->td_md.md_tls; sf.sf_uc.uc_mcontext.mc_regs[0] = UCONTEXT_MAGIC; /* magic number */ for (i = 1; i < 32; i++) sf.sf_uc.uc_mcontext.mc_regs[i] = regs.r_regs[i]; sf.sf_uc.uc_mcontext.mc_fpused = td->td_md.md_flags & MDTD_FPUSED; if (sf.sf_uc.uc_mcontext.mc_fpused) { /* if FPU has current state, save it first */ if (td == PCPU_GET(fpcurthread)) MipsSaveCurFPState(td); fill_fpregs32(td, &fpregs); for (i = 0; i < 33; i++) sf.sf_uc.uc_mcontext.mc_fpregs[i] = fpregs.r_regs[i]; } /* Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe32 *)(((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe32)) & ~(sizeof(__int64_t) - 1)); } else sfp = (struct sigframe32 *)((vm_offset_t)(td->td_frame->sp - sizeof(struct sigframe32)) & ~(sizeof(__int64_t) - 1)); /* Build the argument list for the signal handler. */ td->td_frame->a0 = sig; td->td_frame->a2 = (register_t)(intptr_t)&sfp->sf_uc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ td->td_frame->a1 = (register_t)(intptr_t)&sfp->sf_si; /* sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; */ /* fill siginfo structure */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = ksi->ksi_code; sf.sf_si.si_addr = td->td_frame->badvaddr; } else { /* Old FreeBSD-style arguments. */ td->td_frame->a1 = ksi->ksi_code; td->td_frame->a3 = td->td_frame->badvaddr; /* sf.sf_ahu.sf_handler = catcher; */ } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(struct sigframe32)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ PROC_LOCK(p); sigexit(td, SIGILL); } td->td_frame->pc = (register_t)(intptr_t)catcher; td->td_frame->t9 = (register_t)(intptr_t)catcher; td->td_frame->sp = (register_t)(intptr_t)sfp; /* * Signal trampoline code is at base of user stack. */ td->td_frame->ra = (register_t)(intptr_t)p->p_psstrings - *(p->p_sysent->sv_szsigcode); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); }
static int linux_clone_thread(struct thread *td, struct linux_clone_args *args) { struct linux_emuldata *em; struct thread *newtd; struct proc *p; int error; #ifdef DEBUG if (ldebug(clone)) { printf(ARGS(clone, "thread: flags %x, stack %p, parent tid: %p, " "child tid: %p"), (unsigned)args->flags, args->stack, args->parent_tidptr, args->child_tidptr); } #endif LINUX_CTR4(clone_thread, "thread(%d) flags %x ptid %p ctid %p", td->td_tid, (unsigned)args->flags, args->parent_tidptr, args->child_tidptr); if (args->flags & LINUX_CLONE_PARENT_SETTID) if (args->parent_tidptr == NULL) return (EINVAL); /* Threads should be created with own stack */ if (args->stack == NULL) return (EINVAL); p = td->td_proc; #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_add(p, RACCT_NTHR, 1); PROC_UNLOCK(p); if (error != 0) return (EPROCLIM); } #endif /* Initialize our td */ error = kern_thr_alloc(p, 0, &newtd); if (error) goto fail; cpu_set_upcall(newtd, td); bzero(&newtd->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); newtd->td_proc = p; thread_cow_get(newtd, td); /* create the emuldata */ linux_proc_init(td, newtd, args->flags); em = em_find(newtd); KASSERT(em != NULL, ("clone_thread: emuldata not found.\n")); if (args->flags & LINUX_CLONE_SETTLS) linux_set_cloned_tls(newtd, args->tls); if (args->flags & LINUX_CLONE_CHILD_SETTID) em->child_set_tid = args->child_tidptr; else em->child_set_tid = NULL; if (args->flags & LINUX_CLONE_CHILD_CLEARTID) em->child_clear_tid = args->child_tidptr; else em->child_clear_tid = NULL; cpu_thread_clean(newtd); linux_set_upcall_kse(newtd, PTROUT(args->stack)); PROC_LOCK(p); p->p_flag |= P_HADTHREADS; bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name)); if (args->flags & LINUX_CLONE_PARENT) thread_link(newtd, p->p_pptr); else thread_link(newtd, p); thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); thread_unlock(td); if (P_SHOULDSTOP(p)) newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; PROC_UNLOCK(p); tidhash_add(newtd); #ifdef DEBUG if (ldebug(clone)) printf(ARGS(clone, "successful clone to %d, stack %p"), (int)newtd->td_tid, args->stack); #endif LINUX_CTR2(clone_thread, "thread(%d) successful clone to %d", td->td_tid, newtd->td_tid); if (args->flags & LINUX_CLONE_PARENT_SETTID) { error = copyout(&newtd->td_tid, args->parent_tidptr, sizeof(newtd->td_tid)); if (error) printf(LMSG("clone_thread: copyout failed!")); } /* * Make this runnable after we are finished with it. */ thread_lock(newtd); TD_SET_CAN_RUN(newtd); sched_add(newtd, SRQ_BORING); thread_unlock(newtd); td->td_retval[0] = newtd->td_tid; return (0); fail: #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_sub(p, RACCT_NTHR, 1); PROC_UNLOCK(p); } #endif return (error); }
/* * copied from amd64/amd64/machdep.c * * Send an interrupt to process. */ static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct l_rt_sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; caddr_t sp; struct trapframe *regs; int sig, code; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; code = ksi->ksi_code; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); LINUX_CTR4(rt_sendsig, "%p, %d, %p, %u", catcher, sig, mask, code); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (caddr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe); } else sp = (caddr_t)regs->tf_rsp - sizeof(struct l_rt_sigframe) - 128; /* Align to 16 bytes. */ sfp = (struct l_rt_sigframe *)((unsigned long)sp & ~0xFul); mtx_unlock(&psp->ps_mtx); /* Translate the signal. */ sig = bsd_to_linux_signal(sig); /* Save user context. */ bzero(&sf, sizeof(sf)); bsd_to_linux_sigset(mask, &sf.sf_sc.uc_sigmask); bsd_to_linux_sigset(mask, &sf.sf_sc.uc_mcontext.sc_mask); sf.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); sf.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; sf.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); sf.sf_sc.uc_mcontext.sc_rdi = regs->tf_rdi; sf.sf_sc.uc_mcontext.sc_rsi = regs->tf_rsi; sf.sf_sc.uc_mcontext.sc_rdx = regs->tf_rdx; sf.sf_sc.uc_mcontext.sc_rbp = regs->tf_rbp; sf.sf_sc.uc_mcontext.sc_rbx = regs->tf_rbx; sf.sf_sc.uc_mcontext.sc_rcx = regs->tf_rcx; sf.sf_sc.uc_mcontext.sc_rax = regs->tf_rax; sf.sf_sc.uc_mcontext.sc_rip = regs->tf_rip; sf.sf_sc.uc_mcontext.sc_rsp = regs->tf_rsp; sf.sf_sc.uc_mcontext.sc_r8 = regs->tf_r8; sf.sf_sc.uc_mcontext.sc_r9 = regs->tf_r9; sf.sf_sc.uc_mcontext.sc_r10 = regs->tf_r10; sf.sf_sc.uc_mcontext.sc_r11 = regs->tf_r11; sf.sf_sc.uc_mcontext.sc_r12 = regs->tf_r12; sf.sf_sc.uc_mcontext.sc_r13 = regs->tf_r13; sf.sf_sc.uc_mcontext.sc_r14 = regs->tf_r14; sf.sf_sc.uc_mcontext.sc_r15 = regs->tf_r15; sf.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; sf.sf_sc.uc_mcontext.sc_rflags = regs->tf_rflags; sf.sf_sc.uc_mcontext.sc_err = regs->tf_err; sf.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); sf.sf_sc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr; /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rax = 0; regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ regs->tf_rdx = (register_t)&sfp->sf_sc; /* arg 3 in %rdx */ sf.sf_handler = catcher; /* Fill in POSIX parts */ ksiginfo_to_lsiginfo(ksi, &sf.sf_si, sig); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (long)sfp; regs->tf_rip = linux_rt_sigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); }
static inline int syscallenter(struct thread *td, struct syscall_args *sa) { struct proc *p; int error, traced; PCPU_INC(cnt.v_syscall); p = td->td_proc; td->td_pticks = 0; if (td->td_ucred != p->p_ucred) cred_update_thread(td); if (p->p_flag & P_TRACED) { traced = 1; PROC_LOCK(p); td->td_dbgflags &= ~TDB_USERWR; td->td_dbgflags |= TDB_SCE; PROC_UNLOCK(p); } else traced = 0; error = (p->p_sysent->sv_fetch_syscall_args)(td, sa); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, sa->narg, sa->args); #endif CTR6(KTR_SYSC, "syscall: td=%p pid %d %s (%#lx, %#lx, %#lx)", td, td->td_proc->p_pid, syscallname(p, sa->code), sa->args[0], sa->args[1], sa->args[2]); if (error == 0) { STOPEVENT(p, S_SCE, sa->narg); if (p->p_flag & P_TRACED && p->p_stops & S_PT_SCE) { PROC_LOCK(p); ptracestop((td), SIGTRAP); PROC_UNLOCK(p); } if (td->td_dbgflags & TDB_USERWR) { /* * Reread syscall number and arguments if * debugger modified registers or memory. */ error = (p->p_sysent->sv_fetch_syscall_args)(td, sa); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, sa->narg, sa->args); #endif if (error != 0) goto retval; } #ifdef CAPABILITY_MODE /* * In capability mode, we only allow access to system calls * flagged with SYF_CAPENABLED. */ if (IN_CAPABILITY_MODE(td) && !(sa->callp->sy_flags & SYF_CAPENABLED)) { error = ECAPMODE; goto retval; } #endif error = syscall_thread_enter(td, sa->callp); if (error != 0) goto retval; #ifdef KDTRACE_HOOKS /* * If the systrace module has registered it's probe * callback and if there is a probe active for the * syscall 'entry', process the probe. */ if (systrace_probe_func != NULL && sa->callp->sy_entry != 0) (*systrace_probe_func)(sa->callp->sy_entry, sa->code, sa->callp, sa->args, 0); #endif AUDIT_SYSCALL_ENTER(sa->code, td); error = (sa->callp->sy_call)(td, sa->args); AUDIT_SYSCALL_EXIT(error, td); /* Save the latest error return value. */ td->td_errno = error; #ifdef KDTRACE_HOOKS /* * If the systrace module has registered it's probe * callback and if there is a probe active for the * syscall 'return', process the probe. */ if (systrace_probe_func != NULL && sa->callp->sy_return != 0) (*systrace_probe_func)(sa->callp->sy_return, sa->code, sa->callp, NULL, (error) ? -1 : td->td_retval[0]); #endif syscall_thread_exit(td, sa->callp); CTR4(KTR_SYSC, "syscall: p=%p error=%d return %#lx %#lx", p, error, td->td_retval[0], td->td_retval[1]); } retval: if (traced) { PROC_LOCK(p); td->td_dbgflags &= ~TDB_SCE; PROC_UNLOCK(p); } (p->p_sysent->sv_set_syscall_retval)(td, error); return (error); }
/* * void prefetch_abort_handler(trapframe_t *tf) * * Abort handler called when instruction execution occurs at * a non existent or restricted (access permissions) memory page. * If the address is invalid and we were in SVC mode then panic as * the kernel should never prefetch abort. * If the address is invalid and the page is mapped then the user process * does no have read permission so send it a signal. * Otherwise fault the page in and try again. */ void prefetch_abort_handler(trapframe_t *tf) { struct thread *td; struct proc * p; struct vm_map *map; vm_offset_t fault_pc, va; int error = 0; struct ksig ksig; #if 0 /* Update vmmeter statistics */ uvmexp.traps++; #endif #if 0 printf("prefetch abort handler: %p %p\n", (void*)tf->tf_pc, (void*)tf->tf_usr_lr); #endif td = curthread; p = td->td_proc; PCPU_INC(cnt.v_trap); if (TRAP_USERMODE(tf)) { td->td_frame = tf; if (td->td_ucred != td->td_proc->p_ucred) cred_update_thread(td); } fault_pc = tf->tf_pc; if (td->td_md.md_spinlock_count == 0) { if (__predict_true(tf->tf_spsr & I32_bit) == 0) enable_interrupts(I32_bit); if (__predict_true(tf->tf_spsr & F32_bit) == 0) enable_interrupts(F32_bit); } /* See if the cpu state needs to be fixed up */ switch (prefetch_abort_fixup(tf, &ksig)) { case ABORT_FIXUP_RETURN: return; case ABORT_FIXUP_FAILED: /* Deliver a SIGILL to the process */ ksig.signb = SIGILL; ksig.code = 0; td->td_frame = tf; goto do_trapsignal; default: break; } /* Prefetch aborts cannot happen in kernel mode */ if (__predict_false(!TRAP_USERMODE(tf))) dab_fatal(tf, 0, tf->tf_pc, NULL, &ksig); td->td_pticks = 0; /* Ok validate the address, can only execute in USER space */ if (__predict_false(fault_pc >= VM_MAXUSER_ADDRESS || (fault_pc < VM_MIN_ADDRESS && vector_page == ARM_VECTORS_LOW))) { ksig.signb = SIGSEGV; ksig.code = 0; goto do_trapsignal; } map = &td->td_proc->p_vmspace->vm_map; va = trunc_page(fault_pc); /* * See if the pmap can handle this fault on its own... */ #ifdef DEBUG last_fault_code = -1; #endif if (pmap_fault_fixup(map->pmap, va, VM_PROT_READ, 1)) goto out; if (map != kernel_map) { PROC_LOCK(p); p->p_lock++; PROC_UNLOCK(p); } error = vm_fault(map, va, VM_PROT_READ | VM_PROT_EXECUTE, VM_FAULT_NORMAL); if (map != kernel_map) { PROC_LOCK(p); p->p_lock--; PROC_UNLOCK(p); } if (__predict_true(error == 0)) goto out; if (error == ENOMEM) { printf("VM: pid %d (%s), uid %d killed: " "out of swap\n", td->td_proc->p_pid, td->td_name, (td->td_proc->p_ucred) ? td->td_proc->p_ucred->cr_uid : -1); ksig.signb = SIGKILL; } else { ksig.signb = SIGSEGV; } ksig.code = 0; do_trapsignal: call_trapsignal(td, ksig.signb, ksig.code); out: userret(td, tf); }
void trap(struct trapframe *frame) { #ifdef KDTRACE_HOOKS struct reg regs; #endif struct thread *td = curthread; struct proc *p = td->td_proc; int i = 0, ucode = 0, code; u_int type; register_t addr = 0; vm_offset_t eva; ksiginfo_t ksi; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif PCPU_INC(cnt.v_trap); type = frame->tf_trapno; #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI) { if (ipi_nmi_handler() == 0) goto out; } #endif /* SMP */ #ifdef KDB if (kdb_active) { kdb_reenter(); goto out; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); goto out; } #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI so we check for that first. * If the HWPMC module is active, 'pmc_hook' will point to * the function to be called. A return value of '1' from the * hook means that the NMI was handled by it and that we can * return immediately. */ if (type == T_NMI && pmc_intr && (*pmc_intr)(PCPU_GET(cpuid), frame)) goto out; #endif if (type == T_MCHK) { mca_intr(); goto out; } #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. */ if ((type == T_PROTFLT || type == T_PAGEFLT) && dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type)) goto out; #endif if ((frame->tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (ISPL(frame->tf_cs) == SEL_UPL || (frame->tf_eflags & PSL_VM)) uprintf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); else if (type != T_NMI && type != T_BPTFLT && type != T_TRCTRAP && frame->tf_eip != (int)cpu_switch_load_gs) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * Page faults need interrupts disabled until later, * and we shouldn't enable interrupts while holding * a spin lock. */ if (type != T_PAGEFLT && td->td_md.md_spinlock_count == 0) enable_intr(); } } eva = 0; code = frame->tf_err; if (type == T_PAGEFLT) { /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and conditionally * reenable interrupts. If we hold a spin lock, then * we must not reenable interrupts. This might be a * spurious page fault. */ eva = rcr2(); if (td->td_md.md_spinlock_count == 0) enable_intr(); } if ((ISPL(frame->tf_cs) == SEL_UPL) || ((frame->tf_eflags & PSL_VM) && !(curpcb->pcb_flags & PCB_VM86CALL))) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_eip; if (td->td_ucred != p->p_ucred) cred_update_thread(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ i = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ enable_intr(); #ifdef KDTRACE_HOOKS if (type == T_BPTFLT) { fill_frame_regs(frame, ®s); if (dtrace_pid_probe_ptr != NULL && dtrace_pid_probe_ptr(®s) == 0) goto out; } #endif frame->tf_eflags &= ~PSL_T; i = SIGTRAP; ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT); break; case T_ARITHTRAP: /* arithmetic trap */ #ifdef DEV_NPX ucode = npxtrap_x87(); if (ucode == -1) goto userout; #else ucode = 0; #endif i = SIGFPE; break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame->tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)frame); if (i == 0) goto user; break; } i = SIGBUS; ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR; break; case T_SEGNPFLT: /* segment not present fault */ i = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ i = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: i = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: i = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(frame, TRUE, eva); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (i == -2) { /* * The f00f hack workaround has triggered, so * treat the fault as an illegal instruction * (T_PRIVINFLT) instead of a page fault. */ type = frame->tf_trapno = T_PRIVINFLT; /* Proceed as in that case. */ ucode = ILL_PRVOPC; i = SIGILL; break; } #endif if (i == -1) goto userout; if (i == 0) goto user; if (i == SIGSEGV) ucode = SEGV_MAPERR; else { if (prot_fault_translation == 0) { /* * Autodetect. * This check also covers the images * without the ABI-tag ELF note. */ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && p->p_osrel >= P_OSREL_SIGSEGV) { i = SIGSEGV; ucode = SEGV_ACCERR; } else { i = SIGBUS; ucode = BUS_PAGE_FAULT; } } else if (prot_fault_translation == 1) { /* * Always compat mode. */ i = SIGBUS; ucode = BUS_PAGE_FAULT; } else { /* * Always SIGSEGV mode. */ i = SIGSEGV; ucode = SEGV_ACCERR; } } addr = eva; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(880, hz); lastalert = time_second; } goto userout; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef KDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (kdb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap(type, 0, frame); } #endif /* KDB */ goto userout; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: #ifdef DEV_NPX KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); /* transparent fault (due to context switch "late") */ if (npxdna()) goto userout; #endif uprintf("pid %d killed due to lack of floating point\n", p->p_pid); i = SIGKILL; ucode = 0; break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ #if defined(DEV_NPX) && !defined(CPU_DISABLE_SSE) && defined(I686_CPU) ucode = npxtrap_sse(); if (ucode == -1) goto userout; #else ucode = 0; #endif i = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: enable_intr(); fill_frame_regs(frame, ®s); if (dtrace_return_probe_ptr != NULL && dtrace_return_probe_ptr(®s) == 0) goto out; break; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(frame, FALSE, eva); goto out; case T_DNA: #ifdef DEV_NPX KASSERT(!PCB_USER_FPU(td->td_pcb), ("Unregistered use of FPU in kernel")); if (npxdna()) goto out; #endif break; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* * XXXKIB for now disable any FPU traps in kernel * handler registration seems to be overkill */ trap_fatal(frame, 0); goto out; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame->tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)frame); if (i != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)frame); goto out; } if (type == T_STKFLT) break; /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (curpcb->pcb_flags & PCB_VM86CALL) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame->tf_eip == (int)cpu_switch_load_gs) { curpcb->pcb_gs = 0; #if 0 PROC_LOCK(p); kern_psignal(p, SIGBUS); PROC_UNLOCK(p); #endif goto out; } if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ if (frame->tf_eip == (int)doreti_iret) { frame->tf_eip = (int)doreti_iret_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_ds) { frame->tf_eip = (int)doreti_popl_ds_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_es) { frame->tf_eip = (int)doreti_popl_es_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_fs) { frame->tf_eip = (int)doreti_popl_fs_fault; goto out; } if (curpcb->pcb_onfault != NULL) { frame->tf_eip = (int)curpcb->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_eflags & PSL_NT) { frame->tf_eflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ goto out; } if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame->tf_eflags &= ~PSL_T; goto out; } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap() && !(curpcb->pcb_flags & PCB_VM86CALL)) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); goto out; } /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB if (kdb_trap(type, 0, frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(880, hz); lastalert = time_second; } goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef KDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (kdb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap(type, 0, frame); } #endif /* KDB */ goto out; } else if (panic_on_nmi == 0) goto out; /* FALLTHROUGH */ #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } trap_fatal(frame, eva); goto out; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = i; ksi.ksi_code = ucode; ksi.ksi_addr = (void *)addr; ksi.ksi_trapno = type; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %x code %d type %d " "addr 0x%x esp 0x%08x eip 0x%08x " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr, frame->tf_esp, frame->tf_eip, fubyte((void *)(frame->tf_eip + 0)), fubyte((void *)(frame->tf_eip + 1)), fubyte((void *)(frame->tf_eip + 2)), fubyte((void *)(frame->tf_eip + 3)), fubyte((void *)(frame->tf_eip + 4)), fubyte((void *)(frame->tf_eip + 5)), fubyte((void *)(frame->tf_eip + 6)), fubyte((void *)(frame->tf_eip + 7))); } KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", (u_long)eva); uprintf("\n"); } #endif user: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); userout: out: return; }
int pmclog_configure_log(struct pmc_mdep *md, struct pmc_owner *po, int logfd) { int error; struct proc *p; /* * As long as it is possible to get a LOR between pmc_sx lock and * proctree/allproc sx locks used for adding a new process, assure * the former is not held here. */ sx_assert(&pmc_sx, SA_UNLOCKED); PMCDBG(LOG,CFG,1, "config po=%p logfd=%d", po, logfd); p = po->po_owner; /* return EBUSY if a log file was already present */ if (po->po_flags & PMC_PO_OWNS_LOGFILE) return (EBUSY); KASSERT(po->po_kthread == NULL, ("[pmclog,%d] po=%p kthread (%p) already present", __LINE__, po, po->po_kthread)); KASSERT(po->po_file == NULL, ("[pmclog,%d] po=%p file (%p) already present", __LINE__, po, po->po_file)); /* get a reference to the file state */ error = fget_write(curthread, logfd, CAP_WRITE, &po->po_file); if (error) goto error; /* mark process as owning a log file */ po->po_flags |= PMC_PO_OWNS_LOGFILE; error = kproc_create(pmclog_loop, po, &po->po_kthread, RFHIGHPID, 0, "hwpmc: proc(%d)", p->p_pid); if (error) goto error; /* mark process as using HWPMCs */ PROC_LOCK(p); p->p_flag |= P_HWPMC; PROC_UNLOCK(p); /* create a log initialization entry */ PMCLOG_RESERVE_WITH_ERROR(po, INITIALIZE, sizeof(struct pmclog_initialize)); PMCLOG_EMIT32(PMC_VERSION); PMCLOG_EMIT32(md->pmd_cputype); PMCLOG_DESPATCH(po); return (0); error: /* shutdown the thread */ if (po->po_kthread) pmclog_stop_kthread(po); KASSERT(po->po_kthread == NULL, ("[pmclog,%d] po=%p kthread not " "stopped", __LINE__, po)); if (po->po_file) (void) fdrop(po->po_file, curthread); po->po_file = NULL; /* clear file and error state */ po->po_error = 0; return (error); }
static void linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int oonstack; int sig; int code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(ARGS(rt_sendsig, "%p, %d, %p, %u"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); /* * Build the argument list for the signal handler. */ if (p->p_sysent->sv_sigtbl) if (sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; frame.sf_siginfo = PTROUT(&fp->sf_si); frame.sf_ucontext = PTROUT(&fp->sf_sc); /* Fill in POSIX parts */ ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig); /* * Build the signal context to be used by sigreturn. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = 0; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0]; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs; frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"), frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp, td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask); #endif if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"), fp, oonstack); #endif PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = p->p_sysent->sv_sigcode_base + linux_sznonrtsigcode; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); }
void debugger_getprocs_callback(struct allocation_t* ref) { if (!ref) return; struct message_t* message = __get(ref); if (!message) return; // Only handle requests if (message->header.request != 1) goto cleanup; int(*_sx_slock)(struct sx *sx, int opts, const char *file, int line) = kdlsym(_sx_slock); void(*_sx_sunlock)(struct sx *sx, const char *file, int line) = kdlsym(_sx_sunlock); void(*_mtx_lock_flags)(struct mtx *m, int opts, const char *file, int line) = kdlsym(_mtx_lock_flags); void(*_mtx_unlock_flags)(struct mtx *m, int opts, const char *file, int line) = kdlsym(_mtx_unlock_flags); struct sx* allproclock = (struct sx*)kdlsym(allproc_lock); struct proclist* allproc = (struct proclist*)*(uint64_t*)kdlsym(allproc); void(*vmspace_free)(struct vmspace *) = kdlsym(vmspace_free); struct vmspace* (*vmspace_acquire_ref)(struct proc *) = kdlsym(vmspace_acquire_ref); void(*_vm_map_lock_read)(vm_map_t map, const char *file, int line) = kdlsym(_vm_map_lock_read); void(*_vm_map_unlock_read)(vm_map_t map, const char *file, int line) = kdlsym(_vm_map_unlock_read); uint64_t procCount = 0; struct proc* p = NULL; struct debugger_getprocs_t getproc = { 0 }; sx_slock(allproclock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); // Zero out our process information kmemset(&getproc, 0, sizeof(getproc)); // Get the vm map struct vmspace* vm = vmspace_acquire_ref(p); vm_map_t map = &p->p_vmspace->vm_map; vm_map_lock_read(map); struct vm_map_entry* entry = map->header.next; // Copy over all of the address information getproc.process_id = p->p_pid; getproc.text_address = (uint64_t)entry->start; getproc.text_size = (uint64_t)entry->end - entry->start; getproc.data_address = (uint64_t)p->p_vmspace->vm_daddr; getproc.data_size = p->p_vmspace->vm_dsize; // Copy over the name and path kmemcpy(getproc.process_name, p->p_comm, sizeof(getproc.process_name)); kmemcpy(getproc.path, p->p_elfpath, sizeof(getproc.path)); // Write it back to the PC kwrite(message->socket, &getproc, sizeof(getproc)); procCount++; // Free the vmmap vm_map_unlock_read(map); vmspace_free(vm); PROC_UNLOCK(p); } sx_sunlock(allproclock); // Send finalizer, because f**k this shit kmemset(&getproc, 0xDD, sizeof(getproc)); kwrite(message->socket, &getproc, sizeof(getproc)); cleanup: __dec(ref); }
/* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int oonstack, i; int sig, code; sig = ksi->ksi_signo; code = ksi->ksi_code; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, ksi, mask); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); #ifdef DEBUG if (ldebug(sendsig)) printf(ARGS(sendsig, "%p, %d, %p, %u"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_rsp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Build the argument list for the signal handler. */ if (p->p_sysent->sv_sigtbl) if (sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; bzero(&frame, sizeof(frame)); frame.sf_handler = PTROUT(catcher); frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* * Build the signal context to be used by sigreturn. */ frame.sf_sc.sc_mask = lmask.__bits[0]; frame.sf_sc.sc_gs = regs->tf_gs; frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_rdi; frame.sf_sc.sc_esi = regs->tf_rsi; frame.sf_sc.sc_ebp = regs->tf_rbp; frame.sf_sc.sc_ebx = regs->tf_rbx; frame.sf_sc.sc_edx = regs->tf_rdx; frame.sf_sc.sc_ecx = regs->tf_rcx; frame.sf_sc.sc_eax = regs->tf_rax; frame.sf_sc.sc_eip = regs->tf_rip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_rflags; frame.sf_sc.sc_esp_at_signal = regs->tf_rsp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code); for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) frame.sf_extramask[i] = lmask.__bits[i+1]; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_rsp = PTROUT(fp); regs->tf_rip = p->p_sysent->sv_sigcode_base; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucode32sel; regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); }
static int create_thread(struct thread *td, mcontext_t *ctx, void (*start_func)(void *), void *arg, char *stack_base, size_t stack_size, char *tls_base, long *child_tid, long *parent_tid, int flags, struct rtprio *rtp) { stack_t stack; struct thread *newtd; struct proc *p; int error; p = td->td_proc; /* Have race condition but it is cheap. */ if (p->p_numthreads >= max_threads_per_proc) { ++max_threads_hits; return (EPROCLIM); } if (rtp != NULL) { switch(rtp->type) { case RTP_PRIO_REALTIME: case RTP_PRIO_FIFO: /* Only root can set scheduler policy */ if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0) return (EPERM); if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); break; case RTP_PRIO_NORMAL: rtp->prio = 0; break; default: return (EINVAL); } } #ifdef RACCT PROC_LOCK(td->td_proc); error = racct_add(p, RACCT_NTHR, 1); PROC_UNLOCK(td->td_proc); if (error != 0) return (EPROCLIM); #endif /* Initialize our td */ newtd = thread_alloc(0); if (newtd == NULL) { error = ENOMEM; goto fail; } cpu_set_upcall(newtd, td); /* * Try the copyout as soon as we allocate the td so we don't * have to tear things down in a failure case below. * Here we copy out tid to two places, one for child and one * for parent, because pthread can create a detached thread, * if parent wants to safely access child tid, it has to provide * its storage, because child thread may exit quickly and * memory is freed before parent thread can access it. */ if ((child_tid != NULL && suword_lwpid(child_tid, newtd->td_tid)) || (parent_tid != NULL && suword_lwpid(parent_tid, newtd->td_tid))) { thread_free(newtd); error = EFAULT; goto fail; } bzero(&newtd->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); newtd->td_proc = td->td_proc; newtd->td_ucred = crhold(td->td_ucred); if (ctx != NULL) { /* old way to set user context */ error = set_mcontext(newtd, ctx); if (error != 0) { thread_free(newtd); crfree(td->td_ucred); goto fail; } } else { /* Set up our machine context. */ stack.ss_sp = stack_base; stack.ss_size = stack_size; /* Set upcall address to user thread entry function. */ cpu_set_upcall_kse(newtd, start_func, arg, &stack); /* Setup user TLS address and TLS pointer register. */ error = cpu_set_user_tls(newtd, tls_base); if (error != 0) { thread_free(newtd); crfree(td->td_ucred); goto fail; } } PROC_LOCK(td->td_proc); td->td_proc->p_flag |= P_HADTHREADS; thread_link(newtd, p); bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name)); thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); thread_unlock(td); if (P_SHOULDSTOP(p)) newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; PROC_UNLOCK(p); tidhash_add(newtd); thread_lock(newtd); if (rtp != NULL) { if (!(td->td_pri_class == PRI_TIMESHARE && rtp->type == RTP_PRIO_NORMAL)) { rtp_to_pri(rtp, newtd); sched_prio(newtd, newtd->td_user_pri); } /* ignore timesharing class */ } TD_SET_CAN_RUN(newtd); sched_add(newtd, SRQ_BORING); thread_unlock(newtd); return (0); fail: #ifdef RACCT PROC_LOCK(p); racct_sub(p, RACCT_NTHR, 1); PROC_UNLOCK(p); #endif return (error); }
static void do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread *td2, struct vmspace *vm2, struct file *fp_procdesc) { struct proc *p1, *pptr; int trypid; struct filedesc *fd; struct filedesc_to_leader *fdtol; struct sigacts *newsigacts; sx_assert(&proctree_lock, SX_SLOCKED); sx_assert(&allproc_lock, SX_XLOCKED); p1 = td->td_proc; trypid = fork_findpid(fr->fr_flags); sx_sunlock(&proctree_lock); p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; AUDIT_ARG_PID(p2->p_pid); LIST_INSERT_HEAD(&allproc, p2, p_list); allproc_gen++; LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); tidhash_add(td2); PROC_LOCK(p2); PROC_LOCK(p1); sx_xunlock(&allproc_lock); bcopy(&p1->p_startcopy, &p2->p_startcopy, __rangeof(struct proc, p_startcopy, p_endcopy)); pargs_hold(p2->p_args); PROC_UNLOCK(p1); bzero(&p2->p_startzero, __rangeof(struct proc, p_startzero, p_endzero)); /* Tell the prison that we exist. */ prison_proc_hold(p2->p_ucred->cr_prison); PROC_UNLOCK(p2); /* * Malloc things while we don't hold any locks. */ if (fr->fr_flags & RFSIGSHARE) newsigacts = NULL; else newsigacts = sigacts_alloc(); /* * Copy filedesc. */ if (fr->fr_flags & RFCFDG) { fd = fdinit(p1->p_fd, false); fdtol = NULL; } else if (fr->fr_flags & RFFDG) { fd = fdcopy(p1->p_fd); fdtol = NULL; } else { fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL, p1->p_leader); if ((fr->fr_flags & RFTHREAD) != 0) { /* * Shared file descriptor table, and shared * process leaders. */ fdtol = p1->p_fdtol; FILEDESC_XLOCK(p1->p_fd); fdtol->fdl_refcount++; FILEDESC_XUNLOCK(p1->p_fd); } else { /* * Shared file descriptor table, and different * process leaders. */ fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p1->p_fd, p2); } } /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ PROC_LOCK(p2); PROC_LOCK(p1); bzero(&td2->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name)); td2->td_sigstk = td->td_sigstk; td2->td_flags = TDF_INMEM; td2->td_lend_user_pri = PRI_MAX; #ifdef VIMAGE td2->td_vnet = NULL; td2->td_vnet_lpush = NULL; #endif /* * Allow the scheduler to initialize the child. */ thread_lock(td); sched_fork(td, td2); thread_unlock(td); /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. */ p2->p_flag = P_INMEM; p2->p_flag2 = p1->p_flag2 & (P2_NOTRACE | P2_NOTRACE_EXEC | P2_TRAPCAP); p2->p_swtick = ticks; if (p1->p_flag & P_PROFIL) startprofclock(p2); /* * Whilst the proc lock is held, copy the VM domain data out * using the VM domain method. */ vm_domain_policy_init(&p2->p_vm_dom_policy); vm_domain_policy_localcopy(&p2->p_vm_dom_policy, &p1->p_vm_dom_policy); if (fr->fr_flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } if (fr->fr_flags & RFTSIGZMB) p2->p_sigparent = RFTSIGNUM(fr->fr_flags); else if (fr->fr_flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; p2->p_textvp = p1->p_textvp; p2->p_fd = fd; p2->p_fdtol = fdtol; if (p1->p_flag2 & P2_INHERIT_PROTECTED) { p2->p_flag |= P_PROTECTED; p2->p_flag2 |= P2_INHERIT_PROTECTED; } /* * p_limit is copy-on-write. Bump its refcount. */ lim_fork(p1, p2); thread_cow_get_proc(td2, p2); pstats_fork(p1->p_stats, p2->p_stats); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* Bump references to the text vnode (for procfs). */ if (p2->p_textvp) vrefact(p2->p_textvp); /* * Set up linkage for kernel based threading. */ if ((fr->fr_flags & RFTHREAD) != 0) { mtx_lock(&ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; mtx_unlock(&ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); /* * The task leader is exiting, so process p1 is * going to be killed shortly. Since p1 obviously * isn't dead yet, we know that the leader is either * sending SIGKILL's to all the processes in this * task or is sleeping waiting for all the peers to * exit. We let p1 complete the fork, but we need * to go ahead and kill the new process p2 since * the task leader may not get a chance to send * SIGKILL to it. We leave it on the list so that * the task leader will wait for this new process * to commit suicide. */ PROC_LOCK(p2); kern_psignal(p2, SIGKILL); PROC_UNLOCK(p2); } else PROC_UNLOCK(p1->p_leader); } else { p2->p_peers = NULL; p2->p_leader = p2; } sx_xlock(&proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); /* * Preserve some more flags in subprocess. P_PROFIL has already * been preserved. */ p2->p_flag |= p1->p_flag & P_SUGID; td2->td_pflags |= (td->td_pflags & TDP_ALTSTACK) | TDP_FORKING; SESS_LOCK(p1->p_session); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; SESS_UNLOCK(p1->p_session); if (fr->fr_flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; p2->p_pgrp = p1->p_pgrp; LIST_INSERT_AFTER(p1, p2, p_pglist); PGRP_UNLOCK(p1->p_pgrp); LIST_INIT(&p2->p_children); LIST_INIT(&p2->p_orphans); callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0); /* * If PF_FORK is set, the child process inherits the * procfs ioctl flags from its parent. */ if (p1->p_pfsflags & PF_FORK) { p2->p_stops = p1->p_stops; p2->p_pfsflags = p1->p_pfsflags; } /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if ((fr->fr_flags & RFNOWAIT) != 0) { pptr = p1->p_reaper; p2->p_reaper = pptr; } else { p2->p_reaper = (p1->p_treeflag & P_TREE_REAPER) != 0 ? p1 : p1->p_reaper; pptr = p1; } p2->p_pptr = pptr; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); LIST_INIT(&p2->p_reaplist); LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling); if (p2->p_reaper == p1) p2->p_reapsubtree = p2->p_pid; sx_xunlock(&proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; PROC_UNLOCK(p2); #ifdef KTRACE ktrprocfork(p1, p2); #endif /* * Finish creating the child process. It will return via a different * execution path later. (ie: directly into user mode) */ vm_forkproc(td, p2, td2, vm2, fr->fr_flags); if (fr->fr_flags == (RFFDG | RFPROC)) { VM_CNT_INC(v_forks); VM_CNT_ADD(v_forkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (fr->fr_flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { VM_CNT_INC(v_vforks); VM_CNT_ADD(v_vforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (p1 == &proc0) { VM_CNT_INC(v_kthreads); VM_CNT_ADD(v_kthreadpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else { VM_CNT_INC(v_rforks); VM_CNT_ADD(v_rforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } /* * Associate the process descriptor with the process before anything * can happen that might cause that process to need the descriptor. * However, don't do this until after fork(2) can no longer fail. */ if (fr->fr_flags & RFPROCDESC) procdesc_new(p2, fr->fr_pd_flags); /* * Both processes are set up, now check if any loadable modules want * to adjust anything. */ EVENTHANDLER_INVOKE(process_fork, p1, p2, fr->fr_flags); /* * Set the child start time and mark the process as being complete. */ PROC_LOCK(p2); PROC_LOCK(p1); microuptime(&p2->p_stats->p_start); PROC_SLOCK(p2); p2->p_state = PRS_NORMAL; PROC_SUNLOCK(p2); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the new process so that any * tracepoints inherited from the parent can be removed. We have to do * this only after p_state is PRS_NORMAL since the fasttrap module will * use pfind() later on. */ if ((fr->fr_flags & RFMEM) == 0 && dtrace_fasttrap_fork) dtrace_fasttrap_fork(p1, p2); #endif /* * Hold the process so that it cannot exit after we make it runnable, * but before we wait for the debugger. */ _PHOLD(p2); if (p1->p_ptevents & PTRACE_FORK) { /* * Arrange for debugger to receive the fork event. * * We can report PL_FLAG_FORKED regardless of * P_FOLLOWFORK settings, but it does not make a sense * for runaway child. */ td->td_dbgflags |= TDB_FORK; td->td_dbg_forked = p2->p_pid; td2->td_dbgflags |= TDB_STOPATFORK; } if (fr->fr_flags & RFPPWAIT) { td->td_pflags |= TDP_RFPPWAIT; td->td_rfppwait_p = p2; td->td_dbgflags |= TDB_VFORK; } PROC_UNLOCK(p2); /* * Now can be swapped. */ _PRELE(p1); PROC_UNLOCK(p1); /* * Tell any interested parties about the new process. */ knote_fork(p1->p_klist, p2->p_pid); SDT_PROBE3(proc, , , create, p2, p1, fr->fr_flags); if (fr->fr_flags & RFPROCDESC) { procdesc_finit(p2->p_procdesc, fp_procdesc); fdrop(fp_procdesc, td); } if ((fr->fr_flags & RFSTOPPED) == 0) { /* * If RFSTOPPED not requested, make child runnable and * add to run queue. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); if (fr->fr_pidp != NULL) *fr->fr_pidp = p2->p_pid; } else { *fr->fr_procp = p2; } PROC_LOCK(p2); /* * Wait until debugger is attached to child. */ while (td2->td_proc == p2 && (td2->td_dbgflags & TDB_STOPATFORK) != 0) cv_wait(&p2->p_dbgwait, &p2->p_mtx); _PRELE(p2); racct_proc_fork_done(p2); PROC_UNLOCK(p2); }