/* * Function: unix_syscall * * Inputs: regs - pointer to i386 save area * * Outputs: none */ void unix_syscall(x86_saved_state_t *state) { thread_t thread; void *vt; unsigned int code; struct sysent *callp; int error; vm_offset_t params; struct proc *p; struct uthread *uthread; x86_saved_state32_t *regs; boolean_t is_vfork; assert(is_saved_state32(state)); regs = saved_state32(state); #if DEBUG if (regs->eax == 0x800) thread_exception_return(); #endif thread = current_thread(); uthread = get_bsdthread_info(thread); /* Get the approriate proc; may be different from task's for vfork() */ is_vfork = uthread->uu_flag & UT_VFORK; if (__improbable(is_vfork != 0)) p = current_proc(); else p = (struct proc *)get_bsdtask_info(current_task()); /* Verify that we are not being called from a task without a proc */ if (__improbable(p == NULL)) { regs->eax = EPERM; regs->efl |= EFL_CF; task_terminate_internal(current_task()); thread_exception_return(); /* NOTREACHED */ } code = regs->eax & I386_SYSCALL_NUMBER_MASK; DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n", code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip); params = (vm_offset_t) (regs->uesp + sizeof (int)); regs->efl &= ~(EFL_CF); callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; if (__improbable(callp == sysent)) { code = fuword(params); params += sizeof(int); callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; } vt = (void *)uthread->uu_arg; if (callp->sy_arg_bytes != 0) { #if CONFIG_REQUIRES_U32_MUNGING sy_munge_t *mungerp; #else #error U32 syscalls on x86_64 kernel requires munging #endif uint32_t nargs; assert((unsigned) callp->sy_arg_bytes <= sizeof (uthread->uu_arg)); nargs = callp->sy_arg_bytes; error = copyin((user_addr_t) params, (char *) vt, nargs); if (error) { regs->eax = error; regs->efl |= EFL_CF; thread_exception_return(); /* NOTREACHED */ } if (__probable(code != 180)) { int *ip = (int *)vt; KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, *ip, *(ip+1), *(ip+2), *(ip+3), 0); } #if CONFIG_REQUIRES_U32_MUNGING mungerp = callp->sy_arg_munge32; if (mungerp != NULL) (*mungerp)(vt); #endif } else KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, 0, 0, 0, 0, 0); /* * Delayed binding of thread credential to process credential, if we * are not running with an explicitly set thread credential. */ kauth_cred_uthread_update(uthread, p); uthread->uu_rval[0] = 0; uthread->uu_rval[1] = 0; uthread->uu_flag |= UT_NOTCANCELPT; uthread->syscall_code = code; #ifdef JOE_DEBUG uthread->uu_iocount = 0; uthread->uu_vpindex = 0; #endif AUDIT_SYSCALL_ENTER(code, p, uthread); error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0])); AUDIT_SYSCALL_EXIT(code, p, uthread, error); #ifdef JOE_DEBUG if (uthread->uu_iocount) printf("system call returned with uu_iocount != 0\n"); #endif #if CONFIG_DTRACE uthread->t_dtrace_errno = error; #endif /* CONFIG_DTRACE */ if (__improbable(error == ERESTART)) { /* * Move the user's pc back to repeat the syscall: * 5 bytes for a sysenter, or 2 for an int 8x. * The SYSENTER_TF_CS covers single-stepping over a sysenter * - see debug trap handler in idt.s/idt64.s */ pal_syscall_restart(thread, state); } else if (error != EJUSTRETURN) { if (__improbable(error)) { regs->eax = error; regs->efl |= EFL_CF; /* carry bit */ } else { /* (not error) */ /* * We split retval across two registers, in case the * syscall had a 64-bit return value, in which case * eax/edx matches the function call ABI. */ regs->eax = uthread->uu_rval[0]; regs->edx = uthread->uu_rval[1]; } } DEBUG_KPRINT_SYSCALL_UNIX( "unix_syscall: error=%d retval=(%u,%u)\n", error, regs->eax, regs->edx); uthread->uu_flag &= ~UT_NOTCANCELPT; if (__improbable(uthread->uu_lowpri_window)) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call * collided with normal I/O operations... we'll * delay in order to mitigate the impact of this * task on the normal operation of the system */ throttle_lowpri_io(1); } if (__probable(code != 180)) KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) { pal_execve_return(thread); } thread_exception_return(); /* NOTREACHED */ }
static inline int syscallenter(struct thread *td, struct syscall_args *sa) { struct proc *p; int error, traced; PCPU_INC(cnt.v_syscall); p = td->td_proc; td->td_pticks = 0; if (td->td_ucred != p->p_ucred) cred_update_thread(td); if (p->p_flag & P_TRACED) { traced = 1; PROC_LOCK(p); td->td_dbgflags &= ~TDB_USERWR; td->td_dbgflags |= TDB_SCE; PROC_UNLOCK(p); } else traced = 0; error = (p->p_sysent->sv_fetch_syscall_args)(td, sa); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, sa->narg, sa->args); #endif CTR6(KTR_SYSC, "syscall: td=%p pid %d %s (%#lx, %#lx, %#lx)", td, td->td_proc->p_pid, syscallname(p, sa->code), sa->args[0], sa->args[1], sa->args[2]); if (error == 0) { STOPEVENT(p, S_SCE, sa->narg); if (p->p_flag & P_TRACED && p->p_stops & S_PT_SCE) { PROC_LOCK(p); ptracestop((td), SIGTRAP); PROC_UNLOCK(p); } if (td->td_dbgflags & TDB_USERWR) { /* * Reread syscall number and arguments if * debugger modified registers or memory. */ error = (p->p_sysent->sv_fetch_syscall_args)(td, sa); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, sa->narg, sa->args); #endif if (error != 0) goto retval; } #ifdef CAPABILITY_MODE /* * In capability mode, we only allow access to system calls * flagged with SYF_CAPENABLED. */ if (IN_CAPABILITY_MODE(td) && !(sa->callp->sy_flags & SYF_CAPENABLED)) { error = ECAPMODE; goto retval; } #endif error = syscall_thread_enter(td, sa->callp); if (error != 0) goto retval; #ifdef KDTRACE_HOOKS /* * If the systrace module has registered it's probe * callback and if there is a probe active for the * syscall 'entry', process the probe. */ if (systrace_probe_func != NULL && sa->callp->sy_entry != 0) (*systrace_probe_func)(sa->callp->sy_entry, sa->code, sa->callp, sa->args, 0); #endif AUDIT_SYSCALL_ENTER(sa->code, td); error = (sa->callp->sy_call)(td, sa->args); AUDIT_SYSCALL_EXIT(error, td); /* Save the latest error return value. */ td->td_errno = error; #ifdef KDTRACE_HOOKS /* * If the systrace module has registered it's probe * callback and if there is a probe active for the * syscall 'return', process the probe. */ if (systrace_probe_func != NULL && sa->callp->sy_return != 0) (*systrace_probe_func)(sa->callp->sy_return, sa->code, sa->callp, NULL, (error) ? -1 : td->td_retval[0]); #endif syscall_thread_exit(td, sa->callp); CTR4(KTR_SYSC, "syscall: p=%p error=%d return %#lx %#lx", p, error, td->td_retval[0], td->td_retval[1]); } retval: if (traced) { PROC_LOCK(p); td->td_dbgflags &= ~TDB_SCE; PROC_UNLOCK(p); } (p->p_sysent->sv_set_syscall_retval)(td, error); return (error); }
void unix_syscall64(x86_saved_state_t *state) { thread_t thread; void *vt; unsigned int code; struct sysent *callp; int args_in_regs; boolean_t args_start_at_rdi; int error; struct proc *p; struct uthread *uthread; x86_saved_state64_t *regs; assert(is_saved_state64(state)); regs = saved_state64(state); #if DEBUG if (regs->rax == 0x2000800) thread_exception_return(); #endif thread = current_thread(); uthread = get_bsdthread_info(thread); /* Get the approriate proc; may be different from task's for vfork() */ if (__probable(!(uthread->uu_flag & UT_VFORK))) p = (struct proc *)get_bsdtask_info(current_task()); else p = current_proc(); /* Verify that we are not being called from a task without a proc */ if (__improbable(p == NULL)) { regs->rax = EPERM; regs->isf.rflags |= EFL_CF; task_terminate_internal(current_task()); thread_exception_return(); /* NOTREACHED */ } code = regs->rax & SYSCALL_NUMBER_MASK; DEBUG_KPRINT_SYSCALL_UNIX( "unix_syscall64: code=%d(%s) rip=%llx\n", code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip); callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; vt = (void *)uthread->uu_arg; if (__improbable(callp == sysent)) { /* * indirect system call... system call number * passed as 'arg0' */ code = regs->rdi; callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; args_start_at_rdi = FALSE; args_in_regs = 5; } else { args_start_at_rdi = TRUE; args_in_regs = 6; } if (callp->sy_narg != 0) { assert(callp->sy_narg <= 8); /* size of uu_arg */ args_in_regs = MIN(args_in_regs, callp->sy_narg); memcpy(vt, args_start_at_rdi ? ®s->rdi : ®s->rsi, args_in_regs * sizeof(syscall_arg_t)); if (code != 180) { uint64_t *ip = (uint64_t *)vt; KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, (int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0); } if (__improbable(callp->sy_narg > args_in_regs)) { int copyin_count; copyin_count = (callp->sy_narg - args_in_regs) * sizeof(syscall_arg_t); error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&uthread->uu_arg[args_in_regs], copyin_count); if (error) { regs->rax = error; regs->isf.rflags |= EFL_CF; thread_exception_return(); /* NOTREACHED */ } } } else KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, 0, 0, 0, 0, 0); /* * Delayed binding of thread credential to process credential, if we * are not running with an explicitly set thread credential. */ kauth_cred_uthread_update(uthread, p); uthread->uu_rval[0] = 0; uthread->uu_rval[1] = 0; uthread->uu_flag |= UT_NOTCANCELPT; uthread->syscall_code = code; #ifdef JOE_DEBUG uthread->uu_iocount = 0; uthread->uu_vpindex = 0; #endif AUDIT_SYSCALL_ENTER(code, p, uthread); error = (*(callp->sy_call))((void *) p, vt, &(uthread->uu_rval[0])); AUDIT_SYSCALL_EXIT(code, p, uthread, error); #ifdef JOE_DEBUG if (uthread->uu_iocount) printf("system call returned with uu_iocount != 0\n"); #endif #if CONFIG_DTRACE uthread->t_dtrace_errno = error; #endif /* CONFIG_DTRACE */ if (__improbable(error == ERESTART)) { /* * all system calls come through via the syscall instruction * in 64 bit mode... its 2 bytes in length * move the user's pc back to repeat the syscall: */ pal_syscall_restart( thread, state ); } else if (error != EJUSTRETURN) { if (__improbable(error)) { regs->rax = error; regs->isf.rflags |= EFL_CF; /* carry bit */ } else { /* (not error) */ switch (callp->sy_return_type) { case _SYSCALL_RET_INT_T: regs->rax = uthread->uu_rval[0]; regs->rdx = uthread->uu_rval[1]; break; case _SYSCALL_RET_UINT_T: regs->rax = ((u_int)uthread->uu_rval[0]); regs->rdx = ((u_int)uthread->uu_rval[1]); break; case _SYSCALL_RET_OFF_T: case _SYSCALL_RET_ADDR_T: case _SYSCALL_RET_SIZE_T: case _SYSCALL_RET_SSIZE_T: case _SYSCALL_RET_UINT64_T: regs->rax = *((uint64_t *)(&uthread->uu_rval[0])); regs->rdx = 0; break; case _SYSCALL_RET_NONE: break; default: panic("unix_syscall: unknown return type"); break; } regs->isf.rflags &= ~EFL_CF; } } DEBUG_KPRINT_SYSCALL_UNIX( "unix_syscall64: error=%d retval=(%llu,%llu)\n", error, regs->rax, regs->rdx); uthread->uu_flag &= ~UT_NOTCANCELPT; if (__improbable(uthread->uu_lowpri_window)) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call * collided with normal I/O operations... we'll * delay in order to mitigate the impact of this * task on the normal operation of the system */ throttle_lowpri_io(1); } if (__probable(code != 180)) KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); thread_exception_return(); /* NOTREACHED */ }
void ia32_syscall(struct trapframe *frame) { caddr_t params; int i; struct sysent *callp; struct thread *td = curthread; struct proc *p = td->td_proc; register_t orig_tf_rflags; int error; int narg; u_int32_t args[8]; u_int64_t args64[8]; u_int code; ksiginfo_t ksi; PCPU_INC(cnt.v_syscall); td->td_pticks = 0; td->td_frame = frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); params = (caddr_t)frame->tf_rsp + sizeof(u_int32_t); code = frame->tf_rax; orig_tf_rflags = frame->tf_rflags; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is MP aware. */ (*p->p_sysent->sv_prepsyscall)(frame, args, &code, ¶ms); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. * fuword is MP aware. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ code = fuword32(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. * We use a 32-bit fetch in case params is not * aligned. */ code = fuword32(params); params += sizeof(quad_t); } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg; /* * copyin and the ktrsyscall()/ktrsysret() code is MP-aware */ if (params != NULL && narg != 0) error = copyin(params, (caddr_t)args, (u_int)(narg * sizeof(int))); else error = 0; for (i = 0; i < narg; i++) args64[i] = args[i]; #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, narg, args64); #endif CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td, td->td_proc->p_pid, td->td_proc->p_comm, code); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; STOPEVENT(p, S_SCE, narg); PTRACESTOP_SC(p, td, S_PT_SCE); AUDIT_SYSCALL_ENTER(code, td); error = (*callp->sy_call)(td, args64); AUDIT_SYSCALL_EXIT(error, td); } switch (error) { case 0: frame->tf_rax = td->td_retval[0]; frame->tf_rdx = td->td_retval[1]; frame->tf_rflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes, * int 0x80 is 2 bytes. We saved this in tf_err. */ frame->tf_rip -= frame->tf_err; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } frame->tf_rax = error; frame->tf_rflags |= PSL_C; break; } /* * Traced syscall. */ if (orig_tf_rflags & PSL_T) { frame->tf_rflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)frame->tf_rip; trapsignal(td, &ksi); } /* * Check for misbehavior. */ WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? freebsd32_syscallnames[code] : "???"); KASSERT(td->td_critnest == 0, ("System call %s returning in a critical section", (code >= 0 && code < SYS_MAXSYSCALL) ? freebsd32_syscallnames[code] : "???")); KASSERT(td->td_locks == 0, ("System call %s returning with %d locks held", (code >= 0 && code < SYS_MAXSYSCALL) ? freebsd32_syscallnames[code] : "???", td->td_locks)); /* * Handle reschedule and other end-of-syscall issues */ userret(td, frame); CTR4(KTR_SYSC, "syscall exit thread %p pid %d proc %s code %d", td, td->td_proc->p_pid, td->td_proc->p_comm, code); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); PTRACESTOP_SC(p, td, S_PT_SCX); }