/* * Function: unix_syscall * * Inputs: regs - pointer to i386 save area * * Outputs: none */ void unix_syscall(x86_saved_state_t *state) { thread_t thread; void *vt; unsigned int code; struct sysent *callp; int error; vm_offset_t params; struct proc *p; struct uthread *uthread; x86_saved_state32_t *regs; boolean_t is_vfork; assert(is_saved_state32(state)); regs = saved_state32(state); #if DEBUG if (regs->eax == 0x800) thread_exception_return(); #endif thread = current_thread(); uthread = get_bsdthread_info(thread); /* Get the approriate proc; may be different from task's for vfork() */ is_vfork = uthread->uu_flag & UT_VFORK; if (__improbable(is_vfork != 0)) p = current_proc(); else p = (struct proc *)get_bsdtask_info(current_task()); /* Verify that we are not being called from a task without a proc */ if (__improbable(p == NULL)) { regs->eax = EPERM; regs->efl |= EFL_CF; task_terminate_internal(current_task()); thread_exception_return(); /* NOTREACHED */ } code = regs->eax & I386_SYSCALL_NUMBER_MASK; DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n", code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip); params = (vm_offset_t) (regs->uesp + sizeof (int)); regs->efl &= ~(EFL_CF); callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; if (__improbable(callp == sysent)) { code = fuword(params); params += sizeof(int); callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; } vt = (void *)uthread->uu_arg; if (callp->sy_arg_bytes != 0) { #if CONFIG_REQUIRES_U32_MUNGING sy_munge_t *mungerp; #else #error U32 syscalls on x86_64 kernel requires munging #endif uint32_t nargs; assert((unsigned) callp->sy_arg_bytes <= sizeof (uthread->uu_arg)); nargs = callp->sy_arg_bytes; error = copyin((user_addr_t) params, (char *) vt, nargs); if (error) { regs->eax = error; regs->efl |= EFL_CF; thread_exception_return(); /* NOTREACHED */ } if (__probable(code != 180)) { int *ip = (int *)vt; KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, *ip, *(ip+1), *(ip+2), *(ip+3), 0); } #if CONFIG_REQUIRES_U32_MUNGING mungerp = callp->sy_arg_munge32; if (mungerp != NULL) (*mungerp)(vt); #endif } else KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, 0, 0, 0, 0, 0); /* * Delayed binding of thread credential to process credential, if we * are not running with an explicitly set thread credential. */ kauth_cred_uthread_update(uthread, p); uthread->uu_rval[0] = 0; uthread->uu_rval[1] = 0; uthread->uu_flag |= UT_NOTCANCELPT; uthread->syscall_code = code; #ifdef JOE_DEBUG uthread->uu_iocount = 0; uthread->uu_vpindex = 0; #endif AUDIT_SYSCALL_ENTER(code, p, uthread); error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0])); AUDIT_SYSCALL_EXIT(code, p, uthread, error); #ifdef JOE_DEBUG if (uthread->uu_iocount) printf("system call returned with uu_iocount != 0\n"); #endif #if CONFIG_DTRACE uthread->t_dtrace_errno = error; #endif /* CONFIG_DTRACE */ if (__improbable(error == ERESTART)) { /* * Move the user's pc back to repeat the syscall: * 5 bytes for a sysenter, or 2 for an int 8x. * The SYSENTER_TF_CS covers single-stepping over a sysenter * - see debug trap handler in idt.s/idt64.s */ pal_syscall_restart(thread, state); } else if (error != EJUSTRETURN) { if (__improbable(error)) { regs->eax = error; regs->efl |= EFL_CF; /* carry bit */ } else { /* (not error) */ /* * We split retval across two registers, in case the * syscall had a 64-bit return value, in which case * eax/edx matches the function call ABI. */ regs->eax = uthread->uu_rval[0]; regs->edx = uthread->uu_rval[1]; } } DEBUG_KPRINT_SYSCALL_UNIX( "unix_syscall: error=%d retval=(%u,%u)\n", error, regs->eax, regs->edx); uthread->uu_flag &= ~UT_NOTCANCELPT; if (__improbable(uthread->uu_lowpri_window)) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call * collided with normal I/O operations... we'll * delay in order to mitigate the impact of this * task on the normal operation of the system */ throttle_lowpri_io(1); } if (__probable(code != 180)) KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) { pal_execve_return(thread); } thread_exception_return(); /* NOTREACHED */ }
void unix_syscall64(x86_saved_state_t *state) { thread_t thread; void *vt; unsigned int code; struct sysent *callp; int args_in_regs; boolean_t args_start_at_rdi; int error; struct proc *p; struct uthread *uthread; x86_saved_state64_t *regs; assert(is_saved_state64(state)); regs = saved_state64(state); #if DEBUG if (regs->rax == 0x2000800) thread_exception_return(); #endif thread = current_thread(); uthread = get_bsdthread_info(thread); /* Get the approriate proc; may be different from task's for vfork() */ if (__probable(!(uthread->uu_flag & UT_VFORK))) p = (struct proc *)get_bsdtask_info(current_task()); else p = current_proc(); /* Verify that we are not being called from a task without a proc */ if (__improbable(p == NULL)) { regs->rax = EPERM; regs->isf.rflags |= EFL_CF; task_terminate_internal(current_task()); thread_exception_return(); /* NOTREACHED */ } code = regs->rax & SYSCALL_NUMBER_MASK; DEBUG_KPRINT_SYSCALL_UNIX( "unix_syscall64: code=%d(%s) rip=%llx\n", code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip); callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; vt = (void *)uthread->uu_arg; if (__improbable(callp == sysent)) { /* * indirect system call... system call number * passed as 'arg0' */ code = regs->rdi; callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; args_start_at_rdi = FALSE; args_in_regs = 5; } else { args_start_at_rdi = TRUE; args_in_regs = 6; } if (callp->sy_narg != 0) { assert(callp->sy_narg <= 8); /* size of uu_arg */ args_in_regs = MIN(args_in_regs, callp->sy_narg); memcpy(vt, args_start_at_rdi ? ®s->rdi : ®s->rsi, args_in_regs * sizeof(syscall_arg_t)); if (code != 180) { uint64_t *ip = (uint64_t *)vt; KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, (int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0); } if (__improbable(callp->sy_narg > args_in_regs)) { int copyin_count; copyin_count = (callp->sy_narg - args_in_regs) * sizeof(syscall_arg_t); error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&uthread->uu_arg[args_in_regs], copyin_count); if (error) { regs->rax = error; regs->isf.rflags |= EFL_CF; thread_exception_return(); /* NOTREACHED */ } } } else KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, 0, 0, 0, 0, 0); /* * Delayed binding of thread credential to process credential, if we * are not running with an explicitly set thread credential. */ kauth_cred_uthread_update(uthread, p); uthread->uu_rval[0] = 0; uthread->uu_rval[1] = 0; uthread->uu_flag |= UT_NOTCANCELPT; uthread->syscall_code = code; #ifdef JOE_DEBUG uthread->uu_iocount = 0; uthread->uu_vpindex = 0; #endif AUDIT_SYSCALL_ENTER(code, p, uthread); error = (*(callp->sy_call))((void *) p, vt, &(uthread->uu_rval[0])); AUDIT_SYSCALL_EXIT(code, p, uthread, error); #ifdef JOE_DEBUG if (uthread->uu_iocount) printf("system call returned with uu_iocount != 0\n"); #endif #if CONFIG_DTRACE uthread->t_dtrace_errno = error; #endif /* CONFIG_DTRACE */ if (__improbable(error == ERESTART)) { /* * all system calls come through via the syscall instruction * in 64 bit mode... its 2 bytes in length * move the user's pc back to repeat the syscall: */ pal_syscall_restart( thread, state ); } else if (error != EJUSTRETURN) { if (__improbable(error)) { regs->rax = error; regs->isf.rflags |= EFL_CF; /* carry bit */ } else { /* (not error) */ switch (callp->sy_return_type) { case _SYSCALL_RET_INT_T: regs->rax = uthread->uu_rval[0]; regs->rdx = uthread->uu_rval[1]; break; case _SYSCALL_RET_UINT_T: regs->rax = ((u_int)uthread->uu_rval[0]); regs->rdx = ((u_int)uthread->uu_rval[1]); break; case _SYSCALL_RET_OFF_T: case _SYSCALL_RET_ADDR_T: case _SYSCALL_RET_SIZE_T: case _SYSCALL_RET_SSIZE_T: case _SYSCALL_RET_UINT64_T: regs->rax = *((uint64_t *)(&uthread->uu_rval[0])); regs->rdx = 0; break; case _SYSCALL_RET_NONE: break; default: panic("unix_syscall: unknown return type"); break; } regs->isf.rflags &= ~EFL_CF; } } DEBUG_KPRINT_SYSCALL_UNIX( "unix_syscall64: error=%d retval=(%llu,%llu)\n", error, regs->rax, regs->rdx); uthread->uu_flag &= ~UT_NOTCANCELPT; if (__improbable(uthread->uu_lowpri_window)) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call * collided with normal I/O operations... we'll * delay in order to mitigate the impact of this * task on the normal operation of the system */ throttle_lowpri_io(1); } if (__probable(code != 180)) KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); thread_exception_return(); /* NOTREACHED */ }
kern_return_t task_name_for_pid( struct task_name_for_pid_args *args) { mach_port_name_t target_tport = args->target_tport; int pid = args->pid; user_addr_t task_addr = args->t; struct uthread *uthread; proc_t p = PROC_NULL; task_t t1; mach_port_name_t tret; void * sright; int error = 0, refheld = 0; kauth_cred_t target_cred; AUDIT_MACH_SYSCALL_ENTER(AUE_TASKNAMEFORPID); AUDIT_ARG(pid, pid); AUDIT_ARG(mach_port1, target_tport); t1 = port_name_to_task(target_tport); if (t1 == TASK_NULL) { (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t)); AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE); return(KERN_FAILURE); } /* * Delayed binding of thread credential to process credential, if we * are not running with an explicitly set thread credential. */ uthread = get_bsdthread_info(current_thread()); kauth_cred_uthread_update(uthread, current_proc()); p = proc_find(pid); AUDIT_ARG(process, p); if (p != PROC_NULL) { target_cred = kauth_cred_proc_ref(p); refheld = 1; if ((p->p_stat != SZOMB) && ((current_proc() == p) || kauth_cred_issuser(kauth_cred_get()) || ((kauth_cred_getuid(target_cred) == kauth_cred_getuid(kauth_cred_get())) && ((target_cred->cr_ruid == kauth_cred_get()->cr_ruid))))) { if (p->task != TASK_NULL) { task_reference(p->task); #if CONFIG_MACF error = mac_proc_check_get_task_name(kauth_cred_get(), p); if (error) { task_deallocate(p->task); goto noperm; } #endif sright = (void *)convert_task_name_to_port(p->task); tret = ipc_port_copyout_send(sright, get_task_ipcspace(current_task())); } else tret = MACH_PORT_NULL; AUDIT_ARG(mach_port2, tret); (void) copyout((char *)&tret, task_addr, sizeof(mach_port_name_t)); task_deallocate(t1); error = KERN_SUCCESS; goto tnfpout; } } #if CONFIG_MACF noperm: #endif task_deallocate(t1); tret = MACH_PORT_NULL; (void) copyout((char *) &tret, task_addr, sizeof(mach_port_name_t)); error = KERN_FAILURE; tnfpout: if (refheld != 0) kauth_cred_unref(&target_cred); if (p != PROC_NULL) proc_rele(p); AUDIT_MACH_SYSCALL_EXIT(error); return(error); }
/* * Routine: task_for_pid * Purpose: * Get the task port for another "process", named by its * process ID on the same host as "target_task". * * Only permitted to privileged processes, or processes * with the same user ID. * * XXX This should be a BSD system call, not a Mach trap!!! */ kern_return_t task_for_pid( struct task_for_pid_args *args) { mach_port_name_t target_tport = args->target_tport; int pid = args->pid; user_addr_t task_addr = args->t; struct uthread *uthread; proc_t p = PROC_NULL; task_t t1 = TASK_NULL; mach_port_name_t tret = MACH_PORT_NULL; ipc_port_t tfpport; void * sright; int error = 0; AUDIT_MACH_SYSCALL_ENTER(AUE_TASKFORPID); AUDIT_ARG(pid, pid); AUDIT_ARG(mach_port1, target_tport); #if defined(SECURE_KERNEL) if (0 == pid) { (void ) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t)); AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE); return(KERN_FAILURE); } #endif t1 = port_name_to_task(target_tport); if (t1 == TASK_NULL) { (void) copyout((char *)&t1, task_addr, sizeof(mach_port_name_t)); AUDIT_MACH_SYSCALL_EXIT(KERN_FAILURE); return(KERN_FAILURE); } /* * Delayed binding of thread credential to process credential, if we * are not running with an explicitly set thread credential. */ uthread = get_bsdthread_info(current_thread()); kauth_cred_uthread_update(uthread, current_proc()); p = proc_find(pid); AUDIT_ARG(process, p); if (!(task_for_pid_posix_check(p))) { error = KERN_FAILURE; goto tfpout; } if (p->task != TASK_NULL) { /* If we aren't root and target's task access port is set... */ if (!kauth_cred_issuser(kauth_cred_get()) && p != current_proc() && (task_get_task_access_port(p->task, &tfpport) == 0) && (tfpport != IPC_PORT_NULL)) { if (tfpport == IPC_PORT_DEAD) { error = KERN_PROTECTION_FAILURE; goto tfpout; } /* Call up to the task access server */ error = check_task_access(tfpport, proc_selfpid(), kauth_getgid(), pid); if (error != MACH_MSG_SUCCESS) { if (error == MACH_RCV_INTERRUPTED) error = KERN_ABORTED; else error = KERN_FAILURE; goto tfpout; } } #if CONFIG_MACF error = mac_proc_check_get_task(kauth_cred_get(), p); if (error) { error = KERN_FAILURE; goto tfpout; } #endif /* Grant task port access */ task_reference(p->task); sright = (void *) convert_task_to_port(p->task); tret = ipc_port_copyout_send( sright, get_task_ipcspace(current_task())); } error = KERN_SUCCESS; tfpout: task_deallocate(t1); AUDIT_ARG(mach_port2, tret); (void) copyout((char *) &tret, task_addr, sizeof(mach_port_name_t)); if (p != PROC_NULL) proc_rele(p); AUDIT_MACH_SYSCALL_EXIT(error); return(error); }
kern_return_t dtrace_user_probe(arm_saved_state_t *regs, unsigned int instr) { /* * FIXME * * The only call path into this method is always a user trap. * We don't need to test for user trap, but should assert it. */ lck_rw_t *rwp; struct proc *p = current_proc(); uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); kauth_cred_uthread_update(uthread, p); if (((regs->cpsr & PSR_TF) && ((uint16_t) instr) == FASTTRAP_THUMB_RET_INSTR) || ((uint32_t) instr == FASTTRAP_ARM_RET_INSTR)) { uint8_t step = uthread->t_dtrace_step; uint8_t ret = uthread->t_dtrace_ret; user_addr_t npc = uthread->t_dtrace_npc; if (uthread->t_dtrace_ast) { printf("dtrace_user_probe() should be calling aston()\n"); // aston(thread); // uthread->t_sig_check = 1; } /* * Clear all user tracing flags. */ uthread->t_dtrace_ft = 0; /* * If we weren't expecting to take a return probe trap, kill * the process as though it had just executed an unassigned * trap instruction. */ if (step == 0) { /* * APPLE NOTE: We're returning KERN_FAILURE, which causes * the generic signal handling code to take over, which will effectively * deliver a EXC_BAD_INSTRUCTION to the user process. */ return KERN_FAILURE; } /* * If we hit this trap unrelated to a return probe, we're * just here to reset the AST flag since we deferred a signal * until after we logically single-stepped the instruction we * copied out. */ if (ret == 0) { regs->pc = npc; return KERN_SUCCESS; } /* * We need to wait until after we've called the * dtrace_return_probe_ptr function pointer to step the pc. */ rwp = &CPU->cpu_ft_lock; lck_rw_lock_shared(rwp); if (dtrace_return_probe_ptr != NULL) (void) (*dtrace_return_probe_ptr)(regs); lck_rw_unlock_shared(rwp); regs->pc = npc; return KERN_SUCCESS; } else { rwp = &CPU->cpu_ft_lock; /* * The DTrace fasttrap provider uses a trap, * FASTTRAP_{ARM,THUMB}_INSTR. We let * DTrace take the first crack at handling * this trap; if it's not a probe that DTrace knows about, * we call into the trap() routine to handle it like a * breakpoint placed by a conventional debugger. */ /* * APPLE NOTE: I believe the purpose of the reader/writers lock * is thus: There are times which dtrace needs to prevent calling * dtrace_pid_probe_ptr(). Sun's original impl grabbed a plain * mutex here. However, that serialized all probe calls, and * destroyed MP behavior. So now they use a RW lock, with probes * as readers, and the top level synchronization as a writer. */ lck_rw_lock_shared(rwp); if (dtrace_pid_probe_ptr != NULL && (*dtrace_pid_probe_ptr)(regs) == 0) { lck_rw_unlock_shared(rwp); return KERN_SUCCESS; } lck_rw_unlock_shared(rwp); /* * If the instruction that caused the breakpoint trap doesn't * look like our trap anymore, it may be that this tracepoint * was removed just after the user thread executed it. In * that case, return to user land to retry the instuction. * * Note that the PC points to the instruction that caused the fault. */ if (regs->cpsr & PSR_TF) { uint16_t instr_check; if (fuword16(regs->pc, &instr_check) == 0 && instr_check != FASTTRAP_THUMB_INSTR) { return KERN_SUCCESS; } } else { uint32_t instr_check; if (fuword32(regs->pc, &instr_check) == 0 && instr_check != FASTTRAP_ARM_INSTR) { return KERN_SUCCESS; } } } return KERN_FAILURE; }
kern_return_t dtrace_user_probe(x86_saved_state_t *regs) { x86_saved_state64_t *regs64; x86_saved_state32_t *regs32; int trapno; /* * FIXME! * * The only call path into this method is always a user trap. * We don't need to test for user trap, but should assert it. */ boolean_t user_mode = TRUE; if (is_saved_state64(regs) == TRUE) { regs64 = saved_state64(regs); regs32 = NULL; trapno = regs64->isf.trapno; user_mode = TRUE; // By default, because xnu is 32 bit only } else { regs64 = NULL; regs32 = saved_state32(regs); if (regs32->cs & 0x03) user_mode = TRUE; trapno = regs32->trapno; } lck_rw_t *rwp; struct proc *p = current_proc(); uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); if (user_mode /*|| (rp->r_ps & PS_VM)*/) { /* * DTrace accesses t_cred in probe context. t_cred * must always be either NULL, or point to a valid, * allocated cred structure. */ kauth_cred_uthread_update(uthread, p); } if (trapno == T_DTRACE_RET) { uint8_t step = uthread->t_dtrace_step; uint8_t ret = uthread->t_dtrace_ret; user_addr_t npc = uthread->t_dtrace_npc; if (uthread->t_dtrace_ast) { printf("dtrace_user_probe() should be calling aston()\n"); // aston(uthread); // uthread->t_sig_check = 1; } /* * Clear all user tracing flags. */ uthread->t_dtrace_ft = 0; /* * If we weren't expecting to take a return probe trap, kill * the process as though it had just executed an unassigned * trap instruction. */ if (step == 0) { /* * APPLE NOTE: We're returning KERN_FAILURE, which causes * the generic signal handling code to take over, which will effectively * deliver a EXC_BAD_INSTRUCTION to the user process. */ return KERN_FAILURE; } /* * If we hit this trap unrelated to a return probe, we're * just here to reset the AST flag since we deferred a signal * until after we logically single-stepped the instruction we * copied out. */ if (ret == 0) { if (regs64) { regs64->isf.rip = npc; } else { regs32->eip = npc; } return KERN_SUCCESS; } /* * We need to wait until after we've called the * dtrace_return_probe_ptr function pointer to set %pc. */ rwp = &CPU->cpu_ft_lock; lck_rw_lock_shared(rwp); if (dtrace_return_probe_ptr != NULL) (void) (*dtrace_return_probe_ptr)(regs); lck_rw_unlock_shared(rwp); if (regs64) { regs64->isf.rip = npc; } else { regs32->eip = npc; } return KERN_SUCCESS; } else if (trapno == T_INT3) { uint8_t instr; rwp = &CPU->cpu_ft_lock; /* * The DTrace fasttrap provider uses the breakpoint trap * (int 3). We let DTrace take the first crack at handling * this trap; if it's not a probe that DTrace knowns about, * we call into the trap() routine to handle it like a * breakpoint placed by a conventional debugger. */ /* * APPLE NOTE: I believe the purpose of the reader/writers lock * is thus: There are times which dtrace needs to prevent calling * dtrace_pid_probe_ptr(). Sun's original impl grabbed a plain * mutex here. However, that serialized all probe calls, and * destroyed MP behavior. So now they use a RW lock, with probes * as readers, and the top level synchronization as a writer. */ lck_rw_lock_shared(rwp); if (dtrace_pid_probe_ptr != NULL && (*dtrace_pid_probe_ptr)(regs) == 0) { lck_rw_unlock_shared(rwp); return KERN_SUCCESS; } lck_rw_unlock_shared(rwp); /* * If the instruction that caused the breakpoint trap doesn't * look like an int 3 anymore, it may be that this tracepoint * was removed just after the user thread executed it. In * that case, return to user land to retry the instuction. */ user_addr_t pc = (regs64) ? regs64->isf.rip : (user_addr_t)regs32->eip; if (fuword8(pc - 1, &instr) == 0 && instr != FASTTRAP_INSTR) { if (regs64) { regs64->isf.rip--; } else { regs32->eip--; } return KERN_SUCCESS; } } return KERN_FAILURE; }