void
unix_syscall_return(int error)
{
    thread_act_t thread;
    volatile int *rval;
    struct i386_saved_state *regs;
    struct proc *p;
    struct proc *current_proc();
    unsigned short code;
    vm_offset_t params;
    struct sysent *callp;
    extern int nsysent;

    thread = current_act();
    rval = (int *)get_bsduthreadrval(thread);
    p = current_proc();

    regs = USER_REGS(thread);

    /* reconstruct code for tracing before blasting eax */
    code = regs->eax;
    params = (vm_offset_t) ((caddr_t)regs->uesp + sizeof (int));
    callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
    if (callp == sysent) {
        code = fuword(params);
    }

    if (error == ERESTART) {
        regs->eip -= 7;
    } else if (error != EJUSTRETURN) {
        if (error) {
            regs->eax = error;
            regs->efl |= EFL_CF;        /* carry bit */
        } else { /* (not error) */
            regs->eax = rval[0];
            regs->edx = rval[1];
            regs->efl &= ~EFL_CF;
        }
    }

    ktrsysret(p, code, error, rval[0], callp->sy_funnel);

    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
        error, rval[0], rval[1], 0, 0);

    if (callp->sy_funnel != NO_FUNNEL)
        (void) thread_funnel_set(current_thread()->funnel_lock, FALSE);

    thread_exception_return();
    /* NOTREACHED */
}
static int
sd_callback3(proc_t p, void * args)
{
    struct sd_iterargs * sd = (struct sd_iterargs *)args;
    vfs_context_t ctx = vfs_context_current();

    int setsdstate = sd->setsdstate;

    proc_lock(p);
    p->p_shutdownstate = setsdstate;
    if (p->p_stat != SZOMB) {
        /*
         * NOTE: following code ignores sig_lock and plays
         * with exit_thread correctly.  This is OK unless we
         * are a multiprocessor, in which case I do not
         * understand the sig_lock.  This needs to be fixed.
         * XXX
         */
        if (p->exit_thread) {   /* someone already doing it */
            proc_unlock(p);
            /* give him a chance */
            thread_block(THREAD_CONTINUE_NULL);
        } else {
            p->exit_thread = current_thread();
            printf(".");

            sd_log(ctx, "%s[%d] had to be forced closed with exit1().\n",
                p->p_comm, p->p_pid);
            proc_unlock(p);
            KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_FRCEXIT) | DBG_FUNC_NONE,
                p->p_pid, 0, 1, 0, 0);
            sd->activecount++;
            exit1(p, 1, (int *)NULL);
        }
    } else {
        proc_unlock(p);
    }

    return PROC_RETURNED;
}
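How a per-process callback like this gets driven is not shown here; as a rough sketch (assumptions: the proc_iterate() interface with the PROC_ALLPROCLIST flag, and a hypothetical force_exit_all() wrapper — the real shutdown path may drive it differently), it would look something like:

/*
 * Illustrative sketch only: drive sd_callback3 across every process.
 * force_exit_all() is a hypothetical helper, not part of the source above.
 */
static int
force_exit_all(int sdstate)
{
    struct sd_iterargs sd;

    bzero(&sd, sizeof(sd));
    sd.setsdstate = sdstate;    /* recorded in each proc's p_shutdownstate */

    /* Visit every process; sd_callback3 forces exit1() on the live ones. */
    proc_iterate(PROC_ALLPROCLIST, sd_callback3, (void *)&sd, NULL, NULL);

    return sd.activecount;      /* number of processes we had to force */
}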
/*
 * Function:    unix_syscall
 *
 * Inputs:      regs    - pointer to i386 save area
 *
 * Outputs:     none
 */
void
unix_syscall(x86_saved_state_t *state)
{
    thread_t thread;
    void *vt;
    unsigned int code;
    struct sysent *callp;

    int error;
    vm_offset_t params;
    struct proc *p;
    struct uthread *uthread;
    x86_saved_state32_t *regs;
    boolean_t is_vfork;

    assert(is_saved_state32(state));
    regs = saved_state32(state);
#if DEBUG
    if (regs->eax == 0x800)
        thread_exception_return();
#endif
    thread = current_thread();
    uthread = get_bsdthread_info(thread);

    /* Get the appropriate proc; may be different from task's for vfork() */
    is_vfork = uthread->uu_flag & UT_VFORK;
    if (__improbable(is_vfork != 0))
        p = current_proc();
    else
        p = (struct proc *)get_bsdtask_info(current_task());

    /* Verify that we are not being called from a task without a proc */
    if (__improbable(p == NULL)) {
        regs->eax = EPERM;
        regs->efl |= EFL_CF;
        task_terminate_internal(current_task());
        thread_exception_return();
        /* NOTREACHED */
    }

    code = regs->eax & I386_SYSCALL_NUMBER_MASK;
    DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
        code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip);
    params = (vm_offset_t) (regs->uesp + sizeof (int));

    regs->efl &= ~(EFL_CF);

    callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

    if (__improbable(callp == sysent)) {
        code = fuword(params);
        params += sizeof(int);
        callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
    }

    vt = (void *)uthread->uu_arg;

    if (callp->sy_arg_bytes != 0) {
#if CONFIG_REQUIRES_U32_MUNGING
        sy_munge_t *mungerp;
#else
#error U32 syscalls on x86_64 kernel requires munging
#endif
        uint32_t nargs;

        assert((unsigned) callp->sy_arg_bytes <= sizeof (uthread->uu_arg));
        nargs = callp->sy_arg_bytes;
        error = copyin((user_addr_t) params, (char *) vt, nargs);
        if (error) {
            regs->eax = error;
            regs->efl |= EFL_CF;
            thread_exception_return();
            /* NOTREACHED */
        }

        if (__probable(code != 180)) {
            int *ip = (int *)vt;

            KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
                *ip, *(ip+1), *(ip+2), *(ip+3), 0);
        }

#if CONFIG_REQUIRES_U32_MUNGING
        mungerp = callp->sy_arg_munge32;

        if (mungerp != NULL)
            (*mungerp)(vt);
#endif
    } else
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
            BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
            0, 0, 0, 0, 0);

    /*
     * Delayed binding of thread credential to process credential, if we
     * are not running with an explicitly set thread credential.
     */
    kauth_cred_uthread_update(uthread, p);

    uthread->uu_rval[0] = 0;
    uthread->uu_rval[1] = 0;
    uthread->uu_flag |= UT_NOTCANCELPT;
    uthread->syscall_code = code;

#ifdef JOE_DEBUG
    uthread->uu_iocount = 0;
    uthread->uu_vpindex = 0;
#endif

    AUDIT_SYSCALL_ENTER(code, p, uthread);
    error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0]));
    AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#ifdef JOE_DEBUG
    if (uthread->uu_iocount)
        printf("system call returned with uu_iocount != 0\n");
#endif
#if CONFIG_DTRACE
    uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

    if (__improbable(error == ERESTART)) {
        /*
         * Move the user's pc back to repeat the syscall:
         * 5 bytes for a sysenter, or 2 for an int 8x.
         * The SYSENTER_TF_CS covers single-stepping over a sysenter
         * - see debug trap handler in idt.s/idt64.s
         */
        pal_syscall_restart(thread, state);
    }
    else if (error != EJUSTRETURN) {
        if (__improbable(error)) {
            regs->eax = error;
            regs->efl |= EFL_CF;        /* carry bit */
        } else { /* (not error) */
            /*
             * We split retval across two registers, in case the
             * syscall had a 64-bit return value, in which case
             * eax/edx matches the function call ABI.
             */
            regs->eax = uthread->uu_rval[0];
            regs->edx = uthread->uu_rval[1];
        }
    }

    DEBUG_KPRINT_SYSCALL_UNIX(
        "unix_syscall: error=%d retval=(%u,%u)\n",
        error, regs->eax, regs->edx);

    uthread->uu_flag &= ~UT_NOTCANCELPT;

    if (__improbable(uthread->uu_lowpri_window)) {
        /*
         * task is marked as a low priority I/O type
         * and the I/O we issued while in this system call
         * collided with normal I/O operations... we'll
         * delay in order to mitigate the impact of this
         * task on the normal operation of the system
         */
        throttle_lowpri_io(1);
    }
    if (__probable(code != 180))
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
            BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
            error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

    if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) {
        pal_execve_return(thread);
    }

    thread_exception_return();
    /* NOTREACHED */
}
void
unix_syscall_return(int error)
{
    thread_t thread;
    struct uthread *uthread;
    struct proc *p;
    unsigned int code;
    struct sysent *callp;

    thread = current_thread();
    uthread = get_bsdthread_info(thread);

    pal_register_cache_state(thread, DIRTY);

    p = current_proc();

    if (proc_is64bit(p)) {
        x86_saved_state64_t *regs;

        regs = saved_state64(find_user_regs(thread));

        code = uthread->syscall_code;
        callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

#if CONFIG_DTRACE
        if (callp->sy_call == dtrace_systrace_syscall)
            dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
        AUDIT_SYSCALL_EXIT(code, p, uthread, error);

        if (error == ERESTART) {
            /*
             * repeat the syscall
             */
            pal_syscall_restart( thread, find_user_regs(thread) );
        }
        else if (error != EJUSTRETURN) {
            if (error) {
                regs->rax = error;
                regs->isf.rflags |= EFL_CF;     /* carry bit */
            } else { /* (not error) */

                switch (callp->sy_return_type) {
                case _SYSCALL_RET_INT_T:
                    regs->rax = uthread->uu_rval[0];
                    regs->rdx = uthread->uu_rval[1];
                    break;
                case _SYSCALL_RET_UINT_T:
                    regs->rax = ((u_int)uthread->uu_rval[0]);
                    regs->rdx = ((u_int)uthread->uu_rval[1]);
                    break;
                case _SYSCALL_RET_OFF_T:
                case _SYSCALL_RET_ADDR_T:
                case _SYSCALL_RET_SIZE_T:
                case _SYSCALL_RET_SSIZE_T:
                case _SYSCALL_RET_UINT64_T:
                    regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
                    regs->rdx = 0;
                    break;
                case _SYSCALL_RET_NONE:
                    break;
                default:
                    panic("unix_syscall: unknown return type");
                    break;
                }
                regs->isf.rflags &= ~EFL_CF;
            }
        }
        DEBUG_KPRINT_SYSCALL_UNIX(
            "unix_syscall_return: error=%d retval=(%llu,%llu)\n",
            error, regs->rax, regs->rdx);
    } else {
        x86_saved_state32_t *regs;

        regs = saved_state32(find_user_regs(thread));

        regs->efl &= ~(EFL_CF);

        code = uthread->syscall_code;
        callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

#if CONFIG_DTRACE
        if (callp->sy_call == dtrace_systrace_syscall)
            dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
        AUDIT_SYSCALL_EXIT(code, p, uthread, error);

        if (error == ERESTART) {
            pal_syscall_restart( thread, find_user_regs(thread) );
        }
        else if (error != EJUSTRETURN) {
            if (error) {
                regs->eax = error;
                regs->efl |= EFL_CF;    /* carry bit */
            } else { /* (not error) */
                regs->eax = uthread->uu_rval[0];
                regs->edx = uthread->uu_rval[1];
            }
        }
        DEBUG_KPRINT_SYSCALL_UNIX(
            "unix_syscall_return: error=%d retval=(%u,%u)\n",
            error, regs->eax, regs->edx);
    }

    uthread->uu_flag &= ~UT_NOTCANCELPT;

    if (uthread->uu_lowpri_window) {
        /*
         * task is marked as a low priority I/O type
         * and the I/O we issued while in this system call
         * collided with normal I/O operations... we'll
         * delay in order to mitigate the impact of this
         * task on the normal operation of the system
         */
        throttle_lowpri_io(1);
    }
    if (code != 180)
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
            BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
            error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

    thread_exception_return();
    /* NOTREACHED */
}
void
unix_syscall64(x86_saved_state_t *state)
{
    thread_t thread;
    void *vt;
    unsigned int code;
    struct sysent *callp;
    int args_in_regs;
    boolean_t args_start_at_rdi;
    int error;
    struct proc *p;
    struct uthread *uthread;
    x86_saved_state64_t *regs;

    assert(is_saved_state64(state));
    regs = saved_state64(state);
#if DEBUG
    if (regs->rax == 0x2000800)
        thread_exception_return();
#endif
    thread = current_thread();
    uthread = get_bsdthread_info(thread);

    /* Get the appropriate proc; may be different from task's for vfork() */
    if (__probable(!(uthread->uu_flag & UT_VFORK)))
        p = (struct proc *)get_bsdtask_info(current_task());
    else
        p = current_proc();

    /* Verify that we are not being called from a task without a proc */
    if (__improbable(p == NULL)) {
        regs->rax = EPERM;
        regs->isf.rflags |= EFL_CF;
        task_terminate_internal(current_task());
        thread_exception_return();
        /* NOTREACHED */
    }

    code = regs->rax & SYSCALL_NUMBER_MASK;
    DEBUG_KPRINT_SYSCALL_UNIX(
        "unix_syscall64: code=%d(%s) rip=%llx\n",
        code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip);
    callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

    vt = (void *)uthread->uu_arg;

    if (__improbable(callp == sysent)) {
        /*
         * indirect system call... system call number
         * passed as 'arg0'
         */
        code = regs->rdi;
        callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
        args_start_at_rdi = FALSE;
        args_in_regs = 5;
    } else {
        args_start_at_rdi = TRUE;
        args_in_regs = 6;
    }

    if (callp->sy_narg != 0) {
        assert(callp->sy_narg <= 8); /* size of uu_arg */

        args_in_regs = MIN(args_in_regs, callp->sy_narg);
        memcpy(vt, args_start_at_rdi ? &regs->rdi : &regs->rsi,
            args_in_regs * sizeof(syscall_arg_t));

        if (code != 180) {
            uint64_t *ip = (uint64_t *)vt;

            KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
                (int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0);
        }

        if (__improbable(callp->sy_narg > args_in_regs)) {
            int copyin_count;

            copyin_count = (callp->sy_narg - args_in_regs) * sizeof(syscall_arg_t);

            error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)),
                (char *)&uthread->uu_arg[args_in_regs], copyin_count);
            if (error) {
                regs->rax = error;
                regs->isf.rflags |= EFL_CF;
                thread_exception_return();
                /* NOTREACHED */
            }
        }
    } else
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
            BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
            0, 0, 0, 0, 0);

    /*
     * Delayed binding of thread credential to process credential, if we
     * are not running with an explicitly set thread credential.
     */
    kauth_cred_uthread_update(uthread, p);

    uthread->uu_rval[0] = 0;
    uthread->uu_rval[1] = 0;
    uthread->uu_flag |= UT_NOTCANCELPT;
    uthread->syscall_code = code;

#ifdef JOE_DEBUG
    uthread->uu_iocount = 0;
    uthread->uu_vpindex = 0;
#endif

    AUDIT_SYSCALL_ENTER(code, p, uthread);
    error = (*(callp->sy_call))((void *) p, vt, &(uthread->uu_rval[0]));
    AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#ifdef JOE_DEBUG
    if (uthread->uu_iocount)
        printf("system call returned with uu_iocount != 0\n");
#endif

#if CONFIG_DTRACE
    uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

    if (__improbable(error == ERESTART)) {
        /*
         * all system calls come through via the syscall instruction
         * in 64 bit mode... it's 2 bytes in length
         * move the user's pc back to repeat the syscall:
         */
        pal_syscall_restart( thread, state );
    }
    else if (error != EJUSTRETURN) {
        if (__improbable(error)) {
            regs->rax = error;
            regs->isf.rflags |= EFL_CF; /* carry bit */
        } else { /* (not error) */

            switch (callp->sy_return_type) {
            case _SYSCALL_RET_INT_T:
                regs->rax = uthread->uu_rval[0];
                regs->rdx = uthread->uu_rval[1];
                break;
            case _SYSCALL_RET_UINT_T:
                regs->rax = ((u_int)uthread->uu_rval[0]);
                regs->rdx = ((u_int)uthread->uu_rval[1]);
                break;
            case _SYSCALL_RET_OFF_T:
            case _SYSCALL_RET_ADDR_T:
            case _SYSCALL_RET_SIZE_T:
            case _SYSCALL_RET_SSIZE_T:
            case _SYSCALL_RET_UINT64_T:
                regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
                regs->rdx = 0;
                break;
            case _SYSCALL_RET_NONE:
                break;
            default:
                panic("unix_syscall: unknown return type");
                break;
            }
            regs->isf.rflags &= ~EFL_CF;
        }
    }
    DEBUG_KPRINT_SYSCALL_UNIX(
        "unix_syscall64: error=%d retval=(%llu,%llu)\n",
        error, regs->rax, regs->rdx);

    uthread->uu_flag &= ~UT_NOTCANCELPT;

    if (__improbable(uthread->uu_lowpri_window)) {
        /*
         * task is marked as a low priority I/O type
         * and the I/O we issued while in this system call
         * collided with normal I/O operations... we'll
         * delay in order to mitigate the impact of this
         * task on the normal operation of the system
         */
        throttle_lowpri_io(1);
    }
    if (__probable(code != 180))
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
            BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
            error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

    thread_exception_return();
    /* NOTREACHED */
}
int
msync_nocancel(__unused proc_t p, struct msync_nocancel_args *uap, __unused register_t *retval)
{
    mach_vm_offset_t addr;
    mach_vm_size_t size;
    int flags;
    vm_map_t user_map;
    int rv;
    vm_sync_t sync_flags = 0;

    addr = (mach_vm_offset_t) uap->addr;
    size = (mach_vm_size_t)uap->len;

    KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_msync) | DBG_FUNC_NONE),
        (uint32_t)(addr >> 32), (uint32_t)(size >> 32), 0, 0, 0);

    if (addr & PAGE_MASK_64) {
        /* UNIX SPEC: user address is not page-aligned, return EINVAL */
        return EINVAL;
    }
    if (size == 0) {
        /*
         * We cannot support this properly without maintaining a list of
         * all the mmaps done.  We cannot use vm_map_entry, since entries
         * could be split or coalesced by independent actions.  So instead
         * of returning inaccurate results, just return an error for the
         * invalid size specified.
         */
        return (EINVAL); /* XXX breaks posix apps */
    }

    flags = uap->flags;
    /* disallow contradictory flags */
    if ((flags & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))
        return (EINVAL);

    if (flags & MS_KILLPAGES)
        sync_flags |= VM_SYNC_KILLPAGES;
    if (flags & MS_DEACTIVATE)
        sync_flags |= VM_SYNC_DEACTIVATE;
    if (flags & MS_INVALIDATE)
        sync_flags |= VM_SYNC_INVALIDATE;

    if ( !(flags & (MS_KILLPAGES | MS_DEACTIVATE))) {
        if (flags & MS_ASYNC)
            sync_flags |= VM_SYNC_ASYNCHRONOUS;
        else
            sync_flags |= VM_SYNC_SYNCHRONOUS;
    }

    sync_flags |= VM_SYNC_CONTIGUOUS;   /* complain if holes */

    user_map = current_map();
    rv = mach_vm_msync(user_map, addr, size, sync_flags);

    switch (rv) {
    case KERN_SUCCESS:
        break;
    case KERN_INVALID_ADDRESS:  /* hole in region being sync'ed */
        return (ENOMEM);
    case KERN_FAILURE:
        return (EIO);
    default:
        return (EINVAL);
    }
    return (0);
}
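For reference, a minimal user-space sketch of the rules this handler enforces (the file path is hypothetical): the address must be page-aligned, the length non-zero, and MS_SYNC and MS_ASYNC are mutually exclusive.

/* Hypothetical user-space example exercising msync() flag validation. */
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
    int fd = open("/tmp/example.dat", O_RDWR | O_CREAT, 0644); /* hypothetical path */
    if (fd < 0)
        return 1;
    ftruncate(fd, 4096);

    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED)
        return 1;
    p[0] = 'x';

    if (msync(p, 4096, MS_SYNC) != 0)               /* aligned, non-zero length: accepted */
        perror("msync");
    if (msync(p, 4096, MS_SYNC | MS_ASYNC) == 0)    /* contradictory flags: EINVAL expected */
        printf("unexpected success\n");
    if (msync(p + 1, 4095, MS_ASYNC) == 0)          /* unaligned address: EINVAL expected */
        printf("unexpected success\n");

    munmap(p, 4096);
    close(fd);
    return 0;
}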
/*
 * XXX Internally, we use VM_PROT_* somewhat interchangeably, but the correct
 * XXX usage is PROT_* from an interface perspective.  Thus the values of
 * XXX VM_PROT_* and PROT_* need to correspond.
 */
int
mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval)
{
    /*
     *  Map in special device (must be SHARED) or file
     */
    struct fileproc *fp;
    register struct vnode *vp;
    int flags;
    int prot, file_prot;
    int err = 0;
    vm_map_t user_map;
    kern_return_t result;
    mach_vm_offset_t user_addr;
    mach_vm_size_t user_size;
    vm_object_offset_t pageoff;
    vm_object_offset_t file_pos;
    int alloc_flags = 0;
    boolean_t docow;
    vm_prot_t maxprot;
    void *handle;
    vm_pager_t pager;
    int mapanon = 0;
    int fpref = 0;
    int error = 0;
    int fd = uap->fd;

    user_addr = (mach_vm_offset_t)uap->addr;
    user_size = (mach_vm_size_t) uap->len;

    AUDIT_ARG(addr, user_addr);
    AUDIT_ARG(len, user_size);
    AUDIT_ARG(fd, uap->fd);

    prot = (uap->prot & VM_PROT_ALL);
#if 3777787
    /*
     * Since the hardware currently does not support writing without
     * read-before-write, or execution-without-read, if the request is
     * for write or execute access, we must imply read access as well;
     * otherwise programs expecting this to work will fail to operate.
     */
    if (prot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
        prot |= VM_PROT_READ;
#endif  /* radar 3777787 */

    flags = uap->flags;
    vp = NULLVP;

    /*
     * The vm code does not have prototypes & the compiler doesn't do
     * the right thing when you cast a 64-bit value and pass it in a
     * function call.  So here it is.
     */
    file_pos = (vm_object_offset_t)uap->pos;

    /* make sure mapping fits into numeric range etc */
    if (file_pos + user_size > (vm_object_offset_t)-PAGE_SIZE_64)
        return (EINVAL);

    /*
     * Align the file position to a page boundary,
     * and save its page offset component.
     */
    pageoff = (file_pos & PAGE_MASK);
    file_pos -= (vm_object_offset_t)pageoff;

    /* Adjust size for rounding (on both ends). */
    user_size += pageoff;                           /* low end... */
    user_size = mach_vm_round_page(user_size);      /* hi end */

    /*
     * Check for illegal addresses.  Watch out for address wrap... Note
     * that VM_*_ADDRESS are not constants due to casts (argh).
     */
    if (flags & MAP_FIXED) {
        /*
         * The specified address must have the same remainder
         * as the file offset taken modulo PAGE_SIZE, so it
         * should be aligned after adjustment by pageoff.
         */
        user_addr -= pageoff;
        if (user_addr & PAGE_MASK)
            return (EINVAL);
    }
#ifdef notyet
    /* DO not have apis to get this info, need to wait till then */
    /*
     * XXX for non-fixed mappings where no hint is provided or
     * the hint would fall in the potential heap space,
     * place it after the end of the largest possible heap.
     *
     * There should really be a pmap call to determine a reasonable
     * location.
     */
    else if (addr < mach_vm_round_page(p->p_vmspace->vm_daddr + MAXDSIZ))
        addr = mach_vm_round_page(p->p_vmspace->vm_daddr + MAXDSIZ);
#endif

    alloc_flags = 0;

    if (flags & MAP_ANON) {
        /*
         * Mapping blank space is trivial.  Use positive fds as the alias
         * value for memory tracking.
         */
        if (fd != -1) {
            /*
             * Use "fd" to pass (some) Mach VM allocation flags,
             * (see the VM_FLAGS_* definitions).
             */
            alloc_flags = fd & (VM_FLAGS_ALIAS_MASK | VM_FLAGS_PURGABLE);
            if (alloc_flags != fd) {
                /* reject if there are any extra flags */
                return EINVAL;
            }
        }

        handle = NULL;
        maxprot = VM_PROT_ALL;
        file_pos = 0;
        mapanon = 1;
    } else {
        struct vnode_attr va;
        vfs_context_t ctx = vfs_context_current();

        /*
         * Mapping file, get fp for validation. Obtain vnode and make
         * sure it is of appropriate type.
         */
        err = fp_lookup(p, fd, &fp, 0);
        if (err)
            return(err);
        fpref = 1;
        if (fp->f_fglob->fg_type == DTYPE_PSXSHM) {
            uap->addr = (user_addr_t)user_addr;
            uap->len = (user_size_t)user_size;
            uap->prot = prot;
            uap->flags = flags;
            uap->pos = file_pos;
            error = pshm_mmap(p, uap, retval, fp, (off_t)pageoff);
            goto bad;
        }

        if (fp->f_fglob->fg_type != DTYPE_VNODE) {
            error = EINVAL;
            goto bad;
        }
        vp = (struct vnode *)fp->f_fglob->fg_data;
        error = vnode_getwithref(vp);
        if (error != 0)
            goto bad;

        if (vp->v_type != VREG && vp->v_type != VCHR) {
            (void)vnode_put(vp);
            error = EINVAL;
            goto bad;
        }

        AUDIT_ARG(vnpath, vp, ARG_VNODE1);

        /*
         * POSIX: mmap needs to update access time for mapped files
         */
        if ((vnode_vfsvisflags(vp) & MNT_NOATIME) == 0) {
            VATTR_INIT(&va);
            nanotime(&va.va_access_time);
            VATTR_SET_ACTIVE(&va, va_access_time);
            vnode_setattr(vp, &va, ctx);
        }

        /*
         * XXX hack to handle use of /dev/zero to map anon memory (ala
         * SunOS).
         */
        if (vp->v_type == VCHR || vp->v_type == VSTR) {
            (void)vnode_put(vp);
            error = ENODEV;
            goto bad;
        } else {
            /*
             * Ensure that file and memory protections are
             * compatible.  Note that we only worry about
             * writability if mapping is shared; in this case,
             * current and max prot are dictated by the open file.
             * XXX use the vnode instead?  Problem is: what
             * credentials do we use for determination? What if
             * proc does a setuid?
             */
            maxprot = VM_PROT_EXECUTE;  /* ??? */
            if (fp->f_fglob->fg_flag & FREAD)
                maxprot |= VM_PROT_READ;
            else if (prot & PROT_READ) {
                (void)vnode_put(vp);
                error = EACCES;
                goto bad;
            }
            /*
             * If we are sharing potential changes (either via
             * MAP_SHARED or via the implicit sharing of character
             * device mappings), and we are trying to get write
             * permission although we opened it without asking
             * for it, bail out.
             */
            if ((flags & MAP_SHARED) != 0) {
                if ((fp->f_fglob->fg_flag & FWRITE) != 0) {
                    /*
                     * check for write access
                     *
                     * Note that we already made this check when granting FWRITE
                     * against the file, so it seems redundant here.
                     */
                    error = vnode_authorize(vp, NULL, KAUTH_VNODE_CHECKIMMUTABLE, ctx);

                    /* if not granted for any reason, but we wanted it, bad */
                    if ((prot & PROT_WRITE) && (error != 0)) {
                        vnode_put(vp);
                        goto bad;
                    }

                    /* if writable, remember */
                    if (error == 0)
                        maxprot |= VM_PROT_WRITE;

                } else if ((prot & PROT_WRITE) != 0) {
                    (void)vnode_put(vp);
                    error = EACCES;
                    goto bad;
                }
            } else
                maxprot |= VM_PROT_WRITE;

            handle = (void *)vp;
#if CONFIG_MACF
            error = mac_file_check_mmap(vfs_context_ucred(ctx),
                fp->f_fglob, prot, flags, &maxprot);
            if (error) {
                (void)vnode_put(vp);
                goto bad;
            }
#endif /* MAC */
        }
    }

    if (user_size == 0) {
        if (!mapanon)
            (void)vnode_put(vp);
        error = 0;
        goto bad;
    }

    /*
     *  We bend a little - round the start and end addresses
     *  to the nearest page boundary.
     */
    user_size = mach_vm_round_page(user_size);

    if (file_pos & PAGE_MASK_64) {
        if (!mapanon)
            (void)vnode_put(vp);
        error = EINVAL;
        goto bad;
    }

    user_map = current_map();

    if ((flags & MAP_FIXED) == 0) {
        alloc_flags |= VM_FLAGS_ANYWHERE;
        user_addr = mach_vm_round_page(user_addr);
    } else {
        if (user_addr != mach_vm_trunc_page(user_addr)) {
            if (!mapanon)
                (void)vnode_put(vp);
            error = EINVAL;
            goto bad;
        }
        /*
         * mmap(MAP_FIXED) will replace any existing mappings in the
         * specified range, if the new mapping is successful.
         * If we just deallocate the specified address range here,
         * another thread might jump in and allocate memory in that
         * range before we get a chance to establish the new mapping,
         * and we won't have a chance to restore the old mappings.
         * So we use VM_FLAGS_OVERWRITE to let Mach VM know that it
         * has to deallocate the existing mappings and establish the
         * new ones atomically.
         */
        alloc_flags |= VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE;
    }

    if (flags & MAP_NOCACHE)
        alloc_flags |= VM_FLAGS_NO_CACHE;

    /*
     * Lookup/allocate object.
     */
    if (handle == NULL) {
        pager = NULL;
#ifdef notyet
/* Hmm .. */
#if defined(VM_PROT_READ_IS_EXEC)
        if (prot & VM_PROT_READ)
            prot |= VM_PROT_EXECUTE;
        if (maxprot & VM_PROT_READ)
            maxprot |= VM_PROT_EXECUTE;
#endif
#endif

#if 3777787
        if (prot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
            prot |= VM_PROT_READ;
        if (maxprot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
            maxprot |= VM_PROT_READ;
#endif  /* radar 3777787 */

        result = vm_map_enter_mem_object(user_map,
                    &user_addr, user_size,
                    0, alloc_flags,
                    IPC_PORT_NULL, 0, FALSE,
                    prot, maxprot,
                    (flags & MAP_SHARED) ?
                    VM_INHERIT_SHARE :
                    VM_INHERIT_DEFAULT);
        if (result != KERN_SUCCESS)
            goto out;
    } else {
        pager = (vm_pager_t)ubc_getpager(vp);

        if (pager == NULL) {
            (void)vnode_put(vp);
            error = ENOMEM;
            goto bad;
        }

        /*
         *  Set credentials:
         *  FIXME: if we're writing the file we need a way to
         *      ensure that someone doesn't replace our R/W creds
         *      with ones that only work for read.
         */
        ubc_setthreadcred(vp, p, current_thread());
        docow = FALSE;
        if ((flags & (MAP_ANON|MAP_SHARED)) == 0) {
            docow = TRUE;
        }

#ifdef notyet
/* Hmm .. */
#if defined(VM_PROT_READ_IS_EXEC)
        if (prot & VM_PROT_READ)
            prot |= VM_PROT_EXECUTE;
        if (maxprot & VM_PROT_READ)
            maxprot |= VM_PROT_EXECUTE;
#endif
#endif /* notyet */

#if 3777787
        if (prot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
            prot |= VM_PROT_READ;
        if (maxprot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
            maxprot |= VM_PROT_READ;
#endif  /* radar 3777787 */

        result = vm_map_enter_mem_object(user_map,
                    &user_addr, user_size,
                    0, alloc_flags,
                    (ipc_port_t)pager, file_pos,
                    docow, prot, maxprot,
                    (flags & MAP_SHARED) ?
                    VM_INHERIT_SHARE :
                    VM_INHERIT_DEFAULT);

        if (result != KERN_SUCCESS) {
            (void)vnode_put(vp);
            goto out;
        }

        file_prot = prot & (PROT_READ | PROT_WRITE | PROT_EXEC);
        if (docow) {
            /* private mapping: won't write to the file */
            file_prot &= ~PROT_WRITE;
        }
        (void) ubc_map(vp, file_prot);
    }

    if (!mapanon)
        (void)vnode_put(vp);

out:
    switch (result) {
    case KERN_SUCCESS:
        *retval = user_addr + pageoff;
        error = 0;
        break;
    case KERN_INVALID_ADDRESS:
    case KERN_NO_SPACE:
        error = ENOMEM;
        break;
    case KERN_PROTECTION_FAILURE:
        error = EACCES;
        break;
    default:
        error = EINVAL;
        break;
    }
bad:
    if (fpref)
        fp_drop(p, fd, fp, 0);

    KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_mmap) | DBG_FUNC_NONE),
        fd, (uint32_t)(*retval), (uint32_t)user_size, error, 0);
    KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO2, SYS_mmap) | DBG_FUNC_NONE),
        (uint32_t)(*retval >> 32), (uint32_t)(user_size >> 32),
        (uint32_t)(file_pos >> 32), (uint32_t)file_pos, 0);

    return(error);
}
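As a user-space companion to the two branches above (anonymous vs. file-backed), here is a minimal sketch; the file path is hypothetical and error handling is trimmed. MAP_ANON takes fd -1 (or, on this kernel, Mach VM alias/purgable bits passed through fd), while a MAP_SHARED file mapping can only get PROT_WRITE if the descriptor was opened for writing.

/* Hypothetical user-space example: anonymous and file-backed mmap() paths. */
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
    /* Anonymous mapping: no vnode involved, fd is -1, offset must be 0. */
    void *anon = mmap(NULL, 16384, PROT_READ | PROT_WRITE,
        MAP_ANON | MAP_PRIVATE, -1, 0);

    /* Shared file-backed mapping: PROT_WRITE requires an fd opened O_RDWR. */
    int fd = open("/tmp/example.dat", O_RDWR | O_CREAT, 0644);    /* hypothetical path */
    ftruncate(fd, 16384);
    void *filemap = mmap(NULL, 16384, PROT_READ | PROT_WRITE,
        MAP_SHARED, fd, 0);

    if (anon != MAP_FAILED)
        munmap(anon, 16384);
    if (filemap != MAP_FAILED)
        munmap(filemap, 16384);
    close(fd);
    return 0;
}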
void
unix_syscall(struct i386_saved_state *regs)
{
    thread_act_t thread;
    void *vt;
    unsigned short code;
    struct sysent *callp;
    int nargs, error;
    volatile int *rval;
    int funnel_type;
    vm_offset_t params;
    extern int nsysent;
    struct proc *p;
    struct proc *current_proc();

    thread = current_act();
    p = current_proc();
    rval = (int *)get_bsduthreadrval(thread);

    //printf("[scall : eax %x]", regs->eax);
    code = regs->eax;
    params = (vm_offset_t) ((caddr_t)regs->uesp + sizeof (int));
    callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
    if (callp == sysent) {
        code = fuword(params);
        params += sizeof (int);
        callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
    }

    vt = get_bsduthreadarg(thread);

    if ((nargs = (callp->sy_narg * sizeof (int))) &&
        (error = copyin((char *) params, (char *)vt , nargs)) != 0) {
        regs->eax = error;
        regs->efl |= EFL_CF;
        thread_exception_return();
        /* NOTREACHED */
    }

    rval[0] = 0;
    rval[1] = regs->edx;

    funnel_type = callp->sy_funnel;
    if (funnel_type == KERNEL_FUNNEL)
        (void) thread_funnel_set(kernel_flock, TRUE);
    else if (funnel_type == NETWORK_FUNNEL)
        (void) thread_funnel_set(network_flock, TRUE);

    set_bsduthreadargs(thread, regs, NULL);

    if (callp->sy_narg > 8)
        panic("unix_syscall max arg count exceeded (%d)", callp->sy_narg);

    ktrsyscall(p, code, callp->sy_narg, vt, funnel_type);

    {
        int *ip = (int *)vt;
        KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
            *ip, *(ip+1), *(ip+2), *(ip+3), 0);
    }

    error = (*(callp->sy_call))(p, (void *) vt, (int *) &rval[0]);

#if 0
    /* May be needed with vfork changes */
    regs = USER_REGS(thread);
#endif
    if (error == ERESTART) {
        regs->eip -= 7;
    } else if (error != EJUSTRETURN) {
        if (error) {
            regs->eax = error;
            regs->efl |= EFL_CF;        /* carry bit */
        } else { /* (not error) */
            regs->eax = rval[0];
            regs->edx = rval[1];
            regs->efl &= ~EFL_CF;
        }
    }

    ktrsysret(p, code, error, rval[0], funnel_type);

    KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
        error, rval[0], rval[1], 0, 0);

    if (funnel_type != NO_FUNNEL)
        (void) thread_funnel_set(current_thread()->funnel_lock, FALSE);

    thread_exception_return();
    /* NOTREACHED */
}
int
ptrace(struct proc *p, struct ptrace_args *uap, int32_t *retval)
{
    struct proc *t = current_proc();    /* target process */
    task_t task;
    thread_t th_act;
    struct uthread *ut;
    int tr_sigexc = 0;
    int error = 0;
    int stopped = 0;

    AUDIT_ARG(cmd, uap->req);
    AUDIT_ARG(pid, uap->pid);
    AUDIT_ARG(addr, uap->addr);
    AUDIT_ARG(value32, uap->data);

    if (uap->req == PT_DENY_ATTACH) {
        proc_lock(p);
        if (ISSET(p->p_lflag, P_LTRACED)) {
            proc_unlock(p);
            KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_FRCEXIT) | DBG_FUNC_NONE,
                p->p_pid, W_EXITCODE(ENOTSUP, 0), 4, 0, 0);
            exit1(p, W_EXITCODE(ENOTSUP, 0), retval);

            thread_exception_return();
            /* NOTREACHED */
        }
        SET(p->p_lflag, P_LNOATTACH);
        proc_unlock(p);

        return(0);
    }

    if (uap->req == PT_FORCEQUOTA) {
        if (kauth_cred_issuser(kauth_cred_get())) {
            OSBitOrAtomic(P_FORCEQUOTA, &t->p_flag);
            return (0);
        } else
            return (EPERM);
    }

    /*
     *  Intercept and deal with "please trace me" request.
     */
    if (uap->req == PT_TRACE_ME) {
retry_trace_me:;
        proc_t pproc = proc_parent(p);
        if (pproc == NULL)
            return (EINVAL);
#if CONFIG_MACF
        /*
         * NB: Cannot call kauth_authorize_process(..., KAUTH_PROCESS_CANTRACE, ...)
         *     since that assumes the process being checked is the current process
         *     when, in this case, it is the current process's parent.
         *     Most of the other checks in cantrace() don't apply either.
         */
        if ((error = mac_proc_check_debug(pproc, p)) == 0) {
#endif
            proc_lock(p);
            /* Make sure the process wasn't re-parented. */
            if (p->p_ppid != pproc->p_pid) {
                proc_unlock(p);
                proc_rele(pproc);
                goto retry_trace_me;
            }
            SET(p->p_lflag, P_LTRACED);
            /* Non-attached case, our tracer is our parent. */
            p->p_oppid = p->p_ppid;
            proc_unlock(p);
            /* Child and parent will have to be able to run modified code. */
            cs_allow_invalid(p);
            cs_allow_invalid(pproc);
#if CONFIG_MACF
        }
#endif
        proc_rele(pproc);
        return (error);
    }
    if (uap->req == PT_SIGEXC) {
        proc_lock(p);
        if (ISSET(p->p_lflag, P_LTRACED)) {
            SET(p->p_lflag, P_LSIGEXC);
            proc_unlock(p);
            return(0);
        } else {
            proc_unlock(p);
            return(EINVAL);
        }
    }

    /*
     * We do not want ptrace to do anything with kernel or launchd
     */
    if (uap->pid < 2) {
        return(EPERM);
    }

    /*
     *  Locate victim, and make sure it is traceable.
     */
    if ((t = proc_find(uap->pid)) == NULL)
        return (ESRCH);

    AUDIT_ARG(process, t);

    task = t->task;
    if (uap->req == PT_ATTACHEXC) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
        uap->req = PT_ATTACH;
        tr_sigexc = 1;
    }
    if (uap->req == PT_ATTACH) {
#pragma clang diagnostic pop
        int err;

        if ( kauth_authorize_process(proc_ucred(p), KAUTH_PROCESS_CANTRACE,
                t, (uintptr_t)&err, 0, 0) == 0 ) {
            /* it's OK to attach */
            proc_lock(t);
            SET(t->p_lflag, P_LTRACED);
            if (tr_sigexc)
                SET(t->p_lflag, P_LSIGEXC);

            t->p_oppid = t->p_ppid;
            /* Check whether child and parent are allowed to run modified
             * code (they'll have to) */
            proc_unlock(t);
            cs_allow_invalid(t);
            cs_allow_invalid(p);
            if (t->p_pptr != p)
                proc_reparentlocked(t, p, 1, 0);

            proc_lock(t);
            if (get_task_userstop(task) > 0 ) {
                stopped = 1;
            }
            t->p_xstat = 0;
            proc_unlock(t);
            psignal(t, SIGSTOP);
            /*
             * If the process was stopped, wake up and run through
             * issignal() again to properly connect to the tracing
             * process.
             */
            if (stopped)
                task_resume(task);
            error = 0;
            goto out;
        }
        else {
            /* not allowed to attach, proper error code returned by kauth_authorize_process */
            if (ISSET(t->p_lflag, P_LNOATTACH)) {
                psignal(p, SIGSEGV);
            }

            error = err;
            goto out;
        }
    }

    /*
     * You can't do what you want to the process if:
     *  (1) It's not being traced at all,
     */
    proc_lock(t);
    if (!ISSET(t->p_lflag, P_LTRACED)) {
        proc_unlock(t);
        error = EPERM;
        goto out;
    }

    /*
     *  (2) it's not being traced by _you_, or
     */
    if (t->p_pptr != p) {
        proc_unlock(t);
        error = EBUSY;
        goto out;
    }

    /*
     *  (3) it's not currently stopped.
     */
    if (t->p_stat != SSTOP) {
        proc_unlock(t);
        error = EBUSY;
        goto out;
    }

    /*
     *  Mach version of ptrace executes request directly here,
     *  thus simplifying the interaction of ptrace and signals.
     */
    /* proc lock is held here */
    switch (uap->req) {

    case PT_DETACH:
        if (t->p_oppid != t->p_ppid) {
            struct proc *pp;

            proc_unlock(t);
            pp = proc_find(t->p_oppid);
            if (pp != PROC_NULL) {
                proc_reparentlocked(t, pp, 1, 0);
                proc_rele(pp);
            } else {
                /* original parent exited while traced */
                proc_list_lock();
                t->p_listflag |= P_LIST_DEADPARENT;
                proc_list_unlock();
                proc_reparentlocked(t, initproc, 1, 0);
            }
            proc_lock(t);
        }

        t->p_oppid = 0;
        CLR(t->p_lflag, P_LTRACED);
        CLR(t->p_lflag, P_LSIGEXC);
        proc_unlock(t);
        goto resume;

    case PT_KILL:
        /*
         *  Tell child process to kill itself after it
         *  is resumed by adding NSIG to p_cursig. [see issig]
         */
        proc_unlock(t);
#if CONFIG_MACF
        error = mac_proc_check_signal(p, t, SIGKILL);
        if (0 != error)
            goto resume;
#endif
        psignal(t, SIGKILL);
        goto resume;

    case PT_STEP:           /* single step the child */
    case PT_CONTINUE:       /* continue the child */
        proc_unlock(t);
        th_act = (thread_t)get_firstthread(task);
        if (th_act == THREAD_NULL) {
            error = EINVAL;
            goto out;
        }

        /* force use of Mach SPIs (and task_for_pid security checks) to adjust PC */
        if (uap->addr != (user_addr_t)1) {
            error = ENOTSUP;
            goto out;
        }

        if ((unsigned)uap->data >= NSIG) {
            error = EINVAL;
            goto out;
        }

        if (uap->data != 0) {
#if CONFIG_MACF
            error = mac_proc_check_signal(p, t, uap->data);
            if (0 != error)
                goto out;
#endif
            psignal(t, uap->data);
        }

        if (uap->req == PT_STEP) {
            /*
             * set trace bit
             * we use sending SIGSTOP as a comparable security check.
             */
#if CONFIG_MACF
            error = mac_proc_check_signal(p, t, SIGSTOP);
            if (0 != error) {
                goto out;
            }
#endif
            if (thread_setsinglestep(th_act, 1) != KERN_SUCCESS) {
                error = ENOTSUP;
                goto out;
            }
        } else {
            /*
             * clear trace bit if on
             * we use sending SIGCONT as a comparable security check.
             */
#if CONFIG_MACF
            error = mac_proc_check_signal(p, t, SIGCONT);
            if (0 != error) {
                goto out;
            }
#endif
            if (thread_setsinglestep(th_act, 0) != KERN_SUCCESS) {
                error = ENOTSUP;
                goto out;
            }
        }
resume:
        proc_lock(t);
        t->p_xstat = uap->data;
        t->p_stat = SRUN;
        if (t->sigwait) {
            wakeup((caddr_t)&(t->sigwait));
            proc_unlock(t);
            if ((t->p_lflag & P_LSIGEXC) == 0) {
                task_resume(task);
            }
        } else
            proc_unlock(t);

        break;

    case PT_THUPDATE: {
        proc_unlock(t);
        if ((unsigned)uap->data >= NSIG) {
            error = EINVAL;
            goto out;
        }
        th_act = port_name_to_thread(CAST_MACH_PORT_TO_NAME(uap->addr));
        if (th_act == THREAD_NULL) {
            error = ESRCH;
            goto out;
        }
        ut = (uthread_t)get_bsdthread_info(th_act);
        if (uap->data)
            ut->uu_siglist |= sigmask(uap->data);
        proc_lock(t);
        t->p_xstat = uap->data;
        t->p_stat = SRUN;
        proc_unlock(t);
        thread_deallocate(th_act);
        error = 0;
        }
        break;
    default:
        proc_unlock(t);
        error = EINVAL;
        goto out;
    }

    error = 0;
out:
    proc_rele(t);
    return(error);
}
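For illustration, a small user-space sketch of two of the requests handled above; the target binary path is hypothetical. PT_DENY_ATTACH marks the caller unattachable (or forces it to exit with ENOTSUP if it is already being traced), and PT_TRACE_ME is the "please trace me" path that sets P_LTRACED with the parent as tracer.

/* Hypothetical user-space example: PT_DENY_ATTACH and PT_TRACE_ME. */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
    /* Refuse future debugger attaches to this process (P_LNOATTACH). */
    if (ptrace(PT_DENY_ATTACH, 0, 0, 0) != 0)
        perror("ptrace(PT_DENY_ATTACH)");

    pid_t child = fork();
    if (child == 0) {
        /* Child asks to be traced by its parent, then execs a target. */
        ptrace(PT_TRACE_ME, 0, 0, 0);
        execl("/usr/bin/true", "true", (char *)NULL);   /* hypothetical target */
        _exit(1);
    }
    return 0;
}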