Example #1
void
unix_syscall_return(int error)
{
    thread_act_t		thread;
	volatile int *rval;
	struct i386_saved_state *regs;
	struct proc *p;
	struct proc *current_proc();
	unsigned short code;
	vm_offset_t params;
	struct sysent *callp;
	extern int nsysent;

    thread = current_act();
    rval = (int *)get_bsduthreadrval(thread);
	p = current_proc();

	regs = USER_REGS(thread);

	/* reconstruct code for tracing before blasting eax */
	code = regs->eax;
	params = (vm_offset_t) ((caddr_t)regs->uesp + sizeof (int));
	callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
	if (callp == sysent) {
	  code = fuword(params);
	}

	if (error == ERESTART) {
		regs->eip -= 7;
	}
	else if (error != EJUSTRETURN) {
		if (error) {
		    regs->eax = error;
		    regs->efl |= EFL_CF;	/* carry bit */
		} else { /* (not error) */
		    regs->eax = rval[0];
		    regs->edx = rval[1];
		    regs->efl &= ~EFL_CF;
		} 
	}

	ktrsysret(p, code, error, rval[0], callp->sy_funnel);

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
		error, rval[0], rval[1], 0, 0);

	if (callp->sy_funnel != NO_FUNNEL)
    		(void) thread_funnel_set(current_thread()->funnel_lock, FALSE);

    thread_exception_return();
    /* NOTREACHED */
}
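
A standalone sketch (not kernel code) of the return convention this function implements: on failure the errno value goes in eax with the carry flag set; on success rval[0]/rval[1] go in eax:edx with carry cleared; ERESTART backs the PC up over the user-mode syscall sequence (7 bytes, the value taken from the code above). Treating EFL_CF as bit 0 and the kernel-private ERESTART/EJUSTRETURN values as -1/-2 are assumptions here, renamed to avoid clashing with userland errno.h:

#include <stdio.h>
#include <errno.h>

#define EFL_CF		0x1	/* carry flag: the "error" bit for BSD syscalls */
#define K_ERESTART	(-1)	/* assumed kernel value: back up PC, re-issue syscall */
#define K_EJUSTRETURN	(-2)	/* assumed kernel value: leave registers untouched */

struct fake_regs { unsigned int eax, edx, eip, efl; };

static void
syscall_return(struct fake_regs *r, int error, const int rval[2])
{
	if (error == K_ERESTART) {
		r->eip -= 7;		/* length of the user-mode syscall sequence */
	} else if (error != K_EJUSTRETURN) {
		if (error) {
			r->eax = error;		/* errno goes in eax ... */
			r->efl |= EFL_CF;	/* ... with carry set to signal failure */
		} else {
			r->eax = rval[0];	/* results split across eax:edx so a */
			r->edx = rval[1];	/* 64-bit return matches the C ABI */
			r->efl &= ~EFL_CF;
		}
	}
}

int
main(void)
{
	struct fake_regs r = { 0, 0, 0x1000, 0 };
	int rv[2] = { 42, 0 };

	syscall_return(&r, 0, rv);
	printf("success: eax=%u edx=%u cf=%u\n", r.eax, r.edx, r.efl & EFL_CF);
	syscall_return(&r, EACCES, rv);
	printf("failure: eax=%u cf=%u\n", r.eax, r.efl & EFL_CF);
	return 0;
}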
Example #2
static int
sd_callback3(proc_t p, void * args)
{
	struct sd_iterargs * sd = (struct sd_iterargs *)args;
	vfs_context_t ctx = vfs_context_current();

	int setsdstate = sd->setsdstate;

	proc_lock(p);
	p->p_shutdownstate = setsdstate;
	if (p->p_stat != SZOMB) {
	       /*
		* NOTE: following code ignores sig_lock and plays
		* with exit_thread correctly.  This is OK unless we
		* are a multiprocessor, in which case I do not
		* understand the sig_lock.  This needs to be fixed.
		* XXX
		*/
		if (p->exit_thread) {	/* someone already doing it */
			proc_unlock(p);
			/* give him a chance */
			thread_block(THREAD_CONTINUE_NULL);
		} else {
			p->exit_thread = current_thread();
			printf(".");

			sd_log(ctx, "%s[%d] had to be forced closed with exit1().\n", p->p_comm, p->p_pid);

			proc_unlock(p);
			KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_FRCEXIT) | DBG_FUNC_NONE,
					      p->p_pid, 0, 1, 0, 0);
			sd->activecount++;
			exit1(p, 1, (int *)NULL);
		}
	} else {
		proc_unlock(p);
	}

	return PROC_RETURNED;
}
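
sd_callback3 is written against the proc_iterate() callback contract: it is invoked once per process and returns PROC_RETURNED to keep the walk going. A minimal standalone sketch of that contract, using local stand-ins (fake_proc, iterate, a zombie encoded as stat == 0) rather than the kernel's real types:

#include <stdio.h>

#define PROC_RETURNED		0	/* keep iterating */
#define PROC_RETURNED_DONE	1	/* stop the walk */

struct fake_proc { int pid; int stat; };

typedef int (*iter_fn)(struct fake_proc *p, void *arg);

static void
iterate(struct fake_proc *list, int n, iter_fn cb, void *arg)
{
	for (int i = 0; i < n; i++)
		if (cb(&list[i], arg) == PROC_RETURNED_DONE)
			break;
}

static int
count_live(struct fake_proc *p, void *arg)
{
	if (p->stat != 0)	/* skip "zombies" (stat == 0 in this toy model) */
		(*(int *)arg)++;
	return PROC_RETURNED;
}

int
main(void)
{
	struct fake_proc procs[] = { { 1, 1 }, { 2, 0 }, { 3, 1 } };
	int live = 0;

	iterate(procs, 3, count_live, &live);
	printf("live=%d\n", live);	/* prints live=2 */
	return 0;
}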
Example #3
/*
 * Function:	unix_syscall
 *
 * Inputs:	regs	- pointer to i386 save area
 *
 * Outputs:	none
 */
void
unix_syscall(x86_saved_state_t *state)
{
	thread_t		thread;
	void			*vt;
	unsigned int		code;
	struct sysent		*callp;

	int			error;
	vm_offset_t		params;
	struct proc		*p;
	struct uthread		*uthread;
	x86_saved_state32_t	*regs;
	boolean_t		is_vfork;

	assert(is_saved_state32(state));
	regs = saved_state32(state);
#if DEBUG
	if (regs->eax == 0x800)
		thread_exception_return();
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	/* Get the appropriate proc; may be different from task's for vfork() */
	is_vfork = uthread->uu_flag & UT_VFORK;
	if (__improbable(is_vfork != 0))
		p = current_proc();
	else 
		p = (struct proc *)get_bsdtask_info(current_task());

	/* Verify that we are not being called from a task without a proc */
	if (__improbable(p == NULL)) {
		regs->eax = EPERM;
		regs->efl |= EFL_CF;
		task_terminate_internal(current_task());
		thread_exception_return();
		/* NOTREACHED */
	}

	code = regs->eax & I386_SYSCALL_NUMBER_MASK;
	DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
							  code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip);
	params = (vm_offset_t) (regs->uesp + sizeof (int));

	regs->efl &= ~(EFL_CF);

	callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

	if (__improbable(callp == sysent)) {
		code = fuword(params);
		params += sizeof(int);
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
	}

	vt = (void *)uthread->uu_arg;

	if (callp->sy_arg_bytes != 0) {
#if CONFIG_REQUIRES_U32_MUNGING
		sy_munge_t	*mungerp;
#else
#error U32 syscalls on x86_64 kernel requires munging
#endif
		uint32_t	 nargs;

		assert((unsigned) callp->sy_arg_bytes <= sizeof (uthread->uu_arg));
		nargs = callp->sy_arg_bytes;
		error = copyin((user_addr_t) params, (char *) vt, nargs);
		if (error) {
			regs->eax = error;
			regs->efl |= EFL_CF;
			thread_exception_return();
			/* NOTREACHED */
		}

		if (__probable(code != 180)) {	/* 180 == SYS_kdebug_trace: don't trace the trace syscall itself */
	        	int *ip = (int *)vt;

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
				BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
				*ip, *(ip+1), *(ip+2), *(ip+3), 0);
		}

#if CONFIG_REQUIRES_U32_MUNGING
		mungerp = callp->sy_arg_munge32;

		if (mungerp != NULL)
			(*mungerp)(vt);
#endif
	} else
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			0, 0, 0, 0, 0);

	/*
	 * Delayed binding of thread credential to process credential, if we
	 * are not running with an explicitly set thread credential.
	 */
	kauth_cred_uthread_update(uthread, p);

	uthread->uu_rval[0] = 0;
	uthread->uu_rval[1] = 0;
	uthread->uu_flag |= UT_NOTCANCELPT;
	uthread->syscall_code = code;

#ifdef JOE_DEBUG
        uthread->uu_iocount = 0;
        uthread->uu_vpindex = 0;
#endif

	AUDIT_SYSCALL_ENTER(code, p, uthread);
	error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0]));
	AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#ifdef JOE_DEBUG
        if (uthread->uu_iocount)
                printf("system call returned with uu_iocount != 0\n");
#endif
#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

	if (__improbable(error == ERESTART)) {
		/*
		 * Move the user's pc back to repeat the syscall:
		 * 5 bytes for a sysenter, or 2 for an int 8x.
		 * The SYSENTER_TF_CS covers single-stepping over a sysenter
		 * - see debug trap handler in idt.s/idt64.s
		 */

		pal_syscall_restart(thread, state);
	}
	else if (error != EJUSTRETURN) {
		if (__improbable(error)) {
		    regs->eax = error;
		    regs->efl |= EFL_CF;	/* carry bit */
		} else { /* (not error) */
			/*
			 * We split retval across two registers, in case the
			 * syscall had a 64-bit return value, in which case
			 * eax/edx matches the function call ABI.
			 */
		    regs->eax = uthread->uu_rval[0];
		    regs->edx = uthread->uu_rval[1];
		} 
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall: error=%d retval=(%u,%u)\n",
		error, regs->eax, regs->edx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;

	if (__improbable(uthread->uu_lowpri_window)) {
	        /*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	if (__probable(code != 180))
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) {
		pal_execve_return(thread);
	}

	thread_exception_return();
	/* NOTREACHED */
}
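
The dispatch idiom above (clamp out-of-range codes, treat sysent[0] as the indirect syscall(2) entry that re-reads the real number from arg0) is easy to model in isolation. A standalone sketch; that the real table's slot 63 resolves to an enosys-style handler is an assumption about its layout, and the toy table below simply clamps to its last slot:

#include <stdio.h>

#define NUM_SYSENT 4
struct sysent { const char *name; } sysent[NUM_SYSENT] = {
	{ "indirect-syscall" }, { "exit" }, { "fork" }, { "enosys" }
};
/* the real table clamps to slot 63; this toy table clamps to its last slot */
#define CLAMP_SLOT (NUM_SYSENT - 1)

static struct sysent *
dispatch(unsigned code, const unsigned *args)
{
	struct sysent *callp =
	    (code >= NUM_SYSENT) ? &sysent[CLAMP_SLOT] : &sysent[code];

	if (callp == sysent) {		/* indirect: real number is arg0 */
		code = args[0];
		callp = (code >= NUM_SYSENT) ? &sysent[CLAMP_SLOT] : &sysent[code];
	}
	return callp;
}

int
main(void)
{
	unsigned args[] = { 2 };	/* i.e. syscall(SYS_fork) */

	printf("%s\n", dispatch(0, args)->name);	/* fork */
	printf("%s\n", dispatch(99, args)->name);	/* enosys */
	return 0;
}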
Example #4
void
unix_syscall_return(int error)
{
	thread_t		thread;
	struct uthread		*uthread;
	struct proc *p;
	unsigned int code;
	struct sysent *callp;

	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	pal_register_cache_state(thread, DIRTY);

	p = current_proc();

	if (proc_is64bit(p)) {
		x86_saved_state64_t *regs;

		regs = saved_state64(find_user_regs(thread));

		code = uthread->syscall_code;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall)
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (error == ERESTART) {
			/*
			 * repeat the syscall
			 */
			pal_syscall_restart( thread, find_user_regs(thread) );
		}
		else if (error != EJUSTRETURN) {
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;	/* carry bit */
			} else { /* (not error) */

				switch (callp->sy_return_type) {
				case _SYSCALL_RET_INT_T:
					regs->rax = uthread->uu_rval[0];
					regs->rdx = uthread->uu_rval[1];
					break;
				case _SYSCALL_RET_UINT_T:
					regs->rax = ((u_int)uthread->uu_rval[0]);
					regs->rdx = ((u_int)uthread->uu_rval[1]);
					break;
				case _SYSCALL_RET_OFF_T:
				case _SYSCALL_RET_ADDR_T:
				case _SYSCALL_RET_SIZE_T:
				case _SYSCALL_RET_SSIZE_T:
				case _SYSCALL_RET_UINT64_T:
					regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
					regs->rdx = 0;
					break;
				case _SYSCALL_RET_NONE:
					break;
				default:
					panic("unix_syscall: unknown return type");
					break;
				}
				regs->isf.rflags &= ~EFL_CF;
			} 
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%llu,%llu)\n",
			error, regs->rax, regs->rdx);
	} else {
		x86_saved_state32_t	*regs;

		regs = saved_state32(find_user_regs(thread));

		regs->efl &= ~(EFL_CF);

		code = uthread->syscall_code;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall)
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (error == ERESTART) {
			pal_syscall_restart( thread, find_user_regs(thread) );
		}
		else if (error != EJUSTRETURN) {
			if (error) {
				regs->eax = error;
				regs->efl |= EFL_CF;	/* carry bit */
			} else { /* (not error) */
				regs->eax = uthread->uu_rval[0];
				regs->edx = uthread->uu_rval[1];
			} 
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%u,%u)\n",
			error, regs->eax, regs->edx);
	}


	uthread->uu_flag &= ~UT_NOTCANCELPT;

	if (uthread->uu_lowpri_window) {
	        /*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	if (code != 180)	/* 180 == SYS_kdebug_trace */
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	thread_exception_return();
	/* NOTREACHED */
}
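
The switch on callp->sy_return_type decides how the two 32-bit uu_rval words land in rax/rdx: either as two independent results, or reinterpreted as one 64-bit value. A standalone sketch of those two cases, with local enum names standing in for the _SYSCALL_RET_* constants:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

enum ret_type { RET_INT_T, RET_UINT64_T, RET_NONE };

struct fake_regs64 { uint64_t rax, rdx; };

static void
set_retval(struct fake_regs64 *r, enum ret_type t, const int rval[2])
{
	switch (t) {
	case RET_INT_T:			/* two independent 32-bit results */
		r->rax = (uint64_t)rval[0];
		r->rdx = (uint64_t)rval[1];
		break;
	case RET_UINT64_T: {		/* one 64-bit result stored across both words */
		uint64_t v;
		memcpy(&v, rval, sizeof v);	/* same reinterpretation as the cast above */
		r->rax = v;
		r->rdx = 0;
		break;
	}
	case RET_NONE:
		break;
	}
}

int
main(void)
{
	int rv[2];
	uint64_t big = 0x123456789abcdef0ULL;
	struct fake_regs64 r = { 0, 0 };

	memcpy(rv, &big, sizeof rv);
	set_retval(&r, RET_UINT64_T, rv);
	printf("rax=%llx\n", (unsigned long long)r.rax);
	return 0;
}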
Example #5
void
unix_syscall64(x86_saved_state_t *state)
{
	thread_t	thread;
	void			*vt;
	unsigned int	code;
	struct sysent	*callp;
	int		args_in_regs;
	boolean_t	args_start_at_rdi;
	int		error;
	struct proc	*p;
	struct uthread	*uthread;
	x86_saved_state64_t *regs;

	assert(is_saved_state64(state));
	regs = saved_state64(state);
#if	DEBUG
	if (regs->rax == 0x2000800)
		thread_exception_return();
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	/* Get the appropriate proc; may be different from task's for vfork() */
	if (__probable(!(uthread->uu_flag & UT_VFORK)))
		p = (struct proc *)get_bsdtask_info(current_task());
	else 
		p = current_proc();

	/* Verify that we are not being called from a task without a proc */
	if (__improbable(p == NULL)) {
		regs->rax = EPERM;
		regs->isf.rflags |= EFL_CF;
		task_terminate_internal(current_task());
		thread_exception_return();
		/* NOTREACHED */
	}

	code = regs->rax & SYSCALL_NUMBER_MASK;
	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: code=%d(%s) rip=%llx\n",
		code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip);
	callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

	vt = (void *)uthread->uu_arg;

	if (__improbable(callp == sysent)) {
	        /*
		 * indirect system call... system call number
		 * passed as 'arg0'
		 */
		code = regs->rdi;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
		args_start_at_rdi = FALSE;
		args_in_regs = 5;
	} else {
		args_start_at_rdi = TRUE;
		args_in_regs = 6;
	}

	if (callp->sy_narg != 0) {
		assert(callp->sy_narg <= 8); /* size of uu_arg */

		args_in_regs = MIN(args_in_regs, callp->sy_narg);
		memcpy(vt, args_start_at_rdi ? &regs->rdi : &regs->rsi, args_in_regs * sizeof(syscall_arg_t));


		if (code != 180) {	/* 180 == SYS_kdebug_trace */
			uint64_t *ip = (uint64_t *)vt;

			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
				BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
				(int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0);
		}

		if (__improbable(callp->sy_narg > args_in_regs)) {
			int copyin_count;

			copyin_count = (callp->sy_narg - args_in_regs) * sizeof(syscall_arg_t);

			error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&uthread->uu_arg[args_in_regs], copyin_count);
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;
				thread_exception_return();
				/* NOTREACHED */
			}
		}
	} else
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			0, 0, 0, 0, 0);

	/*
	 * Delayed binding of thread credential to process credential, if we
	 * are not running with an explicitly set thread credential.
	 */
	kauth_cred_uthread_update(uthread, p);

	uthread->uu_rval[0] = 0;
	uthread->uu_rval[1] = 0;
	uthread->uu_flag |= UT_NOTCANCELPT;
	uthread->syscall_code = code;

#ifdef JOE_DEBUG
        uthread->uu_iocount = 0;
        uthread->uu_vpindex = 0;
#endif

	AUDIT_SYSCALL_ENTER(code, p, uthread);
	error = (*(callp->sy_call))((void *) p, vt, &(uthread->uu_rval[0]));
	AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#ifdef JOE_DEBUG
        if (uthread->uu_iocount)
               printf("system call returned with uu_iocount != 0\n");
#endif

#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */
	
	if (__improbable(error == ERESTART)) {
		/*
		 * All system calls come through via the syscall instruction
		 * in 64-bit mode... it's 2 bytes in length.
		 * Move the user's pc back to repeat the syscall:
		 */
		pal_syscall_restart( thread, state );
	}
	else if (error != EJUSTRETURN) {
		if (__improbable(error)) {
			regs->rax = error;
			regs->isf.rflags |= EFL_CF;	/* carry bit */
		} else { /* (not error) */

			switch (callp->sy_return_type) {
			case _SYSCALL_RET_INT_T:
				regs->rax = uthread->uu_rval[0];
				regs->rdx = uthread->uu_rval[1];
				break;
			case _SYSCALL_RET_UINT_T:
				regs->rax = ((u_int)uthread->uu_rval[0]);
				regs->rdx = ((u_int)uthread->uu_rval[1]);
				break;
			case _SYSCALL_RET_OFF_T:
			case _SYSCALL_RET_ADDR_T:
			case _SYSCALL_RET_SIZE_T:
			case _SYSCALL_RET_SSIZE_T:
			case _SYSCALL_RET_UINT64_T:
			        regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
				regs->rdx = 0;
				break;
			case _SYSCALL_RET_NONE:
				break;
			default:
				panic("unix_syscall: unknown return type");
				break;
			}
			regs->isf.rflags &= ~EFL_CF;
		} 
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: error=%d retval=(%llu,%llu)\n",
		error, regs->rax, regs->rdx);
	
	uthread->uu_flag &= ~UT_NOTCANCELPT;

	if (__improbable(uthread->uu_lowpri_window)) {
	        /*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(1);
	}
	if (__probable(code != 180))
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
			BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	thread_exception_return();
	/* NOTREACHED */
}
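
Argument gathering in unix_syscall64: up to six arguments arrive in registers (five for the indirect form, since rdi carries the syscall number), and anything beyond that is copied from the user stack just above the return address. A standalone sketch, with memcpy standing in for both the register grab and the copyin():

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

static void
gather_args(uint64_t *uu_arg, int narg,
    const uint64_t *regs6, int args_in_regs,
    const uint64_t *user_stack /* just past the return address */)
{
	int from_regs = MIN(args_in_regs, narg);

	memcpy(uu_arg, regs6, from_regs * sizeof(uint64_t));
	if (narg > from_regs)		/* spill: copyin() in the kernel */
		memcpy(&uu_arg[from_regs], user_stack,
		    (narg - from_regs) * sizeof(uint64_t));
}

int
main(void)
{
	uint64_t regs6[6] = { 1, 2, 3, 4, 5, 6 };
	uint64_t stack[2] = { 7, 8 };
	uint64_t args[8];

	gather_args(args, 8, regs6, 6, stack);
	for (int i = 0; i < 8; i++)
		printf("%llu ", (unsigned long long)args[i]);
	printf("\n");			/* 1 2 3 4 5 6 7 8 */
	return 0;
}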
Example #6
int
msync_nocancel(__unused proc_t p, struct msync_nocancel_args *uap, __unused register_t *retval)
{
	mach_vm_offset_t addr;
	mach_vm_size_t size;
	int flags;
	vm_map_t user_map;
	int rv;
	vm_sync_t sync_flags=0;

	addr = (mach_vm_offset_t) uap->addr;
	size = (mach_vm_size_t)uap->len;

	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_msync) | DBG_FUNC_NONE), (uint32_t)(addr >> 32), (uint32_t)(size >> 32), 0, 0, 0);

	if (addr & PAGE_MASK_64) {
		/* UNIX SPEC: user address is not page-aligned, return EINVAL */
		return EINVAL;
	}
	if (size == 0) {
		/*
		 * We cannot support this properly without maintaining a
		 * list of all mmaps done. We cannot use vm_map_entry, as they
		 * could be split or coalesced by independent actions. So
		 * instead of returning inaccurate results, let's just return
		 * an error for the invalid size specified.
		 */
		return (EINVAL); /* XXX breaks posix apps */
	}

	flags = uap->flags;
	/* disallow contradictory flags */
	if ((flags & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))
		return (EINVAL);

	if (flags & MS_KILLPAGES)
	        sync_flags |= VM_SYNC_KILLPAGES;
	if (flags & MS_DEACTIVATE)
	        sync_flags |= VM_SYNC_DEACTIVATE;
	if (flags & MS_INVALIDATE)
	        sync_flags |= VM_SYNC_INVALIDATE;

	if ( !(flags & (MS_KILLPAGES | MS_DEACTIVATE))) {
	        if (flags & MS_ASYNC) 
		        sync_flags |= VM_SYNC_ASYNCHRONOUS;
		else 
		        sync_flags |= VM_SYNC_SYNCHRONOUS;
	}

	sync_flags |= VM_SYNC_CONTIGUOUS;	/* complain if holes */

	user_map = current_map();
	rv = mach_vm_msync(user_map, addr, size, sync_flags);

	switch (rv) {
	case KERN_SUCCESS:
		break;
	case KERN_INVALID_ADDRESS:	/* hole in region being sync'ed */
		return (ENOMEM);
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
	return (0);
}
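
For contrast, the userland view of this path: msync(2) wants a page-aligned address and a nonzero length, and the switch on mach_vm_msync() above is where EINVAL, ENOMEM, and EIO come from. A minimal runnable example (the /tmp path is arbitrary):

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int
main(void)
{
	int fd = open("/tmp/msync_demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0 || ftruncate(fd, 4096) != 0) {
		perror("setup");
		return 1;
	}
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memcpy(p, "hello", 5);			/* dirty the shared page */
	if (msync(p, 4096, MS_SYNC) != 0)	/* page-aligned addr, nonzero len */
		perror("msync");		/* EINVAL/ENOMEM/EIO per the switch above */
	munmap(p, 4096);
	close(fd);
	return 0;
}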
Example #7
/*
 * XXX Internally, we use VM_PROT_* somewhat interchangeably, but the correct
 * XXX usage is PROT_* from an interface perspective.  Thus the values of
 * XXX VM_PROT_* and PROT_* need to correspond.
 */
int
mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval)
{
	/*
	 *	Map in special device (must be SHARED) or file
	 */
	struct fileproc *fp;
	register struct		vnode *vp;
	int			flags;
	int			prot, file_prot;
	int			err=0;
	vm_map_t		user_map;
	kern_return_t		result;
	mach_vm_offset_t	user_addr;
	mach_vm_size_t		user_size;
	vm_object_offset_t	pageoff;
	vm_object_offset_t	file_pos;
	int			alloc_flags=0;
	boolean_t		docow;
	vm_prot_t		maxprot;
	void 			*handle;
	vm_pager_t		pager;
	int 			mapanon=0;
	int 			fpref=0;
	int error =0;
	int fd = uap->fd;

	user_addr = (mach_vm_offset_t)uap->addr;
	user_size = (mach_vm_size_t) uap->len;

	AUDIT_ARG(addr, user_addr);
	AUDIT_ARG(len, user_size);
	AUDIT_ARG(fd, uap->fd);

	prot = (uap->prot & VM_PROT_ALL);
#if 3777787
	/*
	 * Since the hardware currently does not support writing without
	 * read-before-write, or execution-without-read, if the request is
	 * for write or execute access, we must imply read access as well;
	 * otherwise programs expecting this to work will fail to operate.
	 */
	if (prot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
		prot |= VM_PROT_READ;
#endif	/* radar 3777787 */

	flags = uap->flags;
	vp = NULLVP;

	/*
	 * The vm code does not have prototypes & the compiler doesn't do
	 * the right thing when you cast a 64-bit value and pass it in a
	 * function call. So here it is.
	 */
	file_pos = (vm_object_offset_t)uap->pos;


	/* make sure mapping fits into numeric range etc */
	if (file_pos + user_size > (vm_object_offset_t)-PAGE_SIZE_64)
		return (EINVAL);

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (file_pos & PAGE_MASK);
	file_pos -= (vm_object_offset_t)pageoff;


	/* Adjust size for rounding (on both ends). */
	user_size += pageoff;			/* low end... */
	user_size = mach_vm_round_page(user_size);	/* hi end */


	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & MAP_FIXED) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		user_addr -= pageoff;
		if (user_addr & PAGE_MASK)
		return (EINVAL);
	}
#ifdef notyet
	/* Do not have APIs to get this info; need to wait till then. */
	/*
	 * XXX for non-fixed mappings where no hint is provided or
	 * the hint would fall in the potential heap space,
	 * place it after the end of the largest possible heap.
	 *
	 * There should really be a pmap call to determine a reasonable
	 * location.
	 */
	else if (addr < mach_vm_round_page(p->p_vmspace->vm_daddr + MAXDSIZ))
		addr = mach_vm_round_page(p->p_vmspace->vm_daddr + MAXDSIZ);

#endif

	alloc_flags = 0;

	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.  Use positive fds as the alias
		 * value for memory tracking. 
		 */
		if (fd != -1) {
			/*
			 * Use "fd" to pass (some) Mach VM allocation flags,
			 * (see the VM_FLAGS_* definitions).
			 */
			alloc_flags = fd & (VM_FLAGS_ALIAS_MASK |
					    VM_FLAGS_PURGABLE);
			if (alloc_flags != fd) {
				/* reject if there are any extra flags */
				return EINVAL;
			}
		}
			
		handle = NULL;
		maxprot = VM_PROT_ALL;
		file_pos = 0;
		mapanon = 1;
	} else {
		struct vnode_attr va;
		vfs_context_t ctx = vfs_context_current();

		/*
		 * Mapping file, get fp for validation. Obtain vnode and make
		 * sure it is of appropriate type.
		 */
		err = fp_lookup(p, fd, &fp, 0);
		if (err)
			return(err);
		fpref = 1;
		if(fp->f_fglob->fg_type == DTYPE_PSXSHM) {
			uap->addr = (user_addr_t)user_addr;
			uap->len = (user_size_t)user_size;
			uap->prot = prot;
			uap->flags = flags;
			uap->pos = file_pos;
			error = pshm_mmap(p, uap, retval, fp, (off_t)pageoff);
			goto bad;
		}

		if (fp->f_fglob->fg_type != DTYPE_VNODE) {
			error = EINVAL;
			goto bad;
		}
		vp = (struct vnode *)fp->f_fglob->fg_data;
		error = vnode_getwithref(vp);
		if(error != 0)
			goto bad;

		if (vp->v_type != VREG && vp->v_type != VCHR) {
			(void)vnode_put(vp);
			error = EINVAL;
			goto bad;
		}

		AUDIT_ARG(vnpath, vp, ARG_VNODE1);
		
		/*
		 * POSIX: mmap needs to update access time for mapped files
		 */
		if ((vnode_vfsvisflags(vp) & MNT_NOATIME) == 0) {
			VATTR_INIT(&va);
			nanotime(&va.va_access_time);
			VATTR_SET_ACTIVE(&va, va_access_time);
			vnode_setattr(vp, &va, ctx);
		}
		
		/*
		 * XXX hack to handle use of /dev/zero to map anon memory (ala
		 * SunOS).
		 */
		if (vp->v_type == VCHR || vp->v_type == VSTR) {
			(void)vnode_put(vp);
			error = ENODEV;
			goto bad;
		} else {
			/*
			 * Ensure that file and memory protections are
			 * compatible.  Note that we only worry about
			 * writability if mapping is shared; in this case,
			 * current and max prot are dictated by the open file.
			 * XXX use the vnode instead?  Problem is: what
			 * credentials do we use for determination? What if
			 * proc does a setuid?
			 */
			maxprot = VM_PROT_EXECUTE;	/* ??? */
			if (fp->f_fglob->fg_flag & FREAD)
				maxprot |= VM_PROT_READ;
			else if (prot & PROT_READ) {
				(void)vnode_put(vp);
				error = EACCES;
				goto bad;
			}
			/*
			 * If we are sharing potential changes (either via
			 * MAP_SHARED or via the implicit sharing of character
			 * device mappings), and we are trying to get write
			 * permission although we opened it without asking
			 * for it, bail out. 
			 */

			if ((flags & MAP_SHARED) != 0) {
				if ((fp->f_fglob->fg_flag & FWRITE) != 0) {
 					/*
 					 * check for write access
 					 *
 					 * Note that we already made this check when granting FWRITE
 					 * against the file, so it seems redundant here.
 					 */
 					error = vnode_authorize(vp, NULL, KAUTH_VNODE_CHECKIMMUTABLE, ctx);
 
 					/* if not granted for any reason, but we wanted it, bad */
 					if ((prot & PROT_WRITE) && (error != 0)) {
 						vnode_put(vp);
  						goto bad;
  					}
 
 					/* if writable, remember */
 					if (error == 0)
  						maxprot |= VM_PROT_WRITE;

				} else if ((prot & PROT_WRITE) != 0) {
					(void)vnode_put(vp);
					error = EACCES;
					goto bad;
				}
			} else
				maxprot |= VM_PROT_WRITE;

			handle = (void *)vp;
#if CONFIG_MACF
			error = mac_file_check_mmap(vfs_context_ucred(ctx),
			    fp->f_fglob, prot, flags, &maxprot);
			if (error) {
				(void)vnode_put(vp);
				goto bad;
			}
#endif /* MAC */
		}
	}

	if (user_size == 0)  {
		if (!mapanon)
			(void)vnode_put(vp);
		error = 0;
		goto bad;
	}

	/*
	 *	We bend a little - round the start and end addresses
	 *	to the nearest page boundary.
	 */
	user_size = mach_vm_round_page(user_size);

	if (file_pos & PAGE_MASK_64) {
		if (!mapanon)
			(void)vnode_put(vp);
		error = EINVAL;
		goto bad;
	}

	user_map = current_map();

	if ((flags & MAP_FIXED) == 0) {
		alloc_flags |= VM_FLAGS_ANYWHERE;
		user_addr = mach_vm_round_page(user_addr);
	} else {
		if (user_addr != mach_vm_trunc_page(user_addr)) {
		        if (!mapanon)
			        (void)vnode_put(vp);
			error = EINVAL;
			goto bad;
		}
		/*
		 * mmap(MAP_FIXED) will replace any existing mappings in the
		 * specified range, if the new mapping is successful.
		 * If we just deallocate the specified address range here,
		 * another thread might jump in and allocate memory in that
		 * range before we get a chance to establish the new mapping,
		 * and we won't have a chance to restore the old mappings.
		 * So we use VM_FLAGS_OVERWRITE to let Mach VM know that it
		 * has to deallocate the existing mappings and establish the
		 * new ones atomically.
		 */
		alloc_flags |= VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE;
	}

	if (flags & MAP_NOCACHE)
		alloc_flags |= VM_FLAGS_NO_CACHE;

	/*
	 * Lookup/allocate object.
	 */
	if (handle == NULL) {
		pager = NULL;
#ifdef notyet
/* Hmm .. */
#if defined(VM_PROT_READ_IS_EXEC)
		if (prot & VM_PROT_READ)
			prot |= VM_PROT_EXECUTE;
		if (maxprot & VM_PROT_READ)
			maxprot |= VM_PROT_EXECUTE;
#endif
#endif

#if 3777787
		if (prot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
			prot |= VM_PROT_READ;
		if (maxprot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
			maxprot |= VM_PROT_READ;
#endif	/* radar 3777787 */

		result = vm_map_enter_mem_object(user_map,
						 &user_addr, user_size,
						 0, alloc_flags,
						 IPC_PORT_NULL, 0, FALSE,
						 prot, maxprot,
						 (flags & MAP_SHARED) ?
						 VM_INHERIT_SHARE : 
						 VM_INHERIT_DEFAULT);
		if (result != KERN_SUCCESS) 
				goto out;
	} else {
		pager = (vm_pager_t)ubc_getpager(vp);
		
		if (pager == NULL) {
			(void)vnode_put(vp);
			error = ENOMEM;
			goto bad;
		}

		/*
		 *  Set credentials:
		 *	FIXME: if we're writing the file we need a way to
		 *      ensure that someone doesn't replace our R/W creds
		 * 	with ones that only work for read.
		 */

		ubc_setthreadcred(vp, p, current_thread());
		docow = FALSE;
		if ((flags & (MAP_ANON|MAP_SHARED)) == 0) {
			docow = TRUE;
		}

#ifdef notyet
/* Hmm .. */
#if defined(VM_PROT_READ_IS_EXEC)
		if (prot & VM_PROT_READ)
			prot |= VM_PROT_EXECUTE;
		if (maxprot & VM_PROT_READ)
			maxprot |= VM_PROT_EXECUTE;
#endif
#endif /* notyet */

#if 3777787
		if (prot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
			prot |= VM_PROT_READ;
		if (maxprot & (VM_PROT_EXECUTE | VM_PROT_WRITE))
			maxprot |= VM_PROT_READ;
#endif	/* radar 3777787 */

		result = vm_map_enter_mem_object(user_map,
						 &user_addr, user_size,
						 0, alloc_flags,
						 (ipc_port_t)pager, file_pos,
						 docow, prot, maxprot, 
						 (flags & MAP_SHARED) ?
						 VM_INHERIT_SHARE : 
						 VM_INHERIT_DEFAULT);

		if (result != KERN_SUCCESS)  {
				(void)vnode_put(vp);
				goto out;
		}

		file_prot = prot & (PROT_READ | PROT_WRITE | PROT_EXEC);
		if (docow) {
			/* private mapping: won't write to the file */
			file_prot &= ~PROT_WRITE;
		}
		(void) ubc_map(vp, file_prot);
	}

	if (!mapanon)
		(void)vnode_put(vp);

out:
	switch (result) {
	case KERN_SUCCESS:
		*retval = user_addr + pageoff;
		error = 0;
		break;
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		error =  ENOMEM;
		break;
	case KERN_PROTECTION_FAILURE:
		error =  EACCES;
		break;
	default:
		error =  EINVAL;
		break;
	}
bad:
	if (fpref)
		fp_drop(p, fd, fp, 0);

	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_mmap) | DBG_FUNC_NONE), fd, (uint32_t)(*retval), (uint32_t)user_size, error, 0);
	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO2, SYS_mmap) | DBG_FUNC_NONE), (uint32_t)(*retval >> 32), (uint32_t)(user_size >> 32),
			      (uint32_t)(file_pos >> 32), (uint32_t)file_pos, 0);

	return(error);
}
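
A userland counterpart of the MAP_ANON branch above: with fd == -1 the kernel takes the handle == NULL path, while a non-negative fd can (XNU-specifically) smuggle VM_FLAGS_* bits through, as the code notes. A minimal anonymous-mapping example:

#include <stdio.h>
#include <sys/mman.h>

int
main(void)
{
	/* anonymous, private mapping: the handle == NULL path in the kernel */
	void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("mapped 1 MiB at %p\n", p);
	munmap(p, 1 << 20);
	return 0;
}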
Example #8
void
unix_syscall(struct i386_saved_state *regs)
{
    thread_act_t		thread;
    void	*vt; 
    unsigned short	code;
    struct sysent		*callp;
	int	nargs, error;
	volatile int *rval;
	int funnel_type;
    vm_offset_t		params;
    extern int nsysent;
	struct proc *p;
	struct proc *current_proc();

    thread = current_act();
    p = current_proc();
    rval = (int *)get_bsduthreadrval(thread);

    //printf("[scall : eax %x]",  regs->eax);
    code = regs->eax;
    params = (vm_offset_t) ((caddr_t)regs->uesp + sizeof (int));
    callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
    if (callp == sysent) {
	code = fuword(params);
	params += sizeof (int);
	callp = (code >= nsysent) ? &sysent[63] : &sysent[code];
    }
    
    vt = get_bsduthreadarg(thread);

    if ((nargs = (callp->sy_narg * sizeof (int))) &&
	    (error = copyin((char *) params, (char *)vt , nargs)) != 0) {
	regs->eax = error;
	regs->efl |= EFL_CF;
	thread_exception_return();
	/* NOTREACHED */
    }
    
    rval[0] = 0;
    rval[1] = regs->edx;

	funnel_type = callp->sy_funnel;
	if(funnel_type == KERNEL_FUNNEL)
		(void) thread_funnel_set(kernel_flock, TRUE);
	else if (funnel_type == NETWORK_FUNNEL)
		(void) thread_funnel_set(network_flock, TRUE);
	
   set_bsduthreadargs(thread, regs, NULL);

    if (callp->sy_narg > 8)
	panic("unix_syscall max arg count exceeded (%d)", callp->sy_narg);

	ktrsyscall(p, code, callp->sy_narg, vt, funnel_type);

	{ 
	  int *ip = (int *)vt;
	  KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
	      *ip, *(ip+1), *(ip+2), *(ip+3), 0);
	}

    error = (*(callp->sy_call))(p, (void *) vt, (int *) &rval[0]);
	
#if 0
	/* May be needed with vfork changes */
	regs = USER_REGS(thread);
#endif
	if (error == ERESTART) {
		regs->eip -= 7;
	}
	else if (error != EJUSTRETURN) {
		if (error) {
		    regs->eax = error;
		    regs->efl |= EFL_CF;	/* carry bit */
		} else { /* (not error) */
		    regs->eax = rval[0];
		    regs->edx = rval[1];
		    regs->efl &= ~EFL_CF;
		} 
	}

	ktrsysret(p, code, error, rval[0], funnel_type);

	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
		error, rval[0], rval[1], 0, 0);

	if(funnel_type != NO_FUNNEL)
    		(void) thread_funnel_set(current_thread()->funnel_lock, FALSE);

    thread_exception_return();
    /* NOTREACHED */
}
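
This older, pre-SMP-scaling variant serializes BSD code with funnels: one global lock held for the whole syscall and dropped on the way out. A standalone sketch of that discipline, with pthread mutexes standing in for kernel_flock/network_flock (the funnel-type constants here are local stand-ins):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t kernel_flock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t network_flock = PTHREAD_MUTEX_INITIALIZER;

enum funnel_type { NO_FUNNEL, KERNEL_FUNNEL, NETWORK_FUNNEL };

static void
run_syscall(enum funnel_type funnel, void (*sy_call)(void))
{
	pthread_mutex_t *f =
	    (funnel == KERNEL_FUNNEL)  ? &kernel_flock :
	    (funnel == NETWORK_FUNNEL) ? &network_flock : NULL;

	if (f)
		pthread_mutex_lock(f);		/* thread_funnel_set(..., TRUE) */
	sy_call();				/* the actual handler */
	if (f)
		pthread_mutex_unlock(f);	/* thread_funnel_set(..., FALSE) */
}

static void
handler(void)
{
	printf("in handler under the funnel\n");
}

int
main(void)
{
	run_syscall(KERNEL_FUNNEL, handler);
	return 0;
}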
Example #9
int
ptrace(struct proc *p, struct ptrace_args *uap, int32_t *retval)
{
	struct proc *t = current_proc();	/* target process */
	task_t		task;
	thread_t	th_act;
	struct uthread 	*ut;
	int tr_sigexc = 0;
	int error = 0;
	int stopped = 0;

	AUDIT_ARG(cmd, uap->req);
	AUDIT_ARG(pid, uap->pid);
	AUDIT_ARG(addr, uap->addr);
	AUDIT_ARG(value32, uap->data);

	if (uap->req == PT_DENY_ATTACH) {
		proc_lock(p);
		if (ISSET(p->p_lflag, P_LTRACED)) {
			proc_unlock(p);
			KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_FRCEXIT) | DBG_FUNC_NONE,
					      p->p_pid, W_EXITCODE(ENOTSUP, 0), 4, 0, 0);
			exit1(p, W_EXITCODE(ENOTSUP, 0), retval);

			thread_exception_return();
			/* NOTREACHED */
		}
		SET(p->p_lflag, P_LNOATTACH);
		proc_unlock(p);

		return(0);
	}

	if (uap->req == PT_FORCEQUOTA) {
		if (kauth_cred_issuser(kauth_cred_get())) {
			OSBitOrAtomic(P_FORCEQUOTA, &t->p_flag);
			return (0);
		} else
			return (EPERM);
	}

	/*
	 *	Intercept and deal with "please trace me" request.
	 */	 
	if (uap->req == PT_TRACE_ME) {
retry_trace_me:;
		proc_t pproc = proc_parent(p);
		if (pproc == NULL)
			return (EINVAL);
#if CONFIG_MACF
		/*
		 * NB: Cannot call kauth_authorize_process(..., KAUTH_PROCESS_CANTRACE, ...)
		 *     since that assumes the process being checked is the current process
		 *     when, in this case, it is the current process's parent.
		 *     Most of the other checks in cantrace() don't apply either.
		 */
		if ((error = mac_proc_check_debug(pproc, p)) == 0) {
#endif
			proc_lock(p);
			/* Make sure the process wasn't re-parented. */
			if (p->p_ppid != pproc->p_pid) {
				proc_unlock(p);
				proc_rele(pproc);
				goto retry_trace_me;
			}
			SET(p->p_lflag, P_LTRACED);
			/* Non-attached case, our tracer is our parent. */
			p->p_oppid = p->p_ppid;
			proc_unlock(p);
			/* Child and parent will have to be able to run modified code. */
			cs_allow_invalid(p);
			cs_allow_invalid(pproc);
#if CONFIG_MACF
		}
#endif
		proc_rele(pproc);
		return (error);
	}
	if (uap->req == PT_SIGEXC) {
		proc_lock(p);
		if (ISSET(p->p_lflag, P_LTRACED)) {
			SET(p->p_lflag, P_LSIGEXC);
			proc_unlock(p);
			return(0);
		} else {
			proc_unlock(p);
			return(EINVAL);
		}
	}

	/* 
	 * We do not want ptrace to do anything with the kernel or launchd
	 */
	if (uap->pid < 2) {
		return(EPERM);
	}

	/*
	 *	Locate victim, and make sure it is traceable.
	 */
	if ((t = proc_find(uap->pid)) == NULL)
			return (ESRCH);

	AUDIT_ARG(process, t);

	task = t->task;
	if (uap->req == PT_ATTACHEXC) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
		uap->req = PT_ATTACH;
		tr_sigexc = 1;
	}
	if (uap->req == PT_ATTACH) {
#pragma clang diagnostic pop
		int		err;


		if ( kauth_authorize_process(proc_ucred(p), KAUTH_PROCESS_CANTRACE, 
									 t, (uintptr_t)&err, 0, 0) == 0 ) {
			/* it's OK to attach */
			proc_lock(t);
			SET(t->p_lflag, P_LTRACED);
			if (tr_sigexc) 
				SET(t->p_lflag, P_LSIGEXC);
	
			t->p_oppid = t->p_ppid;
			/* Check whether child and parent are allowed to run modified
			 * code (they'll have to) */
			proc_unlock(t);
			cs_allow_invalid(t);
			cs_allow_invalid(p);
			if (t->p_pptr != p)
				proc_reparentlocked(t, p, 1, 0);
	
			proc_lock(t);
			if (get_task_userstop(task) > 0 ) {
				stopped = 1;
			}
			t->p_xstat = 0;
			proc_unlock(t);
			psignal(t, SIGSTOP);
			/*
			 * If the process was stopped, wake up and run through
			 * issignal() again to properly connect to the tracing
			 * process.
			 */
			if (stopped)
				task_resume(task);       
			error = 0;
			goto out;
		}
		else {
			/* not allowed to attach, proper error code returned by kauth_authorize_process */
			if (ISSET(t->p_lflag, P_LNOATTACH)) {
				psignal(p, SIGSEGV);
			}
			
			error = err;
			goto out;
		}
	}

	/*
	 * You can't do what you want to the process if:
	 *	(1) It's not being traced at all,
	 */
	proc_lock(t);
	if (!ISSET(t->p_lflag, P_LTRACED)) {
		proc_unlock(t);
		error = EPERM;
		goto out;
	}

	/*
	 *	(2) it's not being traced by _you_, or
	 */
	if (t->p_pptr != p) {
		proc_unlock(t);
		error = EBUSY;
		goto out;
	}

	/*
	 *	(3) it's not currently stopped.
	 */
	if (t->p_stat != SSTOP) {
		proc_unlock(t);
		error = EBUSY;
		goto out;
	}

	/*
	 *	Mach version of ptrace executes request directly here,
	 *	thus simplifying the interaction of ptrace and signals.
	 */
	/* proc lock is held here */
	switch (uap->req) {

	case PT_DETACH:
		if (t->p_oppid != t->p_ppid) {
			struct proc *pp;

			proc_unlock(t);
			pp = proc_find(t->p_oppid);
			if (pp != PROC_NULL) {
				proc_reparentlocked(t, pp, 1, 0);
				proc_rele(pp);
			} else {
				/* original parent exited while traced */
				proc_list_lock();
				t->p_listflag |= P_LIST_DEADPARENT;
				proc_list_unlock();
				proc_reparentlocked(t, initproc, 1, 0);
			}
			proc_lock(t);
		}

		t->p_oppid = 0;
		CLR(t->p_lflag, P_LTRACED);
		CLR(t->p_lflag, P_LSIGEXC);
		proc_unlock(t);
		goto resume;
		
	case PT_KILL:
		/*
		 *	Tell child process to kill itself after it
		 *	is resumed by adding NSIG to p_cursig. [see issig]
		 */
		proc_unlock(t);
#if CONFIG_MACF
		error = mac_proc_check_signal(p, t, SIGKILL);
		if (0 != error)
			goto resume;
#endif
		psignal(t, SIGKILL);
		goto resume;

	case PT_STEP:			/* single step the child */
	case PT_CONTINUE:		/* continue the child */
		proc_unlock(t);
		th_act = (thread_t)get_firstthread(task);
		if (th_act == THREAD_NULL) {
			error = EINVAL;
			goto out;
		}

		/* force use of Mach SPIs (and task_for_pid security checks) to adjust PC */
		if (uap->addr != (user_addr_t)1) {
			error = ENOTSUP;
			goto out;
		}

		if ((unsigned)uap->data >= NSIG) {
			error = EINVAL;
			goto out;
		}

		if (uap->data != 0) {
#if CONFIG_MACF
			error = mac_proc_check_signal(p, t, uap->data);
			if (0 != error)
				goto out;
#endif
			psignal(t, uap->data);
		}

		if (uap->req == PT_STEP) {
		        /*
			 * set trace bit 
			 * we use sending SIGSTOP as a comparable security check.
			 */
#if CONFIG_MACF
			error = mac_proc_check_signal(p, t, SIGSTOP);
			if (0 != error) {
				goto out;
			}
#endif
			if (thread_setsinglestep(th_act, 1) != KERN_SUCCESS) {
				error = ENOTSUP;
				goto out;
			}
		} else {
		        /*
			 * clear trace bit if on
			 * we use sending SIGCONT as a comparable security check.
			 */
#if CONFIG_MACF
			error = mac_proc_check_signal(p, t, SIGCONT);
			if (0 != error) {
				goto out;
			}
#endif
			if (thread_setsinglestep(th_act, 0) != KERN_SUCCESS) {
				error = ENOTSUP;
				goto out;
			}
		}	
	resume:
		proc_lock(t);
		t->p_xstat = uap->data;
		t->p_stat = SRUN;
		if (t->sigwait) {
			wakeup((caddr_t)&(t->sigwait));
			proc_unlock(t);
			if ((t->p_lflag & P_LSIGEXC) == 0) {
				task_resume(task);
			}
		} else
			proc_unlock(t);
			
		break;
		
	case PT_THUPDATE:  {
		proc_unlock(t);
		if ((unsigned)uap->data >= NSIG) {
			error = EINVAL;
			goto out;
		}
		th_act = port_name_to_thread(CAST_MACH_PORT_TO_NAME(uap->addr));
		if (th_act == THREAD_NULL) {
			error = ESRCH;
			goto out;
		}
		ut = (uthread_t)get_bsdthread_info(th_act);
		if (uap->data)
			ut->uu_siglist |= sigmask(uap->data);
		proc_lock(t);
		t->p_xstat = uap->data;
		t->p_stat = SRUN;
		proc_unlock(t);
		thread_deallocate(th_act);
		error = 0;
		}
		break;
	default:
		proc_unlock(t);
		error = EINVAL;
		goto out;
	}

	error = 0;
out:
	proc_rele(t);
	return(error);
}
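
The PT_DENY_ATTACH branch at the top of this function is reachable from userland with a single call. A minimal sketch, macOS-specific (ptrace(2) is deprecated there, so expect a compiler warning); after it succeeds, an attach attempt, or an already-attached tracer, triggers the forced exit path shown above:

#include <stdio.h>
#include <sys/types.h>
#include <sys/ptrace.h>

int
main(void)
{
	if (ptrace(PT_DENY_ATTACH, 0, 0, 0) != 0) {
		perror("ptrace(PT_DENY_ATTACH)");
		return 1;
	}
	printf("debugger attachment now denied\n");
	return 0;
}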