Пример #1
0
/*
 * Find the memory segment of 'vm' that contains guest physical address
 * 'gpa' and hand back its backing VM object.
 *
 * On a hit, '*offset' receives the distance of 'gpa' from the start of
 * the segment, '*object' receives the segment's object, and an extra
 * reference is taken on that object via vm_object_reference(); 0 is
 * returned.  Segments without a backing object are skipped.  If no
 * segment contains 'gpa', EINVAL is returned and the output arguments
 * are left untouched.
 *
 * Note: 'len' is accepted but is not consulted by the lookup.
 */
int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
	      vm_offset_t *offset, struct vm_object **object)
{
	vm_object_t backing;
	vm_paddr_t base;
	size_t extent;
	int idx;

	for (idx = 0; idx < vm->num_mem_segs; idx++) {
		backing = vm->mem_segs[idx].object;
		if (backing == NULL)
			continue;

		base = vm->mem_segs[idx].gpa;
		extent = vm->mem_segs[idx].len;

		/* Not inside [base, base + extent): try the next segment. */
		if (gpa < base || gpa >= base + extent)
			continue;

		*offset = gpa - base;
		*object = backing;
		vm_object_reference(backing);
		return (0);
	}

	return (EINVAL);
}
Пример #2
0
/*
 * Allocate (or look up) an OBJT_PHYS VM object.
 *
 * 'foff' must be page aligned, otherwise NULL is returned.  'size' is
 * rounded up to a whole number of pages.
 *
 * With a NULL 'handle' a fresh anonymous object is always allocated.
 * With a non-NULL 'handle' the pager's object list is searched first:
 * an existing object gains a reference and is grown if the requested
 * range extends past its current size; otherwise a new object is
 * allocated, tagged with the handle and appended to the list.
 *
 * 'prot' is not used by this routine.
 *
 * MPSAFE
 */
static vm_object_t
phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
		 vm_ooffset_t foff)
{
	vm_object_t object;

	/* The offset has to sit on a page boundary. */
	if ((foff & PAGE_MASK) != 0)
		return (NULL);

	size = round_page(size);

	/* Anonymous request: nothing to look up, nothing to serialise. */
	if (handle == NULL)
		return (vm_object_allocate(OBJT_PHYS,
		    OFF_TO_IDX(foff + size)));

	mtx_lock(&Giant);

	/*
	 * Serialise lookup-or-create so two callers cannot both create an
	 * object for the same handle.  A waiter marks the lock word with
	 * -1 so that the holder knows a wakeup is needed on release.
	 */
	while (phys_pager_alloc_lock != 0) {
		phys_pager_alloc_lock = -1;
		tsleep(&phys_pager_alloc_lock, PVM, "swpalc", 0);
	}
	phys_pager_alloc_lock = 1;

	object = vm_pager_object_lookup(&phys_pager_object_list, handle);
	if (object != NULL) {
		/* Already known: take a reference and grow if required. */
		vm_object_reference(object);
		if (object->size < OFF_TO_IDX(foff + size))
			object->size = OFF_TO_IDX(foff + size);
	} else {
		/* First use of this handle: create and register the object. */
		object = vm_object_allocate(OBJT_PHYS,
		    OFF_TO_IDX(foff + size));
		object->handle = handle;
		mtx_lock(&phys_pager_mtx);
		TAILQ_INSERT_TAIL(&phys_pager_object_list, object,
		    pager_object_list);
		mtx_unlock(&phys_pager_mtx);
	}

	/* Release the allocation lock, waking any sleeper that flagged it. */
	if (phys_pager_alloc_lock == -1)
		wakeup(&phys_pager_alloc_lock);
	phys_pager_alloc_lock = 0;
	mtx_unlock(&Giant);

	return (object);
}
Пример #3
0
/*
 * mmap() helper to validate mmap() requests against shm object state
 * and give mmap() the vm_object to use for the mapping.
 *
 * shmfd   - shared memory descriptor being mapped.
 * objsize - length of the requested mapping.
 * foff    - offset of the mapping within the shm object.
 * obj     - on success, receives shmfd->shm_object with an additional
 *           reference taken on it.
 *
 * Returns 0 on success, or EINVAL when the requested range does not lie
 * within the object (rounded up to a page boundary).  On success the
 * object's access time is updated.
 */
int
shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff,
    vm_object_t *obj)
{

	/*
	 * Validate the range without ever forming "foff + objsize":
	 * that sum can wrap (or, for a negative foff, be converted to a
	 * huge unsigned value that wraps back into range) and slip past
	 * the limit check.  Instead, reject negative offsets outright,
	 * require the offset to be inside the object, and compare the
	 * length against the space that remains.  Once
	 * 0 <= foff < shm_size holds, the subtraction below cannot go
	 * negative, so the mixed signed/unsigned comparison is safe.
	 * (If vm_ooffset_t is unsigned on this platform the "< 0" test is
	 * simply always false.)
	 */
	if (foff < 0 || foff >= shmfd->shm_size ||
	    objsize > round_page(shmfd->shm_size) - foff)
		return (EINVAL);

	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_atime);
	mtx_unlock(&shm_timestamp_lock);
	vm_object_reference(shmfd->shm_object);
	*obj = shmfd->shm_object;
	return (0);
}
Пример #4
0
/*
 *	kmem_submap:
 *
 *	Carve a region of 'size' bytes (rounded up to whole pages) out of
 *	'parent' and initialise 'map' as a submap covering it.  The
 *	resulting bounds are returned through 'min' and 'max'.  'pageable'
 *	is passed through to vm_map_setup().
 *
 *	Any failure to enter the region or to install the submap is
 *	fatal: the routine panics.
 */
void
kmem_submap(
	vm_map_t 	map,
	vm_map_t 	parent,
	vm_offset_t 	*min,
	vm_offset_t 	*max,
	vm_size_t 	size,
	boolean_t 	pageable)
{
	kern_return_t rc;
	vm_offset_t base;
	vm_offset_t limit;

	size = round_page(size);

	/*
	 *	The submap object is internal to the VM system, so
	 *	vm_object_enter (the usual source of the reference consumed
	 *	by vm_map_enter) is never called on it.  Take the reference
	 *	by hand.
	 */
	vm_object_reference(vm_submap_object);

	/* Reserve address space in the parent, anywhere from its minimum. */
	base = vm_map_min(parent);
	rc = vm_map_enter(parent, &base, size,
			  (vm_offset_t) 0, TRUE,
			  vm_submap_object, (vm_offset_t) 0, FALSE,
			  VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
	if (KERN_SUCCESS != rc)
		panic("kmem_submap");

	limit = base + size;

	/* The submap shares the parent's pmap; account for the extra user. */
	pmap_reference(vm_map_pmap(parent));
	vm_map_setup(map, vm_map_pmap(parent), base, limit, pageable);

	rc = vm_map_submap(parent, base, limit, map);
	if (KERN_SUCCESS != rc)
		panic("kmem_submap");

	*min = base;
	*max = limit;
}
Пример #5
0
/**
 * Get the VM object representing a given memory mapping of the compositor. This
 * gets or allocates a CFB pool corresponding to the FD being used to perform
 * the user's mmap() call. If a new FD is mmap()ped, a new CFB pool is allocated
 * and returned. If the same FD is mmap()ped again, the same CFB pool is
 * returned. Each vm_object corresponds directly with a CFB pool.
 *
 * dev is the compositor character device; its si_drv1 holds the softc.
 * offset is a guaranteed-page-aligned offset into the FD requested by the user
 * in their call to mmap(). We may modify it (this implementation does not).
 * size is a guaranteed-page-rounded size for the mapping as requested by the
 * user in their call to mmap().
 * obj_res receives the VM object to map, or NULL on failure.
 * nprot is the requested protection, passed through to the device pager.
 *
 * Returns 0 on success, ENOMEM if no CFB pool could be found or allocated for
 * the FD, or EINVAL if the device pager object could not be allocated.
 */
static int
cheri_compositor_cfb_mmap_single(struct cdev *dev, vm_ooffset_t *offset,
    vm_size_t size, struct vm_object **obj_res, int nprot)
{
	struct cheri_compositor_softc *sc;
	struct cfb_vm_object *cfb_vm_obj;
	struct vm_object *vm_obj = NULL;
	struct file *cdev_fd;
	/* Initialised so that the final debug message never reads an
	 * indeterminate pointer when the pool lookup below fails. */
	struct compositor_cfb_pool *cfb_pool = NULL;
	int error;

	sc = dev->si_drv1;

	error = 0;

	CHERI_COMPOSITOR_DEBUG(sc,
	    "dev: %p, offset: %lu, size: %lu, nprot: %i", dev, *offset, size,
	    nprot);

	cdev_fd = curthread->td_fpop;
	KASSERT(cdev_fd != NULL, ("mmap_single td_fpop == NULL"));

	CHERI_COMPOSITOR_DEBUG(sc, "cdev_fd: %p", cdev_fd);

	/* Allocate a CFB VM object to associate the cdev with the CFB pool
	 * mapping. Note: The ordering here is fairly sensitive to changes, as
	 * the cdev_pager_allocate() call results in sub-calls to
	 * cheri_compositor_cfb_pg_fault(), which assumes various fields in the
	 * CFB VM object have been initialised.
	 *
	 * The CFB VM object gets destroyed in
	 * cheri_compositor_cfb_pg_dtor(). */
	cfb_vm_obj =
	    malloc(sizeof(*cfb_vm_obj), M_CHERI_COMPOSITOR, M_WAITOK | M_ZERO);

	CHERI_COMPOSITOR_LOCK(sc);

	/* Find/Allocate a pool mapping for this FD. */
	if (dup_or_allocate_cfb_pool_for_cdev_fd(sc, cdev_fd,
	    NULL /* set later */, &cfb_pool) != 0) {
		free(cfb_vm_obj, M_CHERI_COMPOSITOR);
		error = ENOMEM;
		goto done;
	}

	/* If a pool had already been allocated for this FD, re-use its VM
	 * object. The CFB VM object allocated above is not needed in that
	 * case and has not been published anywhere, so release it here;
	 * previously it was leaked on every repeated mmap() of the same FD.
	 *
	 * NOTE(review): if dup_or_allocate_cfb_pool_for_cdev_fd() took a new
	 * pool reference for this call, confirm whether it also needs to be
	 * dropped on this path. */
	if (cfb_pool->vm_obj != NULL) {
		vm_object_reference(cfb_pool->vm_obj);
		vm_obj = cfb_pool->vm_obj;
		free(cfb_vm_obj, M_CHERI_COMPOSITOR);
		goto done;
	}

	/* Update the CFB VM object with the pool mapping and cdev. These have
	 * both been referenced, and the references are transferred to the CFB
	 * VM object. This must happen before cdev_pager_allocate(). */
	cfb_vm_obj->dev = dev;
	cfb_vm_obj->pool = cfb_pool;

	/* Allocate a device pager VM object. */
	vm_obj = cdev_pager_allocate(cfb_vm_obj, OBJT_DEVICE,
	    &cheri_compositor_cfb_pager_ops, size, nprot,
	    *offset, curthread->td_ucred);

	if (vm_obj == NULL) {
		/* The destructor is invoked with the softc lock dropped. */
		CHERI_COMPOSITOR_UNLOCK(sc);
		cheri_compositor_cfb_pg_dtor(cfb_vm_obj);
		error = EINVAL;
		goto done_unlocked;
	}

	/* Update the CFB pool to store the VM object. Transfer the reference
	 * from allocation. */
	cfb_pool->vm_obj = vm_obj;

done:
	CHERI_COMPOSITOR_UNLOCK(sc);
done_unlocked:
	CHERI_COMPOSITOR_DEBUG(sc,
	    "Finished with vm_obj: %p, cfb_pool: %p (retval: %u).",
	    vm_obj, cfb_pool, error);

	*obj_res = vm_obj;

	/* Don't need to modify the offset. It was originally passed by the user
	 * as an offset from the start of the cdev FD. Since the cdev FD maps
	 * directly to a CFB pool/VM object, the offset becomes an offset from
	 * the start of the CFB pool/VM object. */

	return (error);
}
Пример #6
0
/*
 *	kmem_alloc_contig:
 *
 *	Allocate 'size' bytes (rounded up to whole pages) of wired memory
 *	in 'map', backed by pages obtained from cpm_allocate().  'mask' is
 *	passed to vm_map_find_space() as the address mask; 'max_pnum' and
 *	'pnum_mask' are passed to cpm_allocate().  Only KMA_KOBJECT,
 *	KMA_LOMEM and KMA_NOPAGEWAIT are accepted in 'flags'.
 *
 *	On success the address of the region is stored in '*addrp' and
 *	KERN_SUCCESS is returned.  A NULL map, unsupported flags or a zero
 *	size yield KERN_INVALID_ARGUMENT; other failures return the error
 *	of the step that failed.
 */
kern_return_t
kmem_alloc_contig(
	vm_map_t		map,
	vm_offset_t		*addrp,
	vm_size_t		size,
	vm_offset_t 		mask,
	ppnum_t			max_pnum,
	ppnum_t			pnum_mask,
	int 			flags)
{
	vm_object_t		obj;
	vm_object_offset_t	obj_base;
	vm_map_offset_t		va;
	vm_map_offset_t		va_start, va_end;
	vm_map_size_t		span, filled;
	vm_map_entry_t		slot;
	vm_page_t		pg, chain;
	kern_return_t		rc;

	if (map == VM_MAP_NULL)
		return KERN_INVALID_ARGUMENT;
	if ((flags & ~(KMA_KOBJECT | KMA_LOMEM | KMA_NOPAGEWAIT)) != 0)
		return KERN_INVALID_ARGUMENT;

	if (size == 0) {
		*addrp = 0;
		return KERN_INVALID_ARGUMENT;
	}

	span = vm_map_round_page(size);

	/*
	 *	Obtain the object, and the reference that will be donated
	 *	to the map entry, before the map is locked; doing it later
	 *	risks deadlock with the default pager.
	 */
	if (flags & KMA_KOBJECT) {
		obj = kernel_object;
		vm_object_reference(obj);
	} else {
		obj = vm_object_allocate(span);
	}

	rc = vm_map_find_space(map, &va, span, (vm_map_offset_t)mask, 0, &slot);
	if (rc != KERN_SUCCESS) {
		vm_object_deallocate(obj);
		return rc;
	}

	/* Page-aligned bounds of the new region, used for wire/remove. */
	va_start = vm_map_trunc_page(va);
	va_end = vm_map_round_page(va + span);

	/* Kernel-object mappings are offset by their address; others by 0. */
	obj_base = (obj == kernel_object) ? va : 0;
	slot->object.vm_object = obj;
	slot->offset = obj_base;

	/* Hold an extra object ref in case the map entry gets deleted */
	vm_object_reference(obj);
	vm_map_unlock(map);

	rc = cpm_allocate(CAST_DOWN(vm_size_t, span), &chain, max_pnum, pnum_mask, FALSE, flags);
	if (rc != KERN_SUCCESS) {
		vm_map_remove(map, va_start, va_end, 0);
		vm_object_deallocate(obj);
		*addrp = 0;
		return rc;
	}

	/* Unlink each page from the returned chain and enter it in the object. */
	vm_object_lock(obj);
	filled = 0;
	while (filled < span) {
		pg = chain;
		chain = NEXT_PAGE(pg);
		*(NEXT_PAGE_PTR(pg)) = VM_PAGE_NULL;
		pg->busy = FALSE;
		vm_page_insert(pg, obj, obj_base + filled);
		filled += PAGE_SIZE;
	}
	vm_object_unlock(obj);

	rc = vm_map_wire(map, va_start, va_end, VM_PROT_DEFAULT, FALSE);
	if (rc != KERN_SUCCESS) {
		/* For the shared kernel object the pages must be pulled
		 * back out explicitly. */
		if (obj == kernel_object) {
			vm_object_lock(obj);
			vm_object_page_remove(obj, obj_base, obj_base + span);
			vm_object_unlock(obj);
		}
		vm_map_remove(map, va_start, va_end, 0);
		vm_object_deallocate(obj);
		return rc;
	}

	/* Drop the extra reference taken above. */
	vm_object_deallocate(obj);

	if (obj == kernel_object)
		vm_map_simplify(map, va);

	*addrp = (vm_offset_t) va;
	assert((vm_map_offset_t) *addrp == va);
	return KERN_SUCCESS;
}
Пример #7
0
/*
 * In-kernel implementation of execve().
 *
 * td    - the calling thread; the image is activated in td->td_proc.
 * args  - image arguments.  When args->fname is non-NULL the image is
 *         looked up by name (with UIO_SYSSPACE); otherwise the vnode is
 *         obtained from the descriptor args->fd.  args is released with
 *         exec_free_args() before this function returns.
 * mac_p - MAC label argument, only consumed when MAC is compiled in.
 *
 * Returns EJUSTRETURN on success so that the register state installed by
 * sv_setregs is not overwritten, or an errno value on failure.  If the
 * failure happens after the old address space was destroyed
 * (imgp->vmspace_destroyed), the process is terminated via exit1().
 *
 * NOTE(review): the historical remark that "all arguments are assumed to
 * be userspace pointers from the passed thread" does not obviously match
 * the UIO_SYSSPACE lookup below - confirm against the callers.
 */
static int
do_execve(struct thread *td, struct image_args *args, struct mac *mac_p)
{
	struct proc *p = td->td_proc;
	struct nameidata nd;
	struct ucred *oldcred;
	struct uidinfo *euip = NULL;
	register_t *stack_base;
	int error, i;
	struct image_params image_params, *imgp;
	struct vattr attr;
	int (*img_first)(struct image_params *);
	struct pargs *oldargs = NULL, *newargs = NULL;
	struct sigacts *oldsigacts = NULL, *newsigacts = NULL;
#ifdef KTRACE
	struct vnode *tracevp = NULL;
	struct ucred *tracecred = NULL;
#endif
	struct vnode *oldtextvp = NULL, *newtextvp;
	int credential_changing;
	int textset;
#ifdef MAC
	struct label *interpvplabel = NULL;
	int will_transition;
#endif
#ifdef HWPMC_HOOKS
	struct pmckern_procexec pe;
#endif
	static const char fexecv_proc_title[] = "(fexecv)";

	imgp = &image_params;

	/*
	 * Lock the process and set the P_INEXEC flag to indicate that
	 * it should be left alone until we're done here.  This is
	 * necessary to avoid race conditions - e.g. in ptrace() -
	 * that might allow a local user to illicitly obtain elevated
	 * privileges.
	 */
	PROC_LOCK(p);
	KASSERT((p->p_flag & P_INEXEC) == 0,
	    ("%s(): process already has P_INEXEC flag", __func__));
	p->p_flag |= P_INEXEC;
	PROC_UNLOCK(p);

	/*
	 * Initialize part of the common data
	 */
	bzero(imgp, sizeof(*imgp));
	imgp->proc = p;
	imgp->attr = &attr;
	imgp->args = args;
	oldcred = p->p_ucred;

#ifdef MAC
	error = mac_execve_enter(imgp, mac_p);
	if (error)
		goto exec_fail;
#endif

	/*
	 * Translate the file name. namei() returns a vnode pointer
	 *	in ni_vp among other things.
	 *
	 * XXXAUDIT: It would be desirable to also audit the name of the
	 * interpreter if this is an interpreted binary.
	 */
	if (args->fname != NULL) {
		NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
		    | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
	}

	SDT_PROBE1(proc, , , exec, args->fname);

	/*
	 * Re-entered via goto when the image turns out to be interpreted;
	 * by then args->fname has been replaced with the interpreter name
	 * and nd re-initialised accordingly.
	 */
interpret:
	if (args->fname != NULL) {
#ifdef CAPABILITY_MODE
		/*
		 * While capability mode can't reach this point via direct
		 * path arguments to execve(), we also don't allow
		 * interpreters to be used in capability mode (for now).
		 * Catch indirect lookups and return a permissions error.
		 */
		if (IN_CAPABILITY_MODE(td)) {
			error = ECAPMODE;
			goto exec_fail;
		}
#endif
		error = namei(&nd);
		if (error)
			goto exec_fail;

		newtextvp = nd.ni_vp;
		imgp->vp = newtextvp;
	} else {
		AUDIT_ARG_FD(args->fd);
		/*
		 * Descriptors opened only with O_EXEC or O_RDONLY are allowed.
		 */
		error = fgetvp_exec(td, args->fd, &cap_fexecve_rights, &newtextvp);
		if (error)
			goto exec_fail;
		vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY);
		AUDIT_ARG_VNODE1(newtextvp);
		imgp->vp = newtextvp;
	}

	/*
	 * Check file permissions (also 'opens' file)
	 */
	error = exec_check_permissions(imgp);
	if (error)
		goto exec_fail_dealloc;

	/* Hold the vnode's VM object, if any; released at exec_fail_dealloc. */
	imgp->object = imgp->vp->v_object;
	if (imgp->object != NULL)
		vm_object_reference(imgp->object);

	/*
	 * Set VV_TEXT now so no one can write to the executable while we're
	 * activating it.
	 *
	 * Remember if this was set before and unset it in case this is not
	 * actually an executable image.
	 */
	textset = VOP_IS_TEXT(imgp->vp);
	VOP_SET_TEXT(imgp->vp);

	error = exec_map_first_page(imgp);
	if (error)
		goto exec_fail_dealloc;

	imgp->proc->p_osrel = 0;
	imgp->proc->p_fctl0 = 0;

	/*
	 * Implement image setuid/setgid.
	 *
	 * Determine new credentials before attempting image activators
	 * so that it can be used by process_exec handlers to determine
	 * credential/setid changes.
	 *
	 * Don't honor setuid/setgid if the filesystem prohibits it or if
	 * the process is being traced.
	 *
	 * We disable setuid/setgid/etc in capability mode on the basis
	 * that most setugid applications are not written with that
	 * environment in mind, and will therefore almost certainly operate
	 * incorrectly. In principle there's no reason that setugid
	 * applications might not be useful in capability mode, so we may want
	 * to reconsider this conservative design choice in the future.
	 *
	 * XXXMAC: For the time being, use NOSUID to also prohibit
	 * transitions on the file system.
	 */
	credential_changing = 0;
	credential_changing |= (attr.va_mode & S_ISUID) &&
	    oldcred->cr_uid != attr.va_uid;
	credential_changing |= (attr.va_mode & S_ISGID) &&
	    oldcred->cr_gid != attr.va_gid;
#ifdef MAC
	will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
	    interpvplabel, imgp);
	credential_changing |= will_transition;
#endif

	/* Don't inherit PROC_PDEATHSIG_CTL value if setuid/setgid. */
	if (credential_changing)
		imgp->proc->p_pdeathsig = 0;

	if (credential_changing &&
#ifdef CAPABILITY_MODE
	    ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
#endif
	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
	    (p->p_flag & P_TRACED) == 0) {
		imgp->credential_setid = true;
		/* The vnode lock is dropped around crdup()/uifind(). */
		VOP_UNLOCK(imgp->vp, 0);
		imgp->newcred = crdup(oldcred);
		if (attr.va_mode & S_ISUID) {
			euip = uifind(attr.va_uid);
			change_euid(imgp->newcred, euip);
		}
		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
		if (attr.va_mode & S_ISGID)
			change_egid(imgp->newcred, attr.va_gid);
		/*
		 * Implement correct POSIX saved-id behavior.
		 *
		 * XXXMAC: Note that the current logic will save the
		 * uid and gid if a MAC domain transition occurs, even
		 * though maybe it shouldn't.
		 */
		change_svuid(imgp->newcred, imgp->newcred->cr_uid);
		change_svgid(imgp->newcred, imgp->newcred->cr_gid);
	} else {
		/*
		 * Implement correct POSIX saved-id behavior.
		 *
		 * XXX: It's not clear that the existing behavior is
		 * POSIX-compliant.  A number of sources indicate that the
		 * saved uid/gid should only be updated if the new ruid is
		 * not equal to the old ruid, or the new euid is not equal
		 * to the old euid and the new euid is not equal to the old
		 * ruid.  The FreeBSD code always updates the saved uid/gid.
		 * Also, this code uses the new (replaced) euid and egid as
		 * the source, which may or may not be the right ones to use.
		 */
		if (oldcred->cr_svuid != oldcred->cr_uid ||
		    oldcred->cr_svgid != oldcred->cr_gid) {
			VOP_UNLOCK(imgp->vp, 0);
			imgp->newcred = crdup(oldcred);
			vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
			change_svuid(imgp->newcred, imgp->newcred->cr_uid);
			change_svgid(imgp->newcred, imgp->newcred->cr_gid);
		}
	}
	/* The new credentials are installed into the process later. */

	/*
	 * Do the best to calculate the full path to the image file.
	 */
	if (args->fname != NULL && args->fname[0] == '/')
		imgp->execpath = args->fname;
	else {
		VOP_UNLOCK(imgp->vp, 0);
		if (vn_fullpath(td, imgp->vp, &imgp->execpath,
		    &imgp->freepath) != 0)
			imgp->execpath = args->fname;
		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
	}

	/*
	 *	If the current process has a special image activator it
	 *	wants to try first, call it.   For example, emulating shell
	 *	scripts differently.
	 */
	error = -1;
	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
		error = img_first(imgp);

	/*
	 *	Loop through the list of image activators, calling each one.
	 *	An activator returns -1 if there is no match, 0 on success,
	 *	and an error otherwise.
	 */
	for (i = 0; error == -1 && execsw[i]; ++i) {
		if (execsw[i]->ex_imgact == NULL ||
		    execsw[i]->ex_imgact == img_first) {
			continue;
		}
		error = (*execsw[i]->ex_imgact)(imgp);
	}

	if (error) {
		/* No activator recognised the image: report ENOEXEC. */
		if (error == -1) {
			if (textset == 0)
				VOP_UNSET_TEXT(imgp->vp);
			error = ENOEXEC;
		}
		goto exec_fail_dealloc;
	}

	/*
	 * Special interpreter operation, cleanup and loop up to try to
	 * activate the interpreter.
	 */
	if (imgp->interpreted) {
		exec_unmap_first_page(imgp);
		/*
		 * VV_TEXT needs to be unset for scripts.  There is a short
		 * period before we determine that something is a script where
		 * VV_TEXT will be set. The vnode lock is held over this
		 * entire period so nothing should illegitimately be blocked.
		 */
		VOP_UNSET_TEXT(imgp->vp);
		/* free name buffer and old vnode */
		if (args->fname != NULL)
			NDFREE(&nd, NDF_ONLY_PNBUF);
#ifdef MAC
		mac_execve_interpreter_enter(newtextvp, &interpvplabel);
#endif
		if (imgp->opened) {
			VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td);
			imgp->opened = 0;
		}
		vput(newtextvp);
		vm_object_deallocate(imgp->object);
		imgp->object = NULL;
		/* Discard credential state computed for the script itself. */
		imgp->credential_setid = false;
		if (imgp->newcred != NULL) {
			crfree(imgp->newcred);
			imgp->newcred = NULL;
		}
		imgp->execpath = NULL;
		free(imgp->freepath, M_TEMP);
		imgp->freepath = NULL;
		/* set new name to that of the interpreter */
		NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
		    UIO_SYSSPACE, imgp->interpreter_name, td);
		args->fname = imgp->interpreter_name;
		goto interpret;
	}

	/*
	 * NB: We unlock the vnode here because it is believed that none
	 * of the sv_copyout_strings/sv_fixup operations require the vnode.
	 */
	VOP_UNLOCK(imgp->vp, 0);

	if (disallow_high_osrel &&
	    P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) {
		error = ENOEXEC;
		uprintf("Osrel %d for image %s too high\n", p->p_osrel,
		    imgp->execpath != NULL ? imgp->execpath : "<unresolved>");
		/* exec_fail_dealloc expects the vnode to be locked. */
		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
		goto exec_fail_dealloc;
	}

	/* ABI enforces the use of Capsicum. Switch into capabilities mode. */
	if (SV_PROC_FLAG(p, SV_CAPSICUM))
		sys_cap_enter(td, NULL);

	/*
	 * Copy out strings (args and env) and initialize stack base.
	 */
	stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);

	/*
	 * Stack setup.
	 */
	error = (*p->p_sysent->sv_fixup)(&stack_base, imgp);
	if (error != 0) {
		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
		goto exec_fail_dealloc;
	}

	if (args->fdp != NULL) {
		/* Install a brand new file descriptor table. */
		fdinstall_remapped(td, args->fdp);
		args->fdp = NULL;
	} else {
		/*
		 * Keep on using the existing file descriptor table. For
		 * security and other reasons, the file descriptor table
		 * cannot be shared after an exec.
		 */
		fdunshare(td);
		/* close files on exec */
		fdcloseexec(td);
	}

	/*
	 * Malloc things before we need locks.
	 */
	i = exec_args_get_begin_envv(imgp->args) - imgp->args->begin_argv;
	/* Cache arguments if they fit inside our allowance */
	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
		newargs = pargs_alloc(i);
		bcopy(imgp->args->begin_argv, newargs->ar_args, i);
	}

	/*
	 * For security and other reasons, signal handlers cannot
	 * be shared after an exec. The new process gets a copy of the old
	 * handlers. In execsigs(), the new process will have its signals
	 * reset.
	 */
	if (sigacts_shared(p->p_sigacts)) {
		oldsigacts = p->p_sigacts;
		newsigacts = sigacts_alloc();
		sigacts_copy(newsigacts, oldsigacts);
	}

	vn_lock(imgp->vp, LK_SHARED | LK_RETRY);

	PROC_LOCK(p);
	if (oldsigacts)
		p->p_sigacts = newsigacts;
	/* Stop profiling */
	stopprofclock(p);

	/* reset caught signals */
	execsigs(p);

	/* name this process - nameiexec(p, ndp) */
	bzero(p->p_comm, sizeof(p->p_comm));
	if (args->fname)
		bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
		    min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
	else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0)
		bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
	bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
#ifdef KTR
	sched_clear_tdname(td);
#endif

	/*
	 * mark as execed, wakeup the process that vforked (if any) and tell
	 * it that it now has its own resources back
	 */
	p->p_flag |= P_EXEC;
	if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0)
		p->p_flag2 &= ~P2_NOTRACE;
	if (p->p_flag & P_PPWAIT) {
		p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
		cv_broadcast(&p->p_pwait);
		/* STOPs are no longer ignored, arrange for AST */
		signotify(td);
	}

	/*
	 * Implement image setuid/setgid installation.
	 */
	if (imgp->credential_setid) {
		/*
		 * Turn off syscall tracing for set-id programs, except for
		 * root.  Record any set-id flags first to make sure that
		 * we do not regain any tracing during a possible block.
		 */
		setsugid(p);

#ifdef KTRACE
		if (p->p_tracecred != NULL &&
		    priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED))
			ktrprocexec(p, &tracecred, &tracevp);
#endif
		/*
		 * Close any file descriptors 0..2 that reference procfs,
		 * then make sure file descriptors 0..2 are in use.
		 *
		 * Both fdsetugidsafety() and fdcheckstd() may call functions
		 * taking sleepable locks, so temporarily drop our locks.
		 */
		PROC_UNLOCK(p);
		VOP_UNLOCK(imgp->vp, 0);
		fdsetugidsafety(td);
		error = fdcheckstd(td);
		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
		/* On failure the proc lock is intentionally not re-taken. */
		if (error != 0)
			goto exec_fail_dealloc;
		PROC_LOCK(p);
#ifdef MAC
		if (will_transition) {
			mac_vnode_execve_transition(oldcred, imgp->newcred,
			    imgp->vp, interpvplabel, imgp);
		}
#endif
	} else {
		if (oldcred->cr_uid == oldcred->cr_ruid &&
		    oldcred->cr_gid == oldcred->cr_rgid)
			p->p_flag &= ~P_SUGID;
	}
	/*
	 * Set the new credentials.
	 *
	 * oldcred is cleared once it has been released so that the cleanup
	 * code below can tell that newcred is now owned by the process.
	 */
	if (imgp->newcred != NULL) {
		proc_set_cred(p, imgp->newcred);
		crfree(oldcred);
		oldcred = NULL;
	}

	/*
	 * Store the vp for use in procfs.  This vnode was referenced by namei
	 * or fgetvp_exec.
	 */
	oldtextvp = p->p_textvp;
	p->p_textvp = newtextvp;

#ifdef KDTRACE_HOOKS
	/*
	 * Tell the DTrace fasttrap provider about the exec if it
	 * has declared an interest.
	 */
	if (dtrace_fasttrap_exec)
		dtrace_fasttrap_exec(p);
#endif

	/*
	 * Notify others that we exec'd, and clear the P_INEXEC flag
	 * as we're now a bona fide freshly-execed process.
	 */
	KNOTE_LOCKED(p->p_klist, NOTE_EXEC);
	p->p_flag &= ~P_INEXEC;

	/* clear "fork but no exec" flag, as we _are_ execing */
	p->p_acflag &= ~AFORK;

	/*
	 * Free any previous argument cache and replace it with
	 * the new argument cache, if any.
	 */
	oldargs = p->p_args;
	p->p_args = newargs;
	newargs = NULL;

	PROC_UNLOCK(p);

#ifdef	HWPMC_HOOKS
	/*
	 * Check if system-wide sampling is in effect or if the
	 * current process is using PMCs.  If so, do exec() time
	 * processing.  This processing needs to happen AFTER the
	 * P_INEXEC flag is cleared.
	 */
	if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
		VOP_UNLOCK(imgp->vp, 0);
		pe.pm_credentialschanged = credential_changing;
		pe.pm_entryaddr = imgp->entry_addr;

		PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
	}
#endif

	/* Set values passed into the program in registers. */
	(*p->p_sysent->sv_setregs)(td, imgp, (u_long)(uintptr_t)stack_base);

	vfs_mark_atime(imgp->vp, td->td_ucred);

	SDT_PROBE1(proc, , , exec__success, args->fname);

	/*
	 * Common cleanup.  Reached by fall-through on success (error == 0)
	 * and by goto on failures that occur after the vnode was obtained.
	 */
exec_fail_dealloc:
	if (imgp->firstpage != NULL)
		exec_unmap_first_page(imgp);

	if (imgp->vp != NULL) {
		if (args->fname)
			NDFREE(&nd, NDF_ONLY_PNBUF);
		if (imgp->opened)
			VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
		/*
		 * On failure drop both the lock and the reference; on
		 * success only unlock, since the vnode is now p_textvp.
		 */
		if (error != 0)
			vput(imgp->vp);
		else
			VOP_UNLOCK(imgp->vp, 0);
	}

	if (imgp->object != NULL)
		vm_object_deallocate(imgp->object);

	free(imgp->freepath, M_TEMP);

	if (error == 0) {
		if (p->p_ptevents & PTRACE_EXEC) {
			PROC_LOCK(p);
			/* Re-check under the proc lock. */
			if (p->p_ptevents & PTRACE_EXEC)
				td->td_dbgflags |= TDB_EXEC;
			PROC_UNLOCK(p);
		}

		/*
		 * Stop the process here if its stop event mask has
		 * the S_EXEC bit set.
		 */
		STOPEVENT(p, S_EXEC, 0);
	} else {
		/*
		 * This label sits inside the else branch: failures that
		 * happen before a vnode was obtained jump straight here,
		 * bypassing the vnode/object cleanup above.
		 */
exec_fail:
		/* we're done here, clear P_INEXEC */
		PROC_LOCK(p);
		p->p_flag &= ~P_INEXEC;
		PROC_UNLOCK(p);

		SDT_PROBE1(proc, , , exec__failure, error);
	}

	/*
	 * oldcred is still non-NULL only if newcred was never installed
	 * into the process, in which case it must be released here.
	 */
	if (imgp->newcred != NULL && oldcred != NULL)
		crfree(imgp->newcred);

#ifdef MAC
	mac_execve_exit(imgp);
	mac_execve_interpreter_exit(interpvplabel);
#endif
	exec_free_args(args);

	/*
	 * Handle deferred decrement of ref counts.
	 */
	if (oldtextvp != NULL)
		vrele(oldtextvp);
#ifdef KTRACE
	if (tracevp != NULL)
		vrele(tracevp);
	if (tracecred != NULL)
		crfree(tracecred);
#endif
	pargs_drop(oldargs);
	pargs_drop(newargs);
	if (oldsigacts != NULL)
		sigacts_free(oldsigacts);
	if (euip != NULL)
		uifree(euip);

	if (error && imgp->vmspace_destroyed) {
		/* sorry, no more process anymore. exit gracefully */
		exit1(td, 0, SIGABRT);
		/* NOT REACHED */
	}

#ifdef KTRACE
	if (error == 0)
		ktrprocctor(p);
#endif

	/*
	 * We don't want cpu_set_syscall_retval() to overwrite any of
	 * the register values put in place by exec_setregs().
	 * Implementations of cpu_set_syscall_retval() will leave
	 * registers unmodified when returning EJUSTRETURN.
	 */
	return (error == 0 ? EJUSTRETURN : error);
}
Пример #8
0
static int
link_elf_obj_load_file(const char *filename, linker_file_t * result)
{
	struct nlookupdata nd;
	struct thread  *td = curthread;	/* XXX */
	struct proc    *p = td->td_proc;
	char           *pathname;
	struct vnode   *vp;
	Elf_Ehdr       *hdr;
	Elf_Shdr       *shdr;
	Elf_Sym        *es;
	int		nbytes, i, j;
	vm_offset_t	mapbase;
	size_t		mapsize;
	int		error = 0;
	int		resid;
	elf_file_t	ef;
	linker_file_t	lf;
	int		symtabindex;
	int		symstrindex;
	int		shstrindex;
	int		nsym;
	int		pb, rl, ra;
	int		alignmask;

	/* XXX Hack for firmware loading where p == NULL */
	if (p == NULL) {
		p = &proc0;
	}

	KKASSERT(p != NULL);
	if (p->p_ucred == NULL) {
		kprintf("link_elf_obj_load_file: cannot load '%s' from filesystem"
			" this early\n", filename);
		return ENOENT;
	}
	shdr = NULL;
	lf = NULL;
	mapsize = 0;
	hdr = NULL;
	pathname = linker_search_path(filename);
	if (pathname == NULL)
		return ENOENT;

	error = nlookup_init(&nd, pathname, UIO_SYSSPACE, NLC_FOLLOW | NLC_LOCKVP);
	if (error == 0)
		error = vn_open(&nd, NULL, FREAD, 0);
	kfree(pathname, M_LINKER);
	if (error) {
		nlookup_done(&nd);
		return error;
	}
	vp = nd.nl_open_vp;
	nd.nl_open_vp = NULL;
	nlookup_done(&nd);

	/*
	 * Read the elf header from the file.
	 */
	hdr = kmalloc(sizeof(*hdr), M_LINKER, M_WAITOK);
	if (hdr == NULL) {
		error = ENOMEM;
		goto out;
	}
	error = vn_rdwr(UIO_READ, vp, (void *)hdr, sizeof(*hdr), 0,
			UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
	if (error)
		goto out;
	if (resid != 0) {
		error = ENOEXEC;
		goto out;
	}
	if (!IS_ELF(*hdr)) {
		error = ENOEXEC;
		goto out;
	}

	if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS
	    || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
		link_elf_obj_error(filename, "Unsupported file layout");
		error = ENOEXEC;
		goto out;
	}
	if (hdr->e_ident[EI_VERSION] != EV_CURRENT
	    || hdr->e_version != EV_CURRENT) {
		link_elf_obj_error(filename, "Unsupported file version");
		error = ENOEXEC;
		goto out;
	}
	if (hdr->e_type != ET_REL) {
		error = ENOSYS;
		goto out;
	}
	if (hdr->e_machine != ELF_TARG_MACH) {
		link_elf_obj_error(filename, "Unsupported machine");
		error = ENOEXEC;
		goto out;
	}

	ef = kmalloc(sizeof(struct elf_file), M_LINKER, M_WAITOK | M_ZERO);
	lf = linker_make_file(filename, ef, &link_elf_obj_file_ops);
	if (lf == NULL) {
		kfree(ef, M_LINKER);
		error = ENOMEM;
		goto out;
	}
	ef->nprogtab = 0;
	ef->e_shdr = 0;
	ef->nreltab = 0;
	ef->nrelatab = 0;

	/* Allocate and read in the section header */
	nbytes = hdr->e_shnum * hdr->e_shentsize;
	if (nbytes == 0 || hdr->e_shoff == 0 ||
	    hdr->e_shentsize != sizeof(Elf_Shdr)) {
		error = ENOEXEC;
		goto out;
	}
	shdr = kmalloc(nbytes, M_LINKER, M_WAITOK);
	if (shdr == NULL) {
		error = ENOMEM;
		goto out;
	}
	ef->e_shdr = shdr;
	error = vn_rdwr(UIO_READ, vp, (caddr_t) shdr, nbytes, hdr->e_shoff,
			UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
	if (error)
		goto out;
	if (resid) {
		error = ENOEXEC;
		goto out;
	}
	/* Scan the section header for information and table sizing. */
	nsym = 0;
	symtabindex = -1;
	symstrindex = -1;
	for (i = 0; i < hdr->e_shnum; i++) {
		switch (shdr[i].sh_type) {
		case SHT_PROGBITS:
		case SHT_NOBITS:
			ef->nprogtab++;
			break;
		case SHT_SYMTAB:
			nsym++;
			symtabindex = i;
			symstrindex = shdr[i].sh_link;
			break;
		case SHT_REL:
			ef->nreltab++;
			break;
		case SHT_RELA:
			ef->nrelatab++;
			break;
		case SHT_STRTAB:
			break;
		}
	}
	if (ef->nprogtab == 0) {
		link_elf_obj_error(filename, "file has no contents");
		error = ENOEXEC;
		goto out;
	}
	if (nsym != 1) {
		/* Only allow one symbol table for now */
		link_elf_obj_error(filename, "file has no valid symbol table");
		error = ENOEXEC;
		goto out;
	}
	if (symstrindex < 0 || symstrindex > hdr->e_shnum ||
	    shdr[symstrindex].sh_type != SHT_STRTAB) {
		link_elf_obj_error(filename, "file has invalid symbol strings");
		error = ENOEXEC;
		goto out;
	}
	/* Allocate space for tracking the load chunks */
	if (ef->nprogtab != 0)
		ef->progtab = kmalloc(ef->nprogtab * sizeof(*ef->progtab),
				      M_LINKER, M_WAITOK | M_ZERO);
	if (ef->nreltab != 0)
		ef->reltab = kmalloc(ef->nreltab * sizeof(*ef->reltab),
				     M_LINKER, M_WAITOK | M_ZERO);
	if (ef->nrelatab != 0)
		ef->relatab = kmalloc(ef->nrelatab * sizeof(*ef->relatab),
				      M_LINKER, M_WAITOK | M_ZERO);
	if ((ef->nprogtab != 0 && ef->progtab == NULL) ||
	    (ef->nreltab != 0 && ef->reltab == NULL) ||
	    (ef->nrelatab != 0 && ef->relatab == NULL)) {
		error = ENOMEM;
		goto out;
	}
	if (symtabindex == -1)
		panic("lost symbol table index");
	/* Allocate space for and load the symbol table */
	ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym);
	ef->ddbsymtab = kmalloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK);
	if (ef->ddbsymtab == NULL) {
		error = ENOMEM;
		goto out;
	}
	error = vn_rdwr(UIO_READ, vp, (void *)ef->ddbsymtab,
			shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset,
			UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
	if (error)
		goto out;
	if (resid != 0) {
		error = EINVAL;
		goto out;
	}
	if (symstrindex == -1)
		panic("lost symbol string index");
	/* Allocate space for and load the symbol strings */
	ef->ddbstrcnt = shdr[symstrindex].sh_size;
	ef->ddbstrtab = kmalloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK);
	if (ef->ddbstrtab == NULL) {
		error = ENOMEM;
		goto out;
	}
	error = vn_rdwr(UIO_READ, vp, ef->ddbstrtab,
			shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset,
			UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
	if (error)
		goto out;
	if (resid != 0) {
		error = EINVAL;
		goto out;
	}
	/* Do we have a string table for the section names?  */
	shstrindex = -1;
	if (hdr->e_shstrndx != 0 &&
	    shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) {
		shstrindex = hdr->e_shstrndx;
		ef->shstrcnt = shdr[shstrindex].sh_size;
		ef->shstrtab = kmalloc(shdr[shstrindex].sh_size, M_LINKER,
				       M_WAITOK);
		if (ef->shstrtab == NULL) {
			error = ENOMEM;
			goto out;
		}
		error = vn_rdwr(UIO_READ, vp, ef->shstrtab,
				shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset,
				UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
		if (error)
			goto out;
		if (resid != 0) {
			error = EINVAL;
			goto out;
		}
	}
	/* Size up code/data(progbits) and bss(nobits). */
	alignmask = 0;
	for (i = 0; i < hdr->e_shnum; i++) {
		switch (shdr[i].sh_type) {
		case SHT_PROGBITS:
		case SHT_NOBITS:
			alignmask = shdr[i].sh_addralign - 1;
			mapsize += alignmask;
			mapsize &= ~alignmask;
			mapsize += shdr[i].sh_size;
			break;
		}
	}

	/*
	 * We know how much space we need for the text/data/bss/etc. This
	 * stuff needs to be in a single chunk so that profiling etc can get
	 * the bounds and gdb can associate offsets with modules
	 */
	ef->object = vm_object_allocate(OBJT_DEFAULT,
					round_page(mapsize) >> PAGE_SHIFT);
	if (ef->object == NULL) {
		error = ENOMEM;
		goto out;
	}
	vm_object_reference(ef->object);
	ef->address = (caddr_t) vm_map_min(&kernel_map);
	ef->bytes = 0;

	/*
	 * In order to satisfy x86_64's architectural requirements on the
	 * location of code and data in the kernel's address space, request a
	 * mapping that is above the kernel.
	 *
	 * vkernel64's text+data is outside the managed VM space entirely.
	 */
#if defined(__amd64__) && defined(_KERNEL_VIRTUAL)
	error = vkernel_module_memory_alloc(&mapbase, round_page(mapsize));
#else
	mapbase = KERNBASE;
	error = vm_map_find(&kernel_map, ef->object, 0, &mapbase,
			    round_page(mapsize), PAGE_SIZE,
			    TRUE, VM_MAPTYPE_NORMAL,
			    VM_PROT_ALL, VM_PROT_ALL, FALSE);
	if (error) {
		vm_object_deallocate(ef->object);
		ef->object = NULL;
		goto out;
	}
	/* Wire the pages */
	error = vm_map_wire(&kernel_map, mapbase,
			    mapbase + round_page(mapsize), 0);
#endif
	if (error != KERN_SUCCESS) {
		error = ENOMEM;
		goto out;
	}
	/* Inform the kld system about the situation */
	lf->address = ef->address = (caddr_t) mapbase;
	lf->size = round_page(mapsize);
	ef->bytes = mapsize;

	/*
	 * Now load code/data(progbits), zero bss(nobits), allocate space for
	 * and load relocs
	 */
	pb = 0;
	rl = 0;
	ra = 0;
	alignmask = 0;
	for (i = 0; i < hdr->e_shnum; i++) {
		switch (shdr[i].sh_type) {
		case SHT_PROGBITS:
		case SHT_NOBITS:
			alignmask = shdr[i].sh_addralign - 1;
			mapbase += alignmask;
			mapbase &= ~alignmask;
			if (ef->shstrtab && shdr[i].sh_name != 0)
				ef->progtab[pb].name =
					ef->shstrtab + shdr[i].sh_name;
			else if (shdr[i].sh_type == SHT_PROGBITS)
				ef->progtab[pb].name = "<<PROGBITS>>";
			else
				ef->progtab[pb].name = "<<NOBITS>>";
#if 0
			if (ef->progtab[pb].name != NULL &&
			    !strcmp(ef->progtab[pb].name, "set_pcpu"))
				ef->progtab[pb].addr =
					dpcpu_alloc(shdr[i].sh_size);
#ifdef VIMAGE
			else if (ef->progtab[pb].name != NULL &&
				 !strcmp(ef->progtab[pb].name, VNET_SETNAME))
				ef->progtab[pb].addr =
					vnet_data_alloc(shdr[i].sh_size);
#endif
			else
#endif
				ef->progtab[pb].addr =
					(void *)(uintptr_t) mapbase;
			if (ef->progtab[pb].addr == NULL) {
				error = ENOSPC;
				goto out;
			}
			ef->progtab[pb].size = shdr[i].sh_size;
			ef->progtab[pb].sec = i;
			if (shdr[i].sh_type == SHT_PROGBITS) {
				error = vn_rdwr(UIO_READ, vp,
						ef->progtab[pb].addr,
						shdr[i].sh_size, shdr[i].sh_offset,
						UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred,
						&resid);
				if (error)
					goto out;
				if (resid != 0) {
					error = EINVAL;
					goto out;
				}
#if 0
				/* Initialize the per-cpu or vnet area. */
				if (ef->progtab[pb].addr != (void *)mapbase &&
				    !strcmp(ef->progtab[pb].name, "set_pcpu"))
					dpcpu_copy(ef->progtab[pb].addr,
						   shdr[i].sh_size);
#ifdef VIMAGE
				else if (ef->progtab[pb].addr !=
					 (void *)mapbase &&
					 !strcmp(ef->progtab[pb].name, VNET_SETNAME))
					vnet_data_copy(ef->progtab[pb].addr,
						       shdr[i].sh_size);
#endif
#endif
			} else
				bzero(ef->progtab[pb].addr, shdr[i].sh_size);

			/* Update all symbol values with the offset. */
			for (j = 0; j < ef->ddbsymcnt; j++) {
				es = &ef->ddbsymtab[j];
				if (es->st_shndx != i)
					continue;
				es->st_value += (Elf_Addr) ef->progtab[pb].addr;
			}
			mapbase += shdr[i].sh_size;
			pb++;
			break;
		case SHT_REL:
			ef->reltab[rl].rel = kmalloc(shdr[i].sh_size, M_LINKER, M_WAITOK);
			ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel);
			ef->reltab[rl].sec = shdr[i].sh_info;
			error = vn_rdwr(UIO_READ, vp,
					(void *)ef->reltab[rl].rel,
					shdr[i].sh_size, shdr[i].sh_offset,
					UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
			if (error)
				goto out;
			if (resid != 0) {
				error = EINVAL;
				goto out;
			}
			rl++;
			break;
		case SHT_RELA:
			ef->relatab[ra].rela = kmalloc(shdr[i].sh_size, M_LINKER, M_WAITOK);
			ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela);
			ef->relatab[ra].sec = shdr[i].sh_info;
			error = vn_rdwr(UIO_READ, vp,
					(void *)ef->relatab[ra].rela,
					shdr[i].sh_size, shdr[i].sh_offset,
					UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
			if (error)
				goto out;
			if (resid != 0) {
				error = EINVAL;
				goto out;
			}
			ra++;
			break;
		}
	}
	if (pb != ef->nprogtab)
		panic("lost progbits");
	if (rl != ef->nreltab)
		panic("lost reltab");
	if (ra != ef->nrelatab)
		panic("lost relatab");
	if (mapbase != (vm_offset_t) ef->address + mapsize)
		panic("mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n",
		      mapbase, ef->address, mapsize,
		      (vm_offset_t) ef->address + mapsize);

	/* Local intra-module relocations */
	link_elf_obj_reloc_local(lf);

	/* Pull in dependencies */
	error = linker_load_dependencies(lf);
	if (error)
		goto out;

	/* External relocations */
	error = relocate_file(lf);
	if (error)
		goto out;

	*result = lf;

out:
	if (error && lf)
		linker_file_unload(lf /*, LINKER_UNLOAD_FORCE */);
	if (hdr)
		kfree(hdr, M_LINKER);
	vn_unlock(vp);
	vn_close(vp, FREAD);

	return error;
}
Пример #9
0
/*
 * vm_contig_pg_kmap:
 *
 * Enter a run of pages from vm_page_array[] (previously obtained with
 * vm_contig_pg_alloc) into kernel virtual address space.  After a
 * successful mapping the pages belong to the kernel and are to be freed
 * with kmem_free(&kernel_map, addr, size).
 *
 *	start	index of the first page in vm_page_array[]
 *	size	length of the mapping in bytes (rounded up to a page
 *		multiple; must not round to 0)
 *	map	map in which the virtual range is carved out
 *	flags	M_ZERO requests zeroing of pages not already marked PG_ZERO
 *
 * Returns the chosen kernel virtual address, or 0 when no virtual space
 * could be found in the map.
 *
 * No requirements.
 */
vm_offset_t
vm_contig_pg_kmap(int start, u_long size, vm_map_t map, int flags)
{
	vm_page_t pages = vm_page_array;
	vm_offset_t kva, va;
	vm_page_t pg;
	int reserved, idx;

	size = round_page(size);
	if (size == 0)
		panic("vm_contig_pg_kmap: size must not be 0");

	crit_enter();
	lwkt_gettoken(&vm_token);

	/*
	 * Carve a page-aligned virtual range out of the map for the
	 * already-allocated physical pages.
	 */
	reserved = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	vm_map_lock(map);
	if (vm_map_findspace(map, vm_map_min(map), size, PAGE_SIZE, 0, &kva) !=
	    KERN_SUCCESS) {
		/*
		 * XXX Running out of kernel virtual space is rare, so the
		 * physical pages allocated by the caller are not handed
		 * back here.
		 */
		vm_map_unlock(map);
		vm_map_entry_release(reserved);
		kva = 0;
		goto done;
	}

	/*
	 * kernel_object maps 1:1 to kernel_map, hence the object offset
	 * passed to vm_map_insert() is the virtual address itself.
	 */
	vm_object_hold(&kernel_object);
	vm_object_reference(&kernel_object);
	vm_map_insert(map, &reserved,
		      &kernel_object, kva,
		      kva, kva + size,
		      VM_MAPTYPE_NORMAL,
		      VM_PROT_ALL, VM_PROT_ALL,
		      0);
	vm_map_unlock(map);
	vm_map_entry_release(reserved);

	/*
	 * Hang each physical page off kernel_object at the index matching
	 * its virtual address, zeroing on request.
	 */
	idx = start;
	va = kva;
	while (idx < (start + size / PAGE_SIZE)) {
		pg = &pages[idx];
		vm_page_insert(pg, &kernel_object, OFF_TO_IDX(va));
		if ((flags & M_ZERO) && (pg->flags & PG_ZERO) == 0)
			pmap_zero_page(VM_PAGE_TO_PHYS(pg));
		pg->flags = 0;
		va += PAGE_SIZE;
		idx++;
	}
	vm_map_wire(map, kva, kva + size, 0);

	vm_object_drop(&kernel_object);

done:
	lwkt_reltoken(&vm_token);
	crit_exit();
	return (kva);
}
Пример #10
0
/*
 * Discard the old address space of the exec'ing process and set up a new
 * one containing the optional shared page and the initial stack.
 *
 * The stack mapping reserves the full stack size, but vm_ssize is only
 * set to sgrowsiz because the stack is grown automatically on a page
 * fault.
 *
 * Returns 0 on success or an errno value (from vmspace_exec() or
 * translated from a KERN_* code by vm_mmap_to_errno()).
 */
int
exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv)
{
	struct proc *p = imgp->proc;
	struct vmspace *vmspace = p->p_vmspace;
	struct rlimit rlim_stack;
	vm_offset_t sv_minuser, stack_addr;
	vm_object_t obj;
	vm_map_t map;
	u_long ssiz;
	int error;

	imgp->vmspace_destroyed = 1;
	imgp->sysent = sv;

	/* May be called with Giant held */
	EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp);

	/*
	 * Unless mapping at address zero is permitted, never let the user
	 * range start below the first page.
	 */
	map = &vmspace->vm_map;
	sv_minuser = map_at_zero ? sv->sv_minuser :
	    MAX(sv->sv_minuser, PAGE_SIZE);

	/*
	 * The existing vmspace is emptied and reused only if nobody else
	 * references it, its bounds already match the new ABI and the
	 * machine-dependent code agrees.  In every other case a brand new
	 * vmspace is created so that other threads are not disrupted.
	 */
	if (vmspace->vm_refcnt != 1 || vm_map_min(map) != sv_minuser ||
	    vm_map_max(map) != sv->sv_maxuser ||
	    !cpu_exec_vmspace_reuse(p, map)) {
		error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
		if (error != 0)
			return (error);
		vmspace = p->p_vmspace;
		map = &vmspace->vm_map;
	} else {
		shmexit(vmspace);
		pmap_remove_pages(vmspace_pmap(vmspace));
		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
		/*
		 * An exec terminates mlockall(MCL_FUTURE), and the ASLR
		 * state has to be re-evaluated for the new image.
		 */
		vm_map_lock(map);
		vm_map_modflags(map, 0, MAP_WIREFUTURE | MAP_ASLR |
		    MAP_ASLR_IGNSTART);
		vm_map_unlock(map);
	}
	map->flags |= imgp->map_flags;

	/* Map the ABI's shared page, if it has one. */
	obj = sv->sv_shared_page_obj;
	if (obj != NULL) {
		/* Reference handed to the mapping; given back on failure. */
		vm_object_reference(obj);
		error = vm_map_fixed(map, obj, 0,
		    sv->sv_shared_page_base, sv->sv_shared_page_len,
		    VM_PROT_READ | VM_PROT_EXECUTE,
		    VM_PROT_READ | VM_PROT_EXECUTE,
		    MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
		if (error != KERN_SUCCESS) {
			vm_object_deallocate(obj);
			return (vm_mmap_to_errno(error));
		}
	}

	/*
	 * Pick the stack size: an image-specified size (clamped to the
	 * hard limit, raising the soft limit when needed) takes priority,
	 * then the ABI's maximum, then the global maximum.
	 */
	if (imgp->stack_sz == 0) {
		if (sv->sv_maxssiz == NULL)
			ssiz = maxssiz;
		else
			ssiz = *sv->sv_maxssiz;
	} else {
		ssiz = trunc_page(imgp->stack_sz);
		PROC_LOCK(p);
		lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack);
		PROC_UNLOCK(p);
		if (ssiz > rlim_stack.rlim_max)
			ssiz = rlim_stack.rlim_max;
		if (ssiz > rlim_stack.rlim_cur) {
			rlim_stack.rlim_cur = ssiz;
			kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack);
		}
	}

	/* Allocate the new, downward-growing stack below sv_usrstack. */
	stack_addr = sv->sv_usrstack - ssiz;
	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
	    (obj != NULL && imgp->stack_prot != 0) ? imgp->stack_prot :
	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
	if (error != KERN_SUCCESS)
		return (vm_mmap_to_errno(error));

	/*
	 * vm_ssize and vm_maxsaddr are somewhat antiquated, yet they are
	 * still what enforces the stack rlimit on the process stack.
	 */
	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
	vmspace->vm_maxsaddr = (char *)stack_addr;

	return (0);
}
Пример #11
0
DECLHIDDEN(int) rtR0MemObjNativeMapKernel(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJ pMemToMap, void *pvFixed, size_t uAlignment,
                                          unsigned fProt, size_t offSub, size_t cbSub)
{
    /*
     * Maps (a sub-range of) an existing memory object into the kernel map
     * and wires it.
     *
     * Only "any address" requests (pvFixed == -1) and alignments up to one
     * page are handled; everything else yields VERR_NOT_SUPPORTED.  On
     * success *ppMem receives the new mapping object and VINF_SUCCESS is
     * returned; every later failure is reported as VERR_NO_MEMORY.
     */
    AssertMsgReturn(pvFixed == (void *)-1, ("%p\n", pvFixed), VERR_NOT_SUPPORTED);

    if (uAlignment > PAGE_SIZE)
        return VERR_NOT_SUPPORTED;

    int                rc;
    PRTR0MEMOBJFREEBSD pSrcFreeBSD = (PRTR0MEMOBJFREEBSD)pMemToMap;

    /* Translate the IPRT protection bits into their VM_PROT_* counterparts. */
    vm_prot_t fVmProt = ((fProt & RTMEM_PROT_NONE) == RTMEM_PROT_NONE) ? VM_PROT_NONE : 0;
    fVmProt |= ((fProt & RTMEM_PROT_READ)  == RTMEM_PROT_READ)  ? VM_PROT_READ    : 0;
    fVmProt |= ((fProt & RTMEM_PROT_WRITE) == RTMEM_PROT_WRITE) ? VM_PROT_WRITE   : 0;
    fVmProt |= ((fProt & RTMEM_PROT_EXEC)  == RTMEM_PROT_EXEC)  ? VM_PROT_EXECUTE : 0;

    /* The search for free space starts at the bottom of the kernel map. */
    vm_offset_t MapAddr = vm_map_min(kernel_map);

    /* A zero sub-size means "everything from offSub to the end of the object". */
    if (!cbSub)
        cbSub = pMemToMap->cb - offSub;

    /* Take an object reference on behalf of the mapping about to be created. */
    vm_object_reference(pSrcFreeBSD->pObject);
    rc = vm_map_find(kernel_map,            /* Map to insert the object in */
                     pSrcFreeBSD->pObject,  /* Object to map */
                     offSub,                /* Start offset in the object */
                     &MapAddr,              /* Start address IN/OUT */
                     cbSub,                 /* Size of the mapping */
#if __FreeBSD_version >= 1000055
                     0,                     /* Upper bound of mapping */
#endif
                     VMFS_ANY_SPACE,        /* Whether a suitable address should be searched for first */
                     fVmProt,               /* protection flags */
                     VM_PROT_ALL,           /* Maximum protection flags */
                     0);                    /* copy-on-write and similar flags */
    if (rc != KERN_SUCCESS)
    {
        /* Nothing was mapped, so hand the extra reference back ourselves. */
        vm_object_deallocate(pSrcFreeBSD->pObject);
        return VERR_NO_MEMORY;
    }

    rc = vm_map_wire(kernel_map, MapAddr, MapAddr + cbSub, VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
    AssertMsg(rc == KERN_SUCCESS, ("%#x\n", rc));

    /* Wrap the fresh mapping in an IPRT memory object. */
    PRTR0MEMOBJFREEBSD pNewFreeBSD = (PRTR0MEMOBJFREEBSD)rtR0MemObjNew(sizeof(RTR0MEMOBJFREEBSD),
                                                                       RTR0MEMOBJTYPE_MAPPING,
                                                                       (void *)MapAddr,
                                                                       cbSub);
    if (!pNewFreeBSD)
    {
        /* No tracking object available: undo the mapping again. */
        rc = vm_map_remove(kernel_map, MapAddr, MapAddr + cbSub);
        AssertMsg(rc == KERN_SUCCESS, ("Deleting mapping failed\n"));
        return VERR_NO_MEMORY;
    }

    Assert((vm_offset_t)pNewFreeBSD->Core.pv == MapAddr);
    pNewFreeBSD->Core.u.Mapping.R0Process = NIL_RTR0PROCESS;
    *ppMem = &pNewFreeBSD->Core;
    return VINF_SUCCESS;
}
Пример #12
0
DECLHIDDEN(int) rtR0MemObjNativeMapUser(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJ pMemToMap, RTR3PTR R3PtrFixed, size_t uAlignment,
                                        unsigned fProt, RTR0PROCESS R0Process)
{
    /*
     * Maps a whole memory object into the address space of the calling
     * process, wires it and marks it as shared on inheritance.
     *
     * Only the current process and alignments up to one page are handled;
     * everything else yields VERR_NOT_SUPPORTED.  On success *ppMem
     * receives the new mapping object and VINF_SUCCESS is returned; every
     * later failure is reported as VERR_NO_MEMORY.
     */
    AssertMsgReturn(R0Process == RTR0ProcHandleSelf(), ("%p != %p\n", R0Process, RTR0ProcHandleSelf()), VERR_NOT_SUPPORTED);
    if (uAlignment > PAGE_SIZE)
        return VERR_NOT_SUPPORTED;

    int                rc;
    PRTR0MEMOBJFREEBSD pSrcFreeBSD = (PRTR0MEMOBJFREEBSD)pMemToMap;
    struct proc       *pProc       = (struct proc *)R0Process;
    struct vm_map     *pUserMap    = &pProc->p_vmspace->vm_map;
    const int          fAnyAddress = (R3PtrFixed == (RTR3PTR)-1);

    /* Translate the IPRT protection bits into their VM_PROT_* counterparts. */
    vm_prot_t fVmProt = ((fProt & RTMEM_PROT_NONE) == RTMEM_PROT_NONE) ? VM_PROT_NONE : 0;
    fVmProt |= ((fProt & RTMEM_PROT_READ)  == RTMEM_PROT_READ)  ? VM_PROT_READ    : 0;
    fVmProt |= ((fProt & RTMEM_PROT_WRITE) == RTMEM_PROT_WRITE) ? VM_PROT_WRITE   : 0;
    fVmProt |= ((fProt & RTMEM_PROT_EXEC)  == RTMEM_PROT_EXEC)  ? VM_PROT_EXECUTE : 0;

    /*
     * Pick the address: either the caller's fixed one, or a search hint
     * just past the largest possible data segment of the process.
     */
    vm_offset_t UserAddr;
    if (!fAnyAddress)
        UserAddr = (vm_offset_t)R3PtrFixed;
    else
    {
        /** @todo: is taking the process lock needed here? */
        PROC_LOCK(pProc);
        UserAddr = round_page((vm_offset_t)pProc->p_vmspace->vm_daddr + lim_max(pProc, RLIMIT_DATA));
        PROC_UNLOCK(pProc);
    }

    /* Take an object reference on behalf of the mapping, then insert it. */
    vm_object_reference(pSrcFreeBSD->pObject);
    rc = vm_map_find(pUserMap,              /* Map to insert the object in */
                     pSrcFreeBSD->pObject,  /* Object to map */
                     0,                     /* Start offset in the object */
                     &UserAddr,             /* Start address IN/OUT */
                     pMemToMap->cb,         /* Size of the mapping */
#if __FreeBSD_version >= 1000055
                     0,                     /* Upper bound of the mapping */
#endif
                     fAnyAddress ? VMFS_ANY_SPACE : VMFS_NO_SPACE,
                                            /* Whether a suitable address should be searched for first */
                     fVmProt,               /* protection flags */
                     VM_PROT_ALL,           /* Maximum protection flags */
                     0);                    /* copy-on-write and similar flags */
    if (rc != KERN_SUCCESS)
    {
        /* Nothing was mapped, so hand the extra reference back ourselves. */
        vm_object_deallocate(pSrcFreeBSD->pObject);
        return VERR_NO_MEMORY;
    }

    rc = vm_map_wire(pUserMap, UserAddr, UserAddr + pMemToMap->cb, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
    AssertMsg(rc == KERN_SUCCESS, ("%#x\n", rc));

    rc = vm_map_inherit(pUserMap, UserAddr, UserAddr + pMemToMap->cb, VM_INHERIT_SHARE);
    AssertMsg(rc == KERN_SUCCESS, ("%#x\n", rc));

    /* Wrap the fresh mapping in an IPRT memory object. */
    PRTR0MEMOBJFREEBSD pNewFreeBSD = (PRTR0MEMOBJFREEBSD)rtR0MemObjNew(sizeof(RTR0MEMOBJFREEBSD),
                                                                       RTR0MEMOBJTYPE_MAPPING,
                                                                       (void *)UserAddr,
                                                                       pMemToMap->cb);
    if (!pNewFreeBSD)
    {
        /* No tracking object available: undo the mapping again. */
        rc = vm_map_remove(pUserMap, UserAddr, UserAddr + pMemToMap->cb);
        AssertMsg(rc == KERN_SUCCESS, ("Deleting mapping failed\n"));
        return VERR_NO_MEMORY;
    }

    Assert((vm_offset_t)pNewFreeBSD->Core.pv == UserAddr);
    pNewFreeBSD->Core.u.Mapping.R0Process = R0Process;
    *ppMem = &pNewFreeBSD->Core;
    return VINF_SUCCESS;
}
Пример #13
0
kern_return_t
kernel_memory_allocate(
	register vm_map_t	map,
	register vm_offset_t	*addrp,
	register vm_size_t	size,
	register vm_offset_t	mask,
	int			flags)
{
	vm_object_t 		object;
	vm_object_offset_t 	offset;
	vm_object_offset_t 	pg_offset;
	vm_map_entry_t 		entry;
	vm_map_offset_t 	map_addr, fill_start;
	vm_map_offset_t		map_mask;
	vm_map_size_t		map_size, fill_size;
	kern_return_t 		kr;
	vm_page_t		mem;
	vm_page_t		guard_page_list = NULL;
	vm_page_t		wired_page_list = NULL;
	int			guard_page_count = 0;
	int			wired_page_count = 0;
	int			i;
	int			vm_alloc_flags;

	if (! vm_kernel_ready) {
		panic("kernel_memory_allocate: VM is not ready");
	}

	if (size == 0) {
		*addrp = 0;
		return KERN_INVALID_ARGUMENT;
	}
	map_size = vm_map_round_page(size);
	map_mask = (vm_map_offset_t) mask;
	vm_alloc_flags = 0;


	/*
	 * limit the size of a single extent of wired memory
	 * to try and limit the damage to the system if
	 * too many pages get wired down
	 */
        if (map_size > (1 << 30)) {
                return KERN_RESOURCE_SHORTAGE;
        }

	/*
	 * Guard pages:
	 *
	 * Guard pages are implemented as ficticious pages.  By placing guard pages
	 * on either end of a stack, they can help detect cases where a thread walks
	 * off either end of its stack.  They are allocated and set up here and attempts
	 * to access those pages are trapped in vm_fault_page().
	 *
	 * The map_size we were passed may include extra space for
	 * guard pages.  If those were requested, then back it out of fill_size
	 * since vm_map_find_space() takes just the actual size not including
	 * guard pages.  Similarly, fill_start indicates where the actual pages
	 * will begin in the range.
	 */

	fill_start = 0;
	fill_size = map_size;

	if (flags & KMA_GUARD_FIRST) {
		vm_alloc_flags |= VM_FLAGS_GUARD_BEFORE;
		fill_start += PAGE_SIZE_64;
		fill_size -= PAGE_SIZE_64;
		if (map_size < fill_start + fill_size) {
			/* no space for a guard page */
			*addrp = 0;
			return KERN_INVALID_ARGUMENT;
		}
		guard_page_count++;
	}
	if (flags & KMA_GUARD_LAST) {
		vm_alloc_flags |= VM_FLAGS_GUARD_AFTER;
		fill_size -= PAGE_SIZE_64;
		if (map_size <= fill_start + fill_size) {
			/* no space for a guard page */
			*addrp = 0;
			return KERN_INVALID_ARGUMENT;
		}
		guard_page_count++;
	}
	wired_page_count = (int) (fill_size / PAGE_SIZE_64);
	assert(wired_page_count * PAGE_SIZE_64 == fill_size);

	for (i = 0; i < guard_page_count; i++) {
		for (;;) {
			mem = vm_page_grab_guard();

			if (mem != VM_PAGE_NULL)
				break;
			if (flags & KMA_NOPAGEWAIT) {
				kr = KERN_RESOURCE_SHORTAGE;
				goto out;
			}
			vm_page_more_fictitious();
		}
		mem->pageq.next = (queue_entry_t)guard_page_list;
		guard_page_list = mem;
	}

	for (i = 0; i < wired_page_count; i++) {
		uint64_t	unavailable;
		
		for (;;) {
		        if (flags & KMA_LOMEM)
			        mem = vm_page_grablo();
			else
			        mem = vm_page_grab();

		        if (mem != VM_PAGE_NULL)
			        break;

			if (flags & KMA_NOPAGEWAIT) {
				kr = KERN_RESOURCE_SHORTAGE;
				goto out;
			}
			if ((flags & KMA_LOMEM) && (vm_lopage_needed == TRUE)) {
				kr = KERN_RESOURCE_SHORTAGE;
				goto out;
			}
			unavailable = (vm_page_wire_count + vm_page_free_target) * PAGE_SIZE;

			if (unavailable > max_mem || map_size > (max_mem - unavailable)) {
				kr = KERN_RESOURCE_SHORTAGE;
				goto out;
			}
			VM_PAGE_WAIT();
		}
		mem->pageq.next = (queue_entry_t)wired_page_list;
		wired_page_list = mem;
	}

	/*
	 *	Allocate a new object (if necessary).  We must do this before
	 *	locking the map, or risk deadlock with the default pager.
	 */
	if ((flags & KMA_KOBJECT) != 0) {
		object = kernel_object;
		vm_object_reference(object);
	} else {
		object = vm_object_allocate(map_size);
	}

	kr = vm_map_find_space(map, &map_addr,
			       fill_size, map_mask,
			       vm_alloc_flags, &entry);
	if (KERN_SUCCESS != kr) {
		vm_object_deallocate(object);
		goto out;
	}

	entry->object.vm_object = object;
	entry->offset = offset = (object == kernel_object) ? 
		        map_addr : 0;

	entry->wired_count++;

	if (flags & KMA_PERMANENT)
		entry->permanent = TRUE;

	if (object != kernel_object)
		vm_object_reference(object);

	vm_object_lock(object);
	vm_map_unlock(map);

	pg_offset = 0;

	if (fill_start) {
		if (guard_page_list == NULL)
			panic("kernel_memory_allocate: guard_page_list == NULL");

		mem = guard_page_list;
		guard_page_list = (vm_page_t)mem->pageq.next;
		mem->pageq.next = NULL;

		vm_page_insert(mem, object, offset + pg_offset);

		mem->busy = FALSE;
		pg_offset += PAGE_SIZE_64;
	}
	for (pg_offset = fill_start; pg_offset < fill_start + fill_size; pg_offset += PAGE_SIZE_64) {
		if (wired_page_list == NULL)
			panic("kernel_memory_allocate: wired_page_list == NULL");

		mem = wired_page_list;
		wired_page_list = (vm_page_t)mem->pageq.next;
		mem->pageq.next = NULL;
		mem->wire_count++;

		vm_page_insert(mem, object, offset + pg_offset);

		mem->busy = FALSE;
		mem->pmapped = TRUE;
		mem->wpmapped = TRUE;

		PMAP_ENTER(kernel_pmap, map_addr + pg_offset, mem, 
			   VM_PROT_READ | VM_PROT_WRITE, object->wimg_bits & VM_WIMG_MASK, TRUE);

		if (flags & KMA_NOENCRYPT) {
			bzero(CAST_DOWN(void *, (map_addr + pg_offset)), PAGE_SIZE);

			pmap_set_noencrypt(mem->phys_page);
		}
	}
Пример #14
0
/*
 * Walk every entry of 'map' (recursing into submaps) and, for entries
 * ultimately backed by a vnode object, ask the MAC framework which of the
 * entry's maximum protections are still permitted for 'cred'.  Protections
 * that are no longer permitted are revoked from the entry and from the
 * pmap.
 *
 *	td	thread on whose behalf the revocation runs; only used for
 *		the pid printed in the diagnostic message
 *	cred	credential handed to mac_vnode_check_mmap_downgrade()
 *	map	map to scan; it is kept locked for the whole walk
 *
 * Does nothing when mac_mmap_revocation is zero.
 */
static void
mac_proc_vm_revoke_recurse(struct thread *td, struct ucred *cred,
    struct vm_map *map)
{
	vm_map_entry_t vme;
	int vfslocked, result;
	vm_prot_t revokeperms;
	vm_object_t backing_object, object;
	vm_ooffset_t offset;
	struct vnode *vp;
	struct mount *mp;

	if (!mac_mmap_revocation)
		return;

	vm_map_lock(map);
	for (vme = map->header.next; vme != &map->header; vme = vme->next) {
		/* Submaps are handled by recursion (with this map still locked). */
		if (vme->eflags & MAP_ENTRY_IS_SUB_MAP) {
			mac_proc_vm_revoke_recurse(td, cred,
			    vme->object.sub_map);
			continue;
		}
		/*
		 * Skip over entries that obviously are not shared.
		 */
		if (vme->eflags & (MAP_ENTRY_COW | MAP_ENTRY_NOSYNC) ||
		    !vme->max_protection)
			continue;
		/*
		 * Drill down to the deepest backing object.
		 */
		offset = vme->offset;
		object = vme->object.vm_object;
		if (object == NULL)
			continue;
		/*
		 * Hand-over-hand: the next object in the chain is locked
		 * before the current one is released, and 'offset' is
		 * translated into the backing object's space at each step.
		 */
		VM_OBJECT_LOCK(object);
		while ((backing_object = object->backing_object) != NULL) {
			VM_OBJECT_LOCK(backing_object);
			offset += object->backing_object_offset;
			VM_OBJECT_UNLOCK(object);
			object = backing_object;
		}
		VM_OBJECT_UNLOCK(object);
		/*
		 * At the moment, vm_maps and objects aren't considered by
		 * the MAC system, so only things with backing by a normal
		 * object (read: vnodes) are checked.
		 */
		if (object->type != OBJT_VNODE)
			continue;
		vp = (struct vnode *)object->handle;
		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		/*
		 * 'result' starts out as the current maximum protection and
		 * is passed by reference to the MAC check, which may update
		 * it; the bits missing afterwards are what gets revoked.
		 */
		result = vme->max_protection;
		mac_vnode_check_mmap_downgrade(cred, vp, &result);
		VOP_UNLOCK(vp, 0);
		/*
		 * Find out what maximum protection we may be allowing now
		 * but a policy needs to get removed.
		 */
		revokeperms = vme->max_protection & ~result;
		if (!revokeperms) {
			VFS_UNLOCK_GIANT(vfslocked);
			continue;
		}
		printf("pid %ld: revoking %s perms from %#lx:%ld "
		    "(max %s/cur %s)\n", (long)td->td_proc->p_pid,
		    prot2str(revokeperms), (u_long)vme->start,
		    (long)(vme->end - vme->start),
		    prot2str(vme->max_protection), prot2str(vme->protection));
		/*
		 * This is the really simple case: if a map has more
		 * max_protection than is allowed, but it's not being
		 * actually used (that is, the current protection is still
		 * allowed), we can just wipe it out and do nothing more.
		 */
		if ((vme->protection & revokeperms) == 0) {
			/*
			 * revokeperms is a subset of max_protection (see its
			 * computation above), so the subtraction clears
			 * exactly those bits.
			 */
			vme->max_protection -= revokeperms;
		} else {
			if (revokeperms & VM_PROT_WRITE) {
				/*
				 * In the more complicated case, flush out all
				 * pending changes to the object then turn it
				 * copy-on-write.
				 */
				/* Hold a reference on the object across the flush. */
				vm_object_reference(object);
				(void) vn_start_write(vp, &mp, V_WAIT);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VM_OBJECT_LOCK(object);
				vm_object_page_clean(object, offset, offset +
				    vme->end - vme->start, OBJPC_SYNC);
				VM_OBJECT_UNLOCK(object);
				VOP_UNLOCK(vp, 0);
				vn_finished_write(mp);
				vm_object_deallocate(object);
				/*
				 * Why bother if there's no read permissions
				 * anymore?  For the rest, we need to leave
				 * the write permissions on for COW, or
				 * remove them entirely if configured to.
				 */
				/*
				 * NOTE(review): the second 'if' below follows
				 * the closing brace without an 'else', so it
				 * runs regardless of
				 * mac_mmap_revocation_via_cow.  Whether an
				 * 'else' was intended cannot be told from
				 * this function alone -- confirm before
				 * changing.
				 */
				if (!mac_mmap_revocation_via_cow) {
					vme->max_protection &= ~VM_PROT_WRITE;
					vme->protection &= ~VM_PROT_WRITE;
				} if ((revokeperms & VM_PROT_READ) == 0)
					vme->eflags |= MAP_ENTRY_COW |
					    MAP_ENTRY_NEEDS_COPY;
			}
			if (revokeperms & VM_PROT_EXECUTE) {
				vme->max_protection &= ~VM_PROT_EXECUTE;
				vme->protection &= ~VM_PROT_EXECUTE;
			}
			/* Losing read access removes every permission. */
			if (revokeperms & VM_PROT_READ) {
				vme->max_protection = 0;
				vme->protection = 0;
			}
			/* Push the reduced protection down into the pmap. */
			pmap_protect(map->pmap, vme->start, vme->end,
			    vme->protection & ~revokeperms);
			vm_map_simplify_entry(map, vme);
		}
		VFS_UNLOCK_GIANT(vfslocked);
	}
	vm_map_unlock(map);
}
Пример #15
0
/*
 *	projected_buffer_allocate:
 *
 *	Allocate a zero-filled buffer of wired-down memory that is mapped
 *	both in the kernel map and in the given (non-kernel) map.
 *
 *	On success the kernel address is stored in *kernel_p, the address
 *	in "map" in *user_p, and KERN_SUCCESS is returned.  If "persistence"
 *	is zero the kernel entry is marked so that it is deallocated
 *	automatically when the last corresponding user entry goes away.
 *
 *	Returns KERN_INVALID_ARGUMENT for a null or kernel map, or the
 *	error from vm_map_find_entry when no space is available.
 */
kern_return_t
projected_buffer_allocate(
	vm_map_t 	map,
	vm_size_t 	size,
	int 		persistence,
	vm_offset_t 	*kernel_p,
	vm_offset_t 	*user_p,
	vm_prot_t 	protection,
	vm_inherit_t 	inheritance)	/* Currently only VM_INHERIT_NONE supported */
{
	vm_map_entry_t kern_entry, user_entry;
	vm_object_t buf_object;
	vm_offset_t va;
	vm_offset_t pa;
	vm_size_t off;
	kern_return_t rc;

	if (map == VM_MAP_NULL)
		return(KERN_INVALID_ARGUMENT);
	if (map == kernel_map)
		return(KERN_INVALID_ARGUMENT);

	/*
	 *	Create the object that backs both mappings.
	 */
	size = round_page(size);
	buf_object = vm_object_allocate(size);

	/*
	 *	Kernel side: find room in the kernel map and attach the
	 *	object to the new entry.
	 */
	vm_map_lock(kernel_map);
	rc = vm_map_find_entry(kernel_map, &va, size, (vm_offset_t) 0,
			       VM_OBJECT_NULL, &kern_entry);
	if (rc != KERN_SUCCESS) {
		vm_map_unlock(kernel_map);
		vm_object_deallocate(buf_object);
		return rc;
	}

	kern_entry->object.vm_object = buf_object;
	if (!persistence) {
		/*
		 *	Mark the entry so that it is deallocated
		 *	automatically once the last corresponding user
		 *	entry has been deallocated.
		 */
		kern_entry->projected_on = (vm_map_entry_t) -1;
	}
	vm_map_unlock(kernel_map);
	*kernel_p = va;

	/*
	 *	User side: find room in the target map; on failure the
	 *	kernel entry created above is torn down again.
	 */
	vm_map_lock(map);
	rc = vm_map_find_entry(map, &va, size, (vm_offset_t) 0,
			       VM_OBJECT_NULL, &user_entry);
	if (rc != KERN_SUCCESS) {
		vm_map_unlock(map);
		vm_map_lock(kernel_map);
		vm_map_entry_delete(kernel_map, kern_entry);
		vm_map_unlock(kernel_map);
		vm_object_deallocate(buf_object);
		return rc;
	}

	user_entry->object.vm_object = buf_object;
	vm_object_reference(buf_object);
	/*
	 *	Couple the user entry to the kernel mapping of the buffer;
	 *	this also guarantees that the user cannot directly
	 *	manipulate the buffer's VM entry.
	 */
	user_entry->projected_on = kern_entry;
	user_entry->protection = protection;
	user_entry->max_protection = protection;
	user_entry->inheritance = inheritance;
	vm_map_unlock(map);
	*user_p = va;

	/*
	 *	Allocate wired-down memory in the object, enter it in the
	 *	kernel pmap and zero-fill it through the kernel mapping.
	 */
	kmem_alloc_pages(buf_object, 0,
			 *kernel_p, *kernel_p + size,
			 VM_PROT_READ | VM_PROT_WRITE);
	memset((void*) *kernel_p, 0, size);

	/*
	 *	Mirror the kernel's physical mappings, page by page, in
	 *	the user pmap.
	 */
	pmap_pageable(map->pmap, *user_p, *user_p + size, FALSE);
	off = 0;
	while (off < size) {
		pa = pmap_extract(kernel_pmap, *kernel_p + off);
		pmap_enter(map->pmap, *user_p + off, pa,
			   protection, TRUE);
		off += PAGE_SIZE;
	}

	return(KERN_SUCCESS);
}
Пример #16
0
/*
 *	kmem_alloc_aligned:
 *
 *	Allocate wired-down memory in the given map, backed by the
 *	kernel object, at an address aligned to the (rounded) size.
 *	"size" must be a power of two, otherwise the kernel panics.
 *	The memory is returned not zeroed.
 *
 *	If the map has no room, slab_collect() is run once and the
 *	search is repeated before giving up with the error from
 *	vm_map_find_entry.
 */
kern_return_t
kmem_alloc_aligned(
	vm_map_t 	map,
	vm_offset_t 	*addrp,
	vm_size_t 	size)
{
	vm_map_entry_t new_entry;
	vm_offset_t obj_offset;
	vm_offset_t va;
	unsigned int tries;
	kern_return_t rc;

	if (size & (size - 1))
		panic("kmem_alloc_aligned");

	/*
	 *	Wired-down kernel pages live in the kernel object.  No
	 *	region of the kernel object is assumed to be referenced
	 *	more than once, and vm_map_find_entry should extend an
	 *	existing entry when it can.
	 */

	size = round_page(size);

	/*
	 *	On success the loop is left with the map still locked.
	 */
	for (tries = 0; ; tries++) {
		vm_map_lock(map);
		rc = vm_map_find_entry(map, &va, size, size - 1,
				       kernel_object, &new_entry);
		if (rc == KERN_SUCCESS)
			break;
		vm_map_unlock(map);

		if (tries != 0) {
			printf_once("no more rooom for kmem_alloc_aligned in %p\n", map);
			return rc;
		}

		/* First failure: reclaim memory and search once more. */
		slab_collect();
	}

	/*
	 *	The start of the new region was unknown beforehand, so the
	 *	offset into the kernel object could not be supplied to
	 *	vm_map_find_entry.  The entry is initialized only when an
	 *	existing entry was not extended.
	 */

	obj_offset = va - VM_MIN_KERNEL_ADDRESS;

	if (new_entry->object.vm_object == VM_OBJECT_NULL) {
		vm_object_reference(kernel_object);

		new_entry->object.vm_object = kernel_object;
		new_entry->offset = obj_offset;
	}

	/*
	 *	The address has not been handed out yet, so dropping the
	 *	map lock here is safe.
	 */
	vm_map_unlock(map);

	/*
	 *	Back the entry with wired-down pages of the kernel object
	 *	and enter them in the kernel pmap.
	 */
	kmem_alloc_pages(kernel_object, obj_offset,
			 va, va + size,
			 VM_PROT_DEFAULT);

	/*
	 *	Hand the memory back, not zeroed.
	 */
	*addrp = va;
	return KERN_SUCCESS;
}
Пример #17
0
/*
 *	kmem_realloc:
 *
 *	Reallocate wired-down memory in the kernel's address map or a
 *	submap.  Pages added by the reallocation are not zeroed.  Only
 *	regions obtained from kmem_alloc may be passed in.
 *
 *	On success the pages of the old region end up mapped twice and
 *	the old region itself is left unchanged; release it with
 *	kmem_free.  The new address is stored in *newaddrp.
 *
 *	If the map has no room, slab_collect() is run once and the
 *	search is repeated before giving up with the error from
 *	vm_map_find_entry.
 */
kern_return_t kmem_realloc(
	vm_map_t 	map,
	vm_offset_t 	oldaddr,
	vm_size_t 	oldsize,
	vm_offset_t 	*newaddrp,
	vm_size_t 	newsize)
{
	vm_map_entry_t old_ent, new_ent;
	vm_offset_t old_start, old_end;
	vm_offset_t new_va;
	vm_object_t backing;
	unsigned int tries;
	kern_return_t rc;

	/*
	 *	Work on page boundaries throughout.
	 */

	old_start = trunc_page(oldaddr);
	old_end = round_page(oldaddr + oldsize);
	oldsize = old_end - old_start;
	newsize = round_page(newsize);

	/*
	 *	Find space for the new region.  On success the loop is
	 *	left with the map still locked.
	 */

	for (tries = 0; ; tries++) {
		vm_map_lock(map);
		rc = vm_map_find_entry(map, &new_va, newsize, (vm_offset_t) 0,
				       VM_OBJECT_NULL, &new_ent);
		if (rc == KERN_SUCCESS)
			break;
		vm_map_unlock(map);

		if (tries != 0) {
			printf_once("no more room for kmem_realloc in %p\n", map);
			return rc;
		}

		/* First failure: reclaim memory and search once more. */
		slab_collect();
	}

	/*
	 *	Locate the VM object behind the old region.
	 */

	if (!vm_map_lookup_entry(map, old_start, &old_ent))
		panic("kmem_realloc");
	backing = old_ent->object.vm_object;

	/*
	 *	Grow the object to the new size; the new entry gets its
	 *	own reference to it and maps it from offset 0.
	 */

	vm_object_reference(backing);
	vm_object_lock(backing);
	if (backing->size != oldsize)
		panic("kmem_realloc");
	backing->size = newsize;
	vm_object_unlock(backing);

	new_ent->object.vm_object = backing;
	new_ent->offset = 0;

	/*
	 *	The new address has not been handed out yet, so dropping
	 *	the map lock is safe.  We trust that nobody will play
	 *	with either region.
	 */

	vm_map_unlock(map);

	/*
	 *	Map the pages of the old region a second time at the
	 *	start of the new region, then allocate fresh pages for
	 *	the remainder.
	 */

	kmem_remap_pages(backing, 0,
			 new_va, new_va + oldsize,
			 VM_PROT_DEFAULT);
	kmem_alloc_pages(backing, oldsize,
			 new_va + oldsize, new_va + newsize,
			 VM_PROT_DEFAULT);

	*newaddrp = new_va;
	return KERN_SUCCESS;
}
Пример #18
0
/*
 *	projected_buffer_map:
 *
 *	Map an existing kernel buffer, starting at kernel_addr and
 *	"size" bytes long (rounded up to whole pages), into the given
 *	(non-kernel) map.  The address chosen in "map" is stored in
 *	*user_p.
 *
 *	Returns KERN_INVALID_ARGUMENT if the map is null or the kernel
 *	map, if kernel_addr is not covered by a kernel map entry, or if
 *	the range extends past the end of that entry; otherwise the
 *	result of vm_map_find_entry, or KERN_SUCCESS.
 */
kern_return_t
projected_buffer_map(
	vm_map_t 	map,
	vm_offset_t 	kernel_addr,
	vm_size_t 	size,
	vm_offset_t 	*user_p,
	vm_prot_t 	protection,
	vm_inherit_t 	inheritance)	/* Currently only VM_INHERIT_NONE supported */
{
	vm_map_entry_t kern_entry, user_entry;
	vm_offset_t pa, uva;
	vm_size_t off;
	kern_return_t rc;

	/*
	 *	Validate the arguments and find the kernel map entry
	 *	covering the buffer.
	 */

	size = round_page(size);
	if (map == VM_MAP_NULL)
		return(KERN_INVALID_ARGUMENT);
	if (map == kernel_map)
		return(KERN_INVALID_ARGUMENT);
	if (!vm_map_lookup_entry(kernel_map, kernel_addr, &kern_entry))
		return(KERN_INVALID_ARGUMENT);
	if (kernel_addr + size > kern_entry->vme_end)
		return(KERN_INVALID_ARGUMENT);

	/*
	 *	Create the entry in the user task's map.
	 */

	vm_map_lock(map);
	rc = vm_map_find_entry(map, &uva, size, (vm_offset_t) 0,
			       VM_OBJECT_NULL, &user_entry);
	if (rc != KERN_SUCCESS) {
		vm_map_unlock(map);
		return rc;
	}

	/*
	 *	Share the kernel entry's object, starting at the offset
	 *	within it that corresponds to kernel_addr.
	 */
	user_entry->object.vm_object = kern_entry->object.vm_object;
	vm_object_reference(kern_entry->object.vm_object);
	user_entry->offset = kernel_addr - kern_entry->vme_start + kern_entry->offset;
	/*
	 *	Couple the user entry to the kernel mapping of the buffer;
	 *	this also guarantees that the user cannot directly
	 *	manipulate the buffer's VM entry.
	 */
	user_entry->projected_on = kern_entry;
	user_entry->protection = protection;
	user_entry->max_protection = protection;
	user_entry->inheritance = inheritance;
	user_entry->wired_count = kern_entry->wired_count;
	vm_map_unlock(map);
	*user_p = uva;

	/*
	 *	Mirror the kernel's physical mappings, page by page, in
	 *	the user pmap.  The wiring follows the kernel entry.
	 */

	pmap_pageable(map->pmap, uva, uva + size,
		      !kern_entry->wired_count);
	off = 0;
	while (off < size) {
		pa = pmap_extract(kernel_pmap, kernel_addr + off);
		pmap_enter(map->pmap, uva + off, pa,
			   protection, kern_entry->wired_count);
		off += PAGE_SIZE;
	}

	return(KERN_SUCCESS);
}
Пример #19
0
int
proc_rwmem(struct proc *p, struct uio *uio)
{
	struct vmspace *vm;
	vm_map_t map;
	vm_object_t object = NULL;
	vm_offset_t pageno = 0;		/* page number */
	vm_prot_t reqprot;
	vm_offset_t kva;
	int error, writing;

	GIANT_REQUIRED;

	/*
	 * if the vmspace is in the midst of being deallocated or the
	 * process is exiting, don't try to grab anything.  The page table
	 * usage in that process can be messed up.
	 */
	vm = p->p_vmspace;
	if ((p->p_flag & P_WEXIT))
		return (EFAULT);
	if (vm->vm_refcnt < 1)
		return (EFAULT);
	++vm->vm_refcnt;
	/*
	 * The map we want...
	 */
	map = &vm->vm_map;

	writing = uio->uio_rw == UIO_WRITE;
	reqprot = writing ? (VM_PROT_WRITE | VM_PROT_OVERRIDE_WRITE) :
	    VM_PROT_READ;

	kva = kmem_alloc_pageable(kernel_map, PAGE_SIZE);

	/*
	 * Only map in one page at a time.  We don't have to, but it
	 * makes things easier.  This way is trivial - right?
	 */
	do {
		vm_map_t tmap;
		vm_offset_t uva;
		int page_offset;		/* offset into page */
		vm_map_entry_t out_entry;
		vm_prot_t out_prot;
		boolean_t wired;
		vm_pindex_t pindex;
		u_int len;
		vm_page_t m;

		object = NULL;

		uva = (vm_offset_t)uio->uio_offset;

		/*
		 * Get the page number of this segment.
		 */
		pageno = trunc_page(uva);
		page_offset = uva - pageno;

		/*
		 * How many bytes to copy
		 */
		len = min(PAGE_SIZE - page_offset, uio->uio_resid);

		/*
		 * Fault the page on behalf of the process
		 */
		error = vm_fault(map, pageno, reqprot, VM_FAULT_NORMAL);
		if (error) {
			error = EFAULT;
			break;
		}

		/*
		 * Now we need to get the page.  out_entry, out_prot, wired,
		 * and single_use aren't used.  One would think the vm code
		 * would be a *bit* nicer...  We use tmap because
		 * vm_map_lookup() can change the map argument.
		 */
		tmap = map;
		error = vm_map_lookup(&tmap, pageno, reqprot, &out_entry,
		    &object, &pindex, &out_prot, &wired);

		if (error) {
			error = EFAULT;

			/*
			 * Make sure that there is no residue in 'object' from
			 * an error return on vm_map_lookup.
			 */
			object = NULL;

			break;
		}

		m = vm_page_lookup(object, pindex);

		/* Allow fallback to backing objects if we are reading */

		while (m == NULL && !writing && object->backing_object) {

			pindex += OFF_TO_IDX(object->backing_object_offset);
			object = object->backing_object;
			
			m = vm_page_lookup(object, pindex);
		}

		if (m == NULL) {
			error = EFAULT;

			/*
			 * Make sure that there is no residue in 'object' from
			 * an error return on vm_map_lookup.
			 */
			object = NULL;

			vm_map_lookup_done(tmap, out_entry);

			break;
		}

		/*
		 * Wire the page into memory
		 */
		vm_page_lock_queues();
		vm_page_wire(m);
		vm_page_unlock_queues();

		/*
		 * We're done with tmap now.
		 * But reference the object first, so that we won't loose
		 * it.
		 */
		vm_object_reference(object);
		vm_map_lookup_done(tmap, out_entry);

		pmap_qenter(kva, &m, 1);

		/*
		 * Now do the i/o move.
		 */
		error = uiomove((caddr_t)(kva + page_offset), len, uio);

		pmap_qremove(kva, 1);

		/*
		 * release the page and the object
		 */
		vm_page_lock_queues();
		vm_page_unwire(m, 1);
		vm_page_unlock_queues();
		vm_object_deallocate(object);

		object = NULL;

	} while (error == 0 && uio->uio_resid > 0);

	if (object)
		vm_object_deallocate(object);

	kmem_free(kernel_map, kva, PAGE_SIZE);
	vmspace_free(vm);
	return (error);
}