int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len, vm_offset_t *offset,
    struct vm_object **object)
{
    int i;
    size_t seg_len;
    vm_paddr_t seg_gpa;
    vm_object_t seg_obj;

    for (i = 0; i < vm->num_mem_segs; i++) {
        if ((seg_obj = vm->mem_segs[i].object) == NULL)
            continue;

        seg_gpa = vm->mem_segs[i].gpa;
        seg_len = vm->mem_segs[i].len;

        if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
            *offset = gpa - seg_gpa;
            *object = seg_obj;
            vm_object_reference(seg_obj);
            return (0);
        }
    }

    return (EINVAL);
}
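/*
 * Illustrative sketch (not part of the original source; the helper name is
 * hypothetical): a typical caller of vm_get_memobj() must balance the
 * reference the lookup takes on the segment's object with a matching
 * vm_object_deallocate() once it is done with the object.
 */
static int
example_with_memobj(struct vm *vm, vm_paddr_t gpa)
{
    vm_offset_t offset;
    vm_object_t object;
    int error;

    error = vm_get_memobj(vm, gpa, PAGE_SIZE, &offset, &object);
    if (error != 0)
        return (error);

    /* ... use 'object' at 'offset' ... */

    vm_object_deallocate(object);   /* drop the reference taken above */
    return (0);
}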
/*
 * MPSAFE
 */
static vm_object_t
phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
    vm_ooffset_t foff)
{
    vm_object_t object;

    /*
     * Offset should be page aligned.
     */
    if (foff & PAGE_MASK)
        return (NULL);

    size = round_page(size);

    if (handle != NULL) {
        mtx_lock(&Giant);
        /*
         * Lock to prevent object creation race condition.
         */
        while (phys_pager_alloc_lock) {
            phys_pager_alloc_lock = -1;
            tsleep(&phys_pager_alloc_lock, PVM, "swpalc", 0);
        }
        phys_pager_alloc_lock = 1;

        /*
         * Look up pager, creating as necessary.
         */
        object = vm_pager_object_lookup(&phys_pager_object_list, handle);
        if (object == NULL) {
            /*
             * Allocate object and associate it with the pager.
             */
            object = vm_object_allocate(OBJT_PHYS,
                OFF_TO_IDX(foff + size));
            object->handle = handle;
            mtx_lock(&phys_pager_mtx);
            TAILQ_INSERT_TAIL(&phys_pager_object_list, object,
                pager_object_list);
            mtx_unlock(&phys_pager_mtx);
        } else {
            /*
             * Gain a reference to the object.
             */
            vm_object_reference(object);
            if (OFF_TO_IDX(foff + size) > object->size)
                object->size = OFF_TO_IDX(foff + size);
        }
        if (phys_pager_alloc_lock == -1)
            wakeup(&phys_pager_alloc_lock);
        phys_pager_alloc_lock = 0;
        mtx_unlock(&Giant);
    } else {
        object = vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(foff + size));
    }

    return (object);
}
/*
 * mmap() helper to validate mmap() requests against shm object state
 * and give mmap() the vm_object to use for the mapping.
 */
int
shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff,
    vm_object_t *obj)
{

    /*
     * XXXRW: This validation is probably insufficient, and subject to
     * sign errors.  It should be fixed.
     */
    if (foff >= shmfd->shm_size ||
        foff + objsize > round_page(shmfd->shm_size))
        return (EINVAL);

    mtx_lock(&shm_timestamp_lock);
    vfs_timestamp(&shmfd->shm_atime);
    mtx_unlock(&shm_timestamp_lock);
    vm_object_reference(shmfd->shm_object);
    *obj = shmfd->shm_object;
    return (0);
}
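/*
 * Illustrative sketch (an assumption, not from the original source): one way
 * to address the sign/overflow concern raised in the XXXRW comment above is
 * to reject negative offsets and guard the addition explicitly before doing
 * the range comparison.
 */
static int
example_shm_range_ok(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff)
{

    if (foff < 0 || (vm_ooffset_t)objsize > OFF_MAX - foff)
        return (0);     /* negative offset, or foff + objsize overflows */
    if (foff >= shmfd->shm_size ||
        foff + objsize > round_page(shmfd->shm_size))
        return (0);
    return (1);
}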
void
kmem_submap(
    vm_map_t    map,
    vm_map_t    parent,
    vm_offset_t *min,
    vm_offset_t *max,
    vm_size_t   size,
    boolean_t   pageable)
{
    vm_offset_t addr;
    kern_return_t kr;

    size = round_page(size);

    /*
     * Need reference on submap object because it is internal
     * to the vm_system.  vm_object_enter will never be called
     * on it (usual source of reference for vm_map_enter).
     */
    vm_object_reference(vm_submap_object);

    addr = vm_map_min(parent);
    kr = vm_map_enter(parent, &addr, size,
        (vm_offset_t) 0, TRUE,
        vm_submap_object, (vm_offset_t) 0, FALSE,
        VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
    if (kr != KERN_SUCCESS)
        panic("kmem_submap");

    pmap_reference(vm_map_pmap(parent));
    vm_map_setup(map, vm_map_pmap(parent), addr, addr + size, pageable);
    kr = vm_map_submap(parent, addr, addr + size, map);
    if (kr != KERN_SUCCESS)
        panic("kmem_submap");

    *min = addr;
    *max = addr + size;
}
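/*
 * Illustrative sketch (hypothetical, not from the original source; the map
 * name and size are assumptions): carving a 4MB pageable submap out of the
 * kernel map at boot time, in the style the routine above expects.
 */
struct vm_map example_submap_store;
vm_offset_t example_submap_min, example_submap_max;

void
example_submap_init(void)
{
    kmem_submap(&example_submap_store, kernel_map, &example_submap_min,
        &example_submap_max, 4 * 1024 * 1024, TRUE);
}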
/**
 * Get the VM object representing a given memory mapping of the compositor.
 * This gets or allocates a CFB pool corresponding to the FD being used to
 * perform the user's mmap() call. If a new FD is mmap()ped, a new CFB pool
 * is allocated and returned. If the same FD is mmap()ped again, the same CFB
 * pool is returned. Each vm_object corresponds directly with a CFB pool.
 *
 * offset is a guaranteed-page-aligned offset into the FD requested by the
 * user in their call to mmap(). We may modify it.
 * size is a guaranteed-page-rounded size for the mapping as requested by the
 * user in their call to mmap().
 */
static int
cheri_compositor_cfb_mmap_single(struct cdev *dev, vm_ooffset_t *offset,
    vm_size_t size, struct vm_object **obj_res, int nprot)
{
    struct cheri_compositor_softc *sc;
    struct cfb_vm_object *cfb_vm_obj;
    struct vm_object *vm_obj = NULL;
    struct file *cdev_fd;
    struct compositor_cfb_pool *cfb_pool;
    int error;

    sc = dev->si_drv1;
    error = 0;

    CHERI_COMPOSITOR_DEBUG(sc, "dev: %p, offset: %lu, size: %lu, nprot: %i",
        dev, *offset, size, nprot);

    cdev_fd = curthread->td_fpop;
    KASSERT(cdev_fd != NULL, ("mmap_single td_fpop == NULL"));

    CHERI_COMPOSITOR_DEBUG(sc, "cdev_fd: %p", cdev_fd);

    /*
     * Allocate a CFB VM object to associate the cdev with the CFB pool
     * mapping. Note: The ordering here is fairly sensitive to changes, as
     * the cdev_pager_allocate() call results in sub-calls to
     * cheri_compositor_cfb_pg_fault(), which assumes various fields in the
     * CFB VM object have been initialised.
     *
     * The CFB VM object gets destroyed in cheri_compositor_cfb_pg_dtor().
     */
    cfb_vm_obj = malloc(sizeof(*cfb_vm_obj), M_CHERI_COMPOSITOR,
        M_WAITOK | M_ZERO);

    CHERI_COMPOSITOR_LOCK(sc);

    /* Find/Allocate a pool mapping for this FD. */
    if (dup_or_allocate_cfb_pool_for_cdev_fd(sc, cdev_fd,
        NULL /* set later */, &cfb_pool) != 0) {
        free(cfb_vm_obj, M_CHERI_COMPOSITOR);
        error = ENOMEM;
        goto done;
    }

    /*
     * Update the CFB VM object with the pool mapping and cdev. These have
     * both been referenced, and the references are transferred to the CFB
     * VM object.
     */
    cfb_vm_obj->dev = dev;
    cfb_vm_obj->pool = cfb_pool;

    /* If a pool had already been allocated for this FD, re-use it. */
    if (cfb_pool->vm_obj != NULL) {
        vm_object_reference(cfb_pool->vm_obj);
        vm_obj = cfb_pool->vm_obj;
        goto done;
    }

    /* Allocate a device pager VM object. */
    vm_obj = cdev_pager_allocate(cfb_vm_obj, OBJT_DEVICE,
        &cheri_compositor_cfb_pager_ops, size, nprot, *offset,
        curthread->td_ucred);
    if (vm_obj == NULL) {
        CHERI_COMPOSITOR_UNLOCK(sc);
        cheri_compositor_cfb_pg_dtor(cfb_vm_obj);
        error = EINVAL;
        goto done_unlocked;
    }

    /*
     * Update the CFB pool to store the VM object. Transfer the reference
     * from allocation.
     */
    cfb_pool->vm_obj = vm_obj;

done:
    CHERI_COMPOSITOR_UNLOCK(sc);
done_unlocked:
    CHERI_COMPOSITOR_DEBUG(sc,
        "Finished with vm_obj: %p, cfb_pool: %p (retval: %u).",
        vm_obj, cfb_pool, error);

    *obj_res = vm_obj;

    /*
     * Don't need to modify the offset. It was originally passed by the
     * user as an offset from the start of the cdev FD. Since the cdev FD
     * maps directly to a CFB pool/VM object, the offset becomes an offset
     * from the start of the CFB pool/VM object.
     */

    return (error);
}
kern_return_t
kmem_alloc_contig(
    vm_map_t    map,
    vm_offset_t *addrp,
    vm_size_t   size,
    vm_offset_t mask,
    ppnum_t     max_pnum,
    ppnum_t     pnum_mask,
    int         flags)
{
    vm_object_t         object;
    vm_object_offset_t  offset;
    vm_map_offset_t     map_addr;
    vm_map_offset_t     map_mask;
    vm_map_size_t       map_size, i;
    vm_map_entry_t      entry;
    vm_page_t           m, pages;
    kern_return_t       kr;

    if (map == VM_MAP_NULL ||
        (flags & ~(KMA_KOBJECT | KMA_LOMEM | KMA_NOPAGEWAIT)))
        return KERN_INVALID_ARGUMENT;

    if (size == 0) {
        *addrp = 0;
        return KERN_INVALID_ARGUMENT;
    }

    map_size = vm_map_round_page(size);
    map_mask = (vm_map_offset_t)mask;

    /*
     * Allocate a new object (if necessary) and the reference we
     * will be donating to the map entry.  We must do this before
     * locking the map, or risk deadlock with the default pager.
     */
    if ((flags & KMA_KOBJECT) != 0) {
        object = kernel_object;
        vm_object_reference(object);
    } else {
        object = vm_object_allocate(map_size);
    }

    kr = vm_map_find_space(map, &map_addr, map_size, map_mask, 0, &entry);
    if (KERN_SUCCESS != kr) {
        vm_object_deallocate(object);
        return kr;
    }

    entry->object.vm_object = object;
    entry->offset = offset = (object == kernel_object) ? map_addr : 0;

    /* Take an extra object ref in case the map entry gets deleted */
    vm_object_reference(object);
    vm_map_unlock(map);

    kr = cpm_allocate(CAST_DOWN(vm_size_t, map_size), &pages, max_pnum,
        pnum_mask, FALSE, flags);

    if (kr != KERN_SUCCESS) {
        vm_map_remove(map, vm_map_trunc_page(map_addr),
            vm_map_round_page(map_addr + map_size), 0);
        vm_object_deallocate(object);
        *addrp = 0;
        return kr;
    }

    vm_object_lock(object);
    for (i = 0; i < map_size; i += PAGE_SIZE) {
        m = pages;
        pages = NEXT_PAGE(m);
        *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
        m->busy = FALSE;
        vm_page_insert(m, object, offset + i);
    }
    vm_object_unlock(object);

    if ((kr = vm_map_wire(map, vm_map_trunc_page(map_addr),
        vm_map_round_page(map_addr + map_size),
        VM_PROT_DEFAULT, FALSE)) != KERN_SUCCESS) {
        if (object == kernel_object) {
            vm_object_lock(object);
            vm_object_page_remove(object, offset, offset + map_size);
            vm_object_unlock(object);
        }
        vm_map_remove(map, vm_map_trunc_page(map_addr),
            vm_map_round_page(map_addr + map_size), 0);
        vm_object_deallocate(object);
        return kr;
    }
    vm_object_deallocate(object);

    if (object == kernel_object)
        vm_map_simplify(map, map_addr);

    *addrp = (vm_offset_t) map_addr;
    assert((vm_map_offset_t) *addrp == map_addr);
    return KERN_SUCCESS;
}
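/*
 * Illustrative sketch (hypothetical caller, not from the original source):
 * requesting a physically contiguous buffer whose pages all lie below 4GB,
 * e.g. for a DMA engine with 32-bit addressing.  'max_pnum' bounds the
 * highest acceptable physical page number.
 */
kern_return_t
example_alloc_dma_buffer(vm_offset_t *addrp, vm_size_t size)
{
    ppnum_t max_pnum = (ppnum_t)((1ULL << 32) / PAGE_SIZE);

    return kmem_alloc_contig(kernel_map, addrp, size, 0 /* mask */,
        max_pnum, 0 /* pnum_mask */, 0 /* flags */);
}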
/*
 * In-kernel implementation of execve().  All arguments are assumed to be
 * userspace pointers from the passed thread.
 */
static int
do_execve(struct thread *td, struct image_args *args, struct mac *mac_p)
{
    struct proc *p = td->td_proc;
    struct nameidata nd;
    struct ucred *oldcred;
    struct uidinfo *euip = NULL;
    register_t *stack_base;
    int error, i;
    struct image_params image_params, *imgp;
    struct vattr attr;
    int (*img_first)(struct image_params *);
    struct pargs *oldargs = NULL, *newargs = NULL;
    struct sigacts *oldsigacts = NULL, *newsigacts = NULL;
#ifdef KTRACE
    struct vnode *tracevp = NULL;
    struct ucred *tracecred = NULL;
#endif
    struct vnode *oldtextvp = NULL, *newtextvp;
    int credential_changing;
    int textset;
#ifdef MAC
    struct label *interpvplabel = NULL;
    int will_transition;
#endif
#ifdef HWPMC_HOOKS
    struct pmckern_procexec pe;
#endif
    static const char fexecv_proc_title[] = "(fexecv)";

    imgp = &image_params;

    /*
     * Lock the process and set the P_INEXEC flag to indicate that
     * it should be left alone until we're done here.  This is
     * necessary to avoid race conditions - e.g. in ptrace() -
     * that might allow a local user to illicitly obtain elevated
     * privileges.
     */
    PROC_LOCK(p);
    KASSERT((p->p_flag & P_INEXEC) == 0,
        ("%s(): process already has P_INEXEC flag", __func__));
    p->p_flag |= P_INEXEC;
    PROC_UNLOCK(p);

    /*
     * Initialize part of the common data
     */
    bzero(imgp, sizeof(*imgp));
    imgp->proc = p;
    imgp->attr = &attr;
    imgp->args = args;
    oldcred = p->p_ucred;

#ifdef MAC
    error = mac_execve_enter(imgp, mac_p);
    if (error)
        goto exec_fail;
#endif

    /*
     * Translate the file name.  namei() returns a vnode pointer
     * in ni_vp among other things.
     *
     * XXXAUDIT: It would be desirable to also audit the name of the
     * interpreter if this is an interpreted binary.
     */
    if (args->fname != NULL) {
        NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME |
            AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
    }

    SDT_PROBE1(proc, , , exec, args->fname);

interpret:
    if (args->fname != NULL) {
#ifdef CAPABILITY_MODE
        /*
         * While capability mode can't reach this point via direct
         * path arguments to execve(), we also don't allow
         * interpreters to be used in capability mode (for now).
         * Catch indirect lookups and return a permissions error.
         */
        if (IN_CAPABILITY_MODE(td)) {
            error = ECAPMODE;
            goto exec_fail;
        }
#endif
        error = namei(&nd);
        if (error)
            goto exec_fail;

        newtextvp = nd.ni_vp;
        imgp->vp = newtextvp;
    } else {
        AUDIT_ARG_FD(args->fd);
        /*
         * Descriptors opened only with O_EXEC or O_RDONLY are allowed.
         */
        error = fgetvp_exec(td, args->fd, &cap_fexecve_rights,
            &newtextvp);
        if (error)
            goto exec_fail;
        vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY);
        AUDIT_ARG_VNODE1(newtextvp);
        imgp->vp = newtextvp;
    }

    /*
     * Check file permissions (also 'opens' file)
     */
    error = exec_check_permissions(imgp);
    if (error)
        goto exec_fail_dealloc;

    imgp->object = imgp->vp->v_object;
    if (imgp->object != NULL)
        vm_object_reference(imgp->object);

    /*
     * Set VV_TEXT now so no one can write to the executable while we're
     * activating it.
     *
     * Remember if this was set before and unset it in case this is not
     * actually an executable image.
     */
    textset = VOP_IS_TEXT(imgp->vp);
    VOP_SET_TEXT(imgp->vp);

    error = exec_map_first_page(imgp);
    if (error)
        goto exec_fail_dealloc;

    imgp->proc->p_osrel = 0;
    imgp->proc->p_fctl0 = 0;

    /*
     * Implement image setuid/setgid.
     *
     * Determine new credentials before attempting image activators
     * so that it can be used by process_exec handlers to determine
     * credential/setid changes.
     *
     * Don't honor setuid/setgid if the filesystem prohibits it or if
     * the process is being traced.
     *
     * We disable setuid/setgid/etc in capability mode on the basis
     * that most setugid applications are not written with that
     * environment in mind, and will therefore almost certainly operate
     * incorrectly.  In principle there's no reason that setugid
     * applications might not be useful in capability mode, so we may
     * want to reconsider this conservative design choice in the future.
     *
     * XXXMAC: For the time being, use NOSUID to also prohibit
     * transitions on the file system.
     */
    credential_changing = 0;
    credential_changing |= (attr.va_mode & S_ISUID) &&
        oldcred->cr_uid != attr.va_uid;
    credential_changing |= (attr.va_mode & S_ISGID) &&
        oldcred->cr_gid != attr.va_gid;
#ifdef MAC
    will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
        interpvplabel, imgp);
    credential_changing |= will_transition;
#endif

    /* Don't inherit PROC_PDEATHSIG_CTL value if setuid/setgid. */
    if (credential_changing)
        imgp->proc->p_pdeathsig = 0;

    if (credential_changing &&
#ifdef CAPABILITY_MODE
        ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
#endif
        (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
        (p->p_flag & P_TRACED) == 0) {
        imgp->credential_setid = true;
        VOP_UNLOCK(imgp->vp, 0);
        imgp->newcred = crdup(oldcred);
        if (attr.va_mode & S_ISUID) {
            euip = uifind(attr.va_uid);
            change_euid(imgp->newcred, euip);
        }
        vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
        if (attr.va_mode & S_ISGID)
            change_egid(imgp->newcred, attr.va_gid);
        /*
         * Implement correct POSIX saved-id behavior.
         *
         * XXXMAC: Note that the current logic will save the
         * uid and gid if a MAC domain transition occurs, even
         * though maybe it shouldn't.
         */
        change_svuid(imgp->newcred, imgp->newcred->cr_uid);
        change_svgid(imgp->newcred, imgp->newcred->cr_gid);
    } else {
        /*
         * Implement correct POSIX saved-id behavior.
         *
         * XXX: It's not clear that the existing behavior is
         * POSIX-compliant.  A number of sources indicate that the
         * saved uid/gid should only be updated if the new ruid is
         * not equal to the old ruid, or the new euid is not equal
         * to the old euid and the new euid is not equal to the old
         * ruid.  The FreeBSD code always updates the saved uid/gid.
         * Also, this code uses the new (replaced) euid and egid as
         * the source, which may or may not be the right ones to use.
         */
        if (oldcred->cr_svuid != oldcred->cr_uid ||
            oldcred->cr_svgid != oldcred->cr_gid) {
            VOP_UNLOCK(imgp->vp, 0);
            imgp->newcred = crdup(oldcred);
            vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
            change_svuid(imgp->newcred, imgp->newcred->cr_uid);
            change_svgid(imgp->newcred, imgp->newcred->cr_gid);
        }
    }
    /* The new credentials are installed into the process later. */

    /*
     * Do the best to calculate the full path to the image file.
     */
    if (args->fname != NULL && args->fname[0] == '/')
        imgp->execpath = args->fname;
    else {
        VOP_UNLOCK(imgp->vp, 0);
        if (vn_fullpath(td, imgp->vp, &imgp->execpath,
            &imgp->freepath) != 0)
            imgp->execpath = args->fname;
        vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
    }

    /*
     * If the current process has a special image activator it
     * wants to try first, call it.  For example, emulating shell
     * scripts differently.
     */
    error = -1;
    if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
        error = img_first(imgp);

    /*
     * Loop through the list of image activators, calling each one.
     * An activator returns -1 if there is no match, 0 on success,
     * and an error otherwise.
     */
    for (i = 0; error == -1 && execsw[i]; ++i) {
        if (execsw[i]->ex_imgact == NULL ||
            execsw[i]->ex_imgact == img_first) {
            continue;
        }
        error = (*execsw[i]->ex_imgact)(imgp);
    }

    if (error) {
        if (error == -1) {
            if (textset == 0)
                VOP_UNSET_TEXT(imgp->vp);
            error = ENOEXEC;
        }
        goto exec_fail_dealloc;
    }

    /*
     * Special interpreter operation, cleanup and loop up to try to
     * activate the interpreter.
     */
    if (imgp->interpreted) {
        exec_unmap_first_page(imgp);
        /*
         * VV_TEXT needs to be unset for scripts.  There is a short
         * period before we determine that something is a script
         * where VV_TEXT will be set.  The vnode lock is held over
         * this entire period so nothing should illegitimately be
         * blocked.
         */
        VOP_UNSET_TEXT(imgp->vp);
        /* free name buffer and old vnode */
        if (args->fname != NULL)
            NDFREE(&nd, NDF_ONLY_PNBUF);
#ifdef MAC
        mac_execve_interpreter_enter(newtextvp, &interpvplabel);
#endif
        if (imgp->opened) {
            VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td);
            imgp->opened = 0;
        }
        vput(newtextvp);
        vm_object_deallocate(imgp->object);
        imgp->object = NULL;
        imgp->credential_setid = false;
        if (imgp->newcred != NULL) {
            crfree(imgp->newcred);
            imgp->newcred = NULL;
        }
        imgp->execpath = NULL;
        free(imgp->freepath, M_TEMP);
        imgp->freepath = NULL;
        /* set new name to that of the interpreter */
        NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
            UIO_SYSSPACE, imgp->interpreter_name, td);
        args->fname = imgp->interpreter_name;
        goto interpret;
    }

    /*
     * NB: We unlock the vnode here because it is believed that none
     * of the sv_copyout_strings/sv_fixup operations require the vnode.
     */
    VOP_UNLOCK(imgp->vp, 0);

    if (disallow_high_osrel &&
        P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) {
        error = ENOEXEC;
        uprintf("Osrel %d for image %s too high\n", p->p_osrel,
            imgp->execpath != NULL ? imgp->execpath : "<unresolved>");
        vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
        goto exec_fail_dealloc;
    }

    /* ABI enforces the use of Capsicum.  Switch into capabilities mode. */
    if (SV_PROC_FLAG(p, SV_CAPSICUM))
        sys_cap_enter(td, NULL);

    /*
     * Copy out strings (args and env) and initialize stack base.
     */
    stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);

    /*
     * Stack setup.
     */
    error = (*p->p_sysent->sv_fixup)(&stack_base, imgp);
    if (error != 0) {
        vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
        goto exec_fail_dealloc;
    }

    if (args->fdp != NULL) {
        /* Install a brand new file descriptor table. */
        fdinstall_remapped(td, args->fdp);
        args->fdp = NULL;
    } else {
        /*
         * Keep on using the existing file descriptor table.  For
         * security and other reasons, the file descriptor table
         * cannot be shared after an exec.
         */
        fdunshare(td);
        /* close files on exec */
        fdcloseexec(td);
    }

    /*
     * Malloc things before we need locks.
     */
    i = exec_args_get_begin_envv(imgp->args) - imgp->args->begin_argv;
    /* Cache arguments if they fit inside our allowance */
    if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
        newargs = pargs_alloc(i);
        bcopy(imgp->args->begin_argv, newargs->ar_args, i);
    }

    /*
     * For security and other reasons, signal handlers cannot
     * be shared after an exec.  The new process gets a copy of the old
     * handlers.  In execsigs(), the new process will have its signals
     * reset.
     */
    if (sigacts_shared(p->p_sigacts)) {
        oldsigacts = p->p_sigacts;
        newsigacts = sigacts_alloc();
        sigacts_copy(newsigacts, oldsigacts);
    }

    vn_lock(imgp->vp, LK_SHARED | LK_RETRY);

    PROC_LOCK(p);
    if (oldsigacts)
        p->p_sigacts = newsigacts;
    /* Stop profiling */
    stopprofclock(p);

    /* reset caught signals */
    execsigs(p);

    /* name this process - nameiexec(p, ndp) */
    bzero(p->p_comm, sizeof(p->p_comm));
    if (args->fname)
        bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
            min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
    else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0)
        bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
    bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
#ifdef KTR
    sched_clear_tdname(td);
#endif

    /*
     * mark as execed, wakeup the process that vforked (if any) and tell
     * it that it now has its own resources back
     */
    p->p_flag |= P_EXEC;
    if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0)
        p->p_flag2 &= ~P2_NOTRACE;
    if (p->p_flag & P_PPWAIT) {
        p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
        cv_broadcast(&p->p_pwait);
        /* STOPs are no longer ignored, arrange for AST */
        signotify(td);
    }

    /*
     * Implement image setuid/setgid installation.
     */
    if (imgp->credential_setid) {
        /*
         * Turn off syscall tracing for set-id programs, except for
         * root.  Record any set-id flags first to make sure that
         * we do not regain any tracing during a possible block.
         */
        setsugid(p);

#ifdef KTRACE
        if (p->p_tracecred != NULL &&
            priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED))
            ktrprocexec(p, &tracecred, &tracevp);
#endif
        /*
         * Close any file descriptors 0..2 that reference procfs,
         * then make sure file descriptors 0..2 are in use.
         *
         * Both fdsetugidsafety() and fdcheckstd() may call functions
         * taking sleepable locks, so temporarily drop our locks.
         */
        PROC_UNLOCK(p);
        VOP_UNLOCK(imgp->vp, 0);
        fdsetugidsafety(td);
        error = fdcheckstd(td);
        vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
        if (error != 0)
            goto exec_fail_dealloc;
        PROC_LOCK(p);
#ifdef MAC
        if (will_transition) {
            mac_vnode_execve_transition(oldcred, imgp->newcred,
                imgp->vp, interpvplabel, imgp);
        }
#endif
    } else {
        if (oldcred->cr_uid == oldcred->cr_ruid &&
            oldcred->cr_gid == oldcred->cr_rgid)
            p->p_flag &= ~P_SUGID;
    }
    /*
     * Set the new credentials.
     */
    if (imgp->newcred != NULL) {
        proc_set_cred(p, imgp->newcred);
        crfree(oldcred);
        oldcred = NULL;
    }

    /*
     * Store the vp for use in procfs.  This vnode was referenced by
     * namei or fgetvp_exec.
     */
    oldtextvp = p->p_textvp;
    p->p_textvp = newtextvp;

#ifdef KDTRACE_HOOKS
    /*
     * Tell the DTrace fasttrap provider about the exec if it
     * has declared an interest.
     */
    if (dtrace_fasttrap_exec)
        dtrace_fasttrap_exec(p);
#endif

    /*
     * Notify others that we exec'd, and clear the P_INEXEC flag
     * as we're now a bona fide freshly-execed process.
     */
    KNOTE_LOCKED(p->p_klist, NOTE_EXEC);
    p->p_flag &= ~P_INEXEC;

    /* clear "fork but no exec" flag, as we _are_ execing */
    p->p_acflag &= ~AFORK;

    /*
     * Free any previous argument cache and replace it with
     * the new argument cache, if any.
     */
    oldargs = p->p_args;
    p->p_args = newargs;
    newargs = NULL;

    PROC_UNLOCK(p);

#ifdef HWPMC_HOOKS
    /*
     * Check if system-wide sampling is in effect or if the
     * current process is using PMCs.  If so, do exec() time
     * processing.  This processing needs to happen AFTER the
     * P_INEXEC flag is cleared.
     */
    if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
        VOP_UNLOCK(imgp->vp, 0);
        pe.pm_credentialschanged = credential_changing;
        pe.pm_entryaddr = imgp->entry_addr;

        PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
        vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
    }
#endif

    /* Set values passed into the program in registers. */
    (*p->p_sysent->sv_setregs)(td, imgp, (u_long)(uintptr_t)stack_base);

    vfs_mark_atime(imgp->vp, td->td_ucred);

    SDT_PROBE1(proc, , , exec__success, args->fname);

exec_fail_dealloc:
    if (imgp->firstpage != NULL)
        exec_unmap_first_page(imgp);

    if (imgp->vp != NULL) {
        if (args->fname)
            NDFREE(&nd, NDF_ONLY_PNBUF);
        if (imgp->opened)
            VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
        if (error != 0)
            vput(imgp->vp);
        else
            VOP_UNLOCK(imgp->vp, 0);
    }

    if (imgp->object != NULL)
        vm_object_deallocate(imgp->object);

    free(imgp->freepath, M_TEMP);

    if (error == 0) {
        if (p->p_ptevents & PTRACE_EXEC) {
            PROC_LOCK(p);
            if (p->p_ptevents & PTRACE_EXEC)
                td->td_dbgflags |= TDB_EXEC;
            PROC_UNLOCK(p);
        }

        /*
         * Stop the process here if its stop event mask has
         * the S_EXEC bit set.
         */
        STOPEVENT(p, S_EXEC, 0);
    } else {
exec_fail:
        /* we're done here, clear P_INEXEC */
        PROC_LOCK(p);
        p->p_flag &= ~P_INEXEC;
        PROC_UNLOCK(p);

        SDT_PROBE1(proc, , , exec__failure, error);
    }

    if (imgp->newcred != NULL && oldcred != NULL)
        crfree(imgp->newcred);

#ifdef MAC
    mac_execve_exit(imgp);
    mac_execve_interpreter_exit(interpvplabel);
#endif
    exec_free_args(args);

    /*
     * Handle deferred decrement of ref counts.
     */
    if (oldtextvp != NULL)
        vrele(oldtextvp);
#ifdef KTRACE
    if (tracevp != NULL)
        vrele(tracevp);
    if (tracecred != NULL)
        crfree(tracecred);
#endif
    pargs_drop(oldargs);
    pargs_drop(newargs);
    if (oldsigacts != NULL)
        sigacts_free(oldsigacts);
    if (euip != NULL)
        uifree(euip);

    if (error && imgp->vmspace_destroyed) {
        /* sorry, no more process anymore. exit gracefully */
        exit1(td, 0, SIGABRT);
        /* NOT REACHED */
    }

#ifdef KTRACE
    if (error == 0)
        ktrprocctor(p);
#endif

    /*
     * We don't want cpu_set_syscall_retval() to overwrite any of
     * the register values put in place by exec_setregs().
     * Implementations of cpu_set_syscall_retval() will leave
     * registers unmodified when returning EJUSTRETURN.
     */
    return (error == 0 ? EJUSTRETURN : error);
}
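/*
 * Illustrative sketch (hypothetical activator, not part of the original
 * source): the activator loop in do_execve() relies on the convention that
 * an image activator returns -1 when the image is not its format, 0 on
 * success, and an errno value on a real failure.  A minimal activator
 * honouring that contract looks like this; the magic check is made up.
 */
static int
example_imgact(struct image_params *imgp)
{
    const char *hdr = imgp->image_header;

    if (hdr[0] != 'X')      /* hypothetical magic byte */
        return (-1);        /* not ours; let the next activator try */

    /* ... populate imgp (entry point, interpreter, etc.) ... */
    return (0);
}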
static int
link_elf_obj_load_file(const char *filename, linker_file_t *result)
{
    struct nlookupdata nd;
    struct thread *td = curthread;  /* XXX */
    struct proc *p = td->td_proc;
    char *pathname;
    struct vnode *vp;
    Elf_Ehdr *hdr;
    Elf_Shdr *shdr;
    Elf_Sym *es;
    int nbytes, i, j;
    vm_offset_t mapbase;
    size_t mapsize;
    int error = 0;
    int resid;
    elf_file_t ef;
    linker_file_t lf;
    int symtabindex;
    int symstrindex;
    int shstrindex;
    int nsym;
    int pb, rl, ra;
    int alignmask;

    /* XXX Hack for firmware loading where p == NULL */
    if (p == NULL) {
        p = &proc0;
    }

    KKASSERT(p != NULL);
    if (p->p_ucred == NULL) {
        kprintf("link_elf_obj_load_file: cannot load '%s' from filesystem"
            " this early\n", filename);
        return ENOENT;
    }

    shdr = NULL;
    lf = NULL;
    mapsize = 0;
    hdr = NULL;

    pathname = linker_search_path(filename);
    if (pathname == NULL)
        return ENOENT;

    error = nlookup_init(&nd, pathname, UIO_SYSSPACE,
        NLC_FOLLOW | NLC_LOCKVP);
    if (error == 0)
        error = vn_open(&nd, NULL, FREAD, 0);
    kfree(pathname, M_LINKER);
    if (error) {
        nlookup_done(&nd);
        return error;
    }
    vp = nd.nl_open_vp;
    nd.nl_open_vp = NULL;
    nlookup_done(&nd);

    /*
     * Read the elf header from the file.
     */
    hdr = kmalloc(sizeof(*hdr), M_LINKER, M_WAITOK);
    if (hdr == NULL) {
        error = ENOMEM;
        goto out;
    }
    error = vn_rdwr(UIO_READ, vp, (void *)hdr, sizeof(*hdr), 0,
        UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
    if (error)
        goto out;
    if (resid != 0) {
        error = ENOEXEC;
        goto out;
    }
    if (!IS_ELF(*hdr)) {
        error = ENOEXEC;
        goto out;
    }

    if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
        hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
        link_elf_obj_error(filename, "Unsupported file layout");
        error = ENOEXEC;
        goto out;
    }
    if (hdr->e_ident[EI_VERSION] != EV_CURRENT ||
        hdr->e_version != EV_CURRENT) {
        link_elf_obj_error(filename, "Unsupported file version");
        error = ENOEXEC;
        goto out;
    }
    if (hdr->e_type != ET_REL) {
        error = ENOSYS;
        goto out;
    }
    if (hdr->e_machine != ELF_TARG_MACH) {
        link_elf_obj_error(filename, "Unsupported machine");
        error = ENOEXEC;
        goto out;
    }

    ef = kmalloc(sizeof(struct elf_file), M_LINKER, M_WAITOK | M_ZERO);
    lf = linker_make_file(filename, ef, &link_elf_obj_file_ops);
    if (lf == NULL) {
        kfree(ef, M_LINKER);
        error = ENOMEM;
        goto out;
    }
    ef->nprogtab = 0;
    ef->e_shdr = 0;
    ef->nreltab = 0;
    ef->nrelatab = 0;

    /* Allocate and read in the section header */
    nbytes = hdr->e_shnum * hdr->e_shentsize;
    if (nbytes == 0 || hdr->e_shoff == 0 ||
        hdr->e_shentsize != sizeof(Elf_Shdr)) {
        error = ENOEXEC;
        goto out;
    }
    shdr = kmalloc(nbytes, M_LINKER, M_WAITOK);
    if (shdr == NULL) {
        error = ENOMEM;
        goto out;
    }
    ef->e_shdr = shdr;
    error = vn_rdwr(UIO_READ, vp, (caddr_t) shdr, nbytes, hdr->e_shoff,
        UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
    if (error)
        goto out;
    if (resid) {
        error = ENOEXEC;
        goto out;
    }

    /*
     * Scan the section header for information and table sizing.
     */
    nsym = 0;
    symtabindex = -1;
    symstrindex = -1;
    for (i = 0; i < hdr->e_shnum; i++) {
        switch (shdr[i].sh_type) {
        case SHT_PROGBITS:
        case SHT_NOBITS:
            ef->nprogtab++;
            break;
        case SHT_SYMTAB:
            nsym++;
            symtabindex = i;
            symstrindex = shdr[i].sh_link;
            break;
        case SHT_REL:
            ef->nreltab++;
            break;
        case SHT_RELA:
            ef->nrelatab++;
            break;
        case SHT_STRTAB:
            break;
        }
    }
    if (ef->nprogtab == 0) {
        link_elf_obj_error(filename, "file has no contents");
        error = ENOEXEC;
        goto out;
    }
    if (nsym != 1) {
        /* Only allow one symbol table for now */
        link_elf_obj_error(filename, "file has no valid symbol table");
        error = ENOEXEC;
        goto out;
    }
    if (symstrindex < 0 || symstrindex >= hdr->e_shnum ||
        shdr[symstrindex].sh_type != SHT_STRTAB) {
        link_elf_obj_error(filename, "file has invalid symbol strings");
        error = ENOEXEC;
        goto out;
    }

    /* Allocate space for tracking the load chunks */
    if (ef->nprogtab != 0)
        ef->progtab = kmalloc(ef->nprogtab * sizeof(*ef->progtab),
            M_LINKER, M_WAITOK | M_ZERO);
    if (ef->nreltab != 0)
        ef->reltab = kmalloc(ef->nreltab * sizeof(*ef->reltab),
            M_LINKER, M_WAITOK | M_ZERO);
    if (ef->nrelatab != 0)
        ef->relatab = kmalloc(ef->nrelatab * sizeof(*ef->relatab),
            M_LINKER, M_WAITOK | M_ZERO);
    if ((ef->nprogtab != 0 && ef->progtab == NULL) ||
        (ef->nreltab != 0 && ef->reltab == NULL) ||
        (ef->nrelatab != 0 && ef->relatab == NULL)) {
        error = ENOMEM;
        goto out;
    }
    if (symtabindex == -1)
        panic("lost symbol table index");

    /* Allocate space for and load the symbol table */
    ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym);
    ef->ddbsymtab = kmalloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK);
    if (ef->ddbsymtab == NULL) {
        error = ENOMEM;
        goto out;
    }
    error = vn_rdwr(UIO_READ, vp, (void *)ef->ddbsymtab,
        shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset,
        UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
    if (error)
        goto out;
    if (resid != 0) {
        error = EINVAL;
        goto out;
    }
    if (symstrindex == -1)
        panic("lost symbol string index");

    /* Allocate space for and load the symbol strings */
    ef->ddbstrcnt = shdr[symstrindex].sh_size;
    ef->ddbstrtab = kmalloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK);
    if (ef->ddbstrtab == NULL) {
        error = ENOMEM;
        goto out;
    }
    error = vn_rdwr(UIO_READ, vp, ef->ddbstrtab,
        shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset,
        UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
    if (error)
        goto out;
    if (resid != 0) {
        error = EINVAL;
        goto out;
    }

    /* Do we have a string table for the section names?  */
    shstrindex = -1;
    if (hdr->e_shstrndx != 0 &&
        shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) {
        shstrindex = hdr->e_shstrndx;
        ef->shstrcnt = shdr[shstrindex].sh_size;
        ef->shstrtab = kmalloc(shdr[shstrindex].sh_size, M_LINKER,
            M_WAITOK);
        if (ef->shstrtab == NULL) {
            error = ENOMEM;
            goto out;
        }
        error = vn_rdwr(UIO_READ, vp, ef->shstrtab,
            shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset,
            UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
        if (error)
            goto out;
        if (resid != 0) {
            error = EINVAL;
            goto out;
        }
    }

    /* Size up code/data(progbits) and bss(nobits). */
    alignmask = 0;
    for (i = 0; i < hdr->e_shnum; i++) {
        switch (shdr[i].sh_type) {
        case SHT_PROGBITS:
        case SHT_NOBITS:
            alignmask = shdr[i].sh_addralign - 1;
            mapsize += alignmask;
            mapsize &= ~alignmask;
            mapsize += shdr[i].sh_size;
            break;
        }
    }

    /*
     * We know how much space we need for the text/data/bss/etc.  This
     * stuff needs to be in a single chunk so that profiling etc can get
     * the bounds and gdb can associate offsets with modules
     */
    ef->object = vm_object_allocate(OBJT_DEFAULT,
        round_page(mapsize) >> PAGE_SHIFT);
    if (ef->object == NULL) {
        error = ENOMEM;
        goto out;
    }
    vm_object_reference(ef->object);
    ef->address = (caddr_t) vm_map_min(&kernel_map);
    ef->bytes = 0;

    /*
     * In order to satisfy x86_64's architectural requirements on the
     * location of code and data in the kernel's address space, request a
     * mapping that is above the kernel.
     *
     * vkernel64's text+data is outside the managed VM space entirely.
     */
#if defined(__amd64__) && defined(_KERNEL_VIRTUAL)
    error = vkernel_module_memory_alloc(&mapbase, round_page(mapsize));
#else
    mapbase = KERNBASE;
    error = vm_map_find(&kernel_map, ef->object, 0, &mapbase,
        round_page(mapsize), PAGE_SIZE,
        TRUE, VM_MAPTYPE_NORMAL,
        VM_PROT_ALL, VM_PROT_ALL, FALSE);
    if (error) {
        vm_object_deallocate(ef->object);
        ef->object = NULL;
        goto out;
    }
    /* Wire the pages */
    error = vm_map_wire(&kernel_map, mapbase,
        mapbase + round_page(mapsize), 0);
#endif
    if (error != KERN_SUCCESS) {
        error = ENOMEM;
        goto out;
    }

    /* Inform the kld system about the situation */
    lf->address = ef->address = (caddr_t) mapbase;
    lf->size = round_page(mapsize);
    ef->bytes = mapsize;

    /*
     * Now load code/data(progbits), zero bss(nobits), allocate space for
     * and load relocs
     */
    pb = 0;
    rl = 0;
    ra = 0;
    alignmask = 0;
    for (i = 0; i < hdr->e_shnum; i++) {
        switch (shdr[i].sh_type) {
        case SHT_PROGBITS:
        case SHT_NOBITS:
            alignmask = shdr[i].sh_addralign - 1;
            mapbase += alignmask;
            mapbase &= ~alignmask;
            if (ef->shstrtab && shdr[i].sh_name != 0)
                ef->progtab[pb].name =
                    ef->shstrtab + shdr[i].sh_name;
            else if (shdr[i].sh_type == SHT_PROGBITS)
                ef->progtab[pb].name = "<<PROGBITS>>";
            else
                ef->progtab[pb].name = "<<NOBITS>>";
#if 0
            if (ef->progtab[pb].name != NULL &&
                !strcmp(ef->progtab[pb].name, "set_pcpu"))
                ef->progtab[pb].addr =
                    dpcpu_alloc(shdr[i].sh_size);
#ifdef VIMAGE
            else if (ef->progtab[pb].name != NULL &&
                !strcmp(ef->progtab[pb].name, VNET_SETNAME))
                ef->progtab[pb].addr =
                    vnet_data_alloc(shdr[i].sh_size);
#endif
            else
#endif
                ef->progtab[pb].addr = (void *)(uintptr_t) mapbase;
            if (ef->progtab[pb].addr == NULL) {
                error = ENOSPC;
                goto out;
            }
            ef->progtab[pb].size = shdr[i].sh_size;
            ef->progtab[pb].sec = i;
            if (shdr[i].sh_type == SHT_PROGBITS) {
                error = vn_rdwr(UIO_READ, vp,
                    ef->progtab[pb].addr,
                    shdr[i].sh_size, shdr[i].sh_offset,
                    UIO_SYSSPACE, IO_NODELOCKED,
                    p->p_ucred, &resid);
                if (error)
                    goto out;
                if (resid != 0) {
                    error = EINVAL;
                    goto out;
                }
#if 0
                /* Initialize the per-cpu or vnet area. */
                if (ef->progtab[pb].addr != (void *)mapbase &&
                    !strcmp(ef->progtab[pb].name, "set_pcpu"))
                    dpcpu_copy(ef->progtab[pb].addr,
                        shdr[i].sh_size);
#ifdef VIMAGE
                else if (ef->progtab[pb].addr !=
                    (void *)mapbase &&
                    !strcmp(ef->progtab[pb].name, VNET_SETNAME))
                    vnet_data_copy(ef->progtab[pb].addr,
                        shdr[i].sh_size);
#endif
#endif
            } else
                bzero(ef->progtab[pb].addr, shdr[i].sh_size);

            /*
             * Update all symbol values with the offset.
             */
            for (j = 0; j < ef->ddbsymcnt; j++) {
                es = &ef->ddbsymtab[j];
                if (es->st_shndx != i)
                    continue;
                es->st_value += (Elf_Addr) ef->progtab[pb].addr;
            }
            mapbase += shdr[i].sh_size;
            pb++;
            break;
        case SHT_REL:
            ef->reltab[rl].rel = kmalloc(shdr[i].sh_size, M_LINKER,
                M_WAITOK);
            ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel);
            ef->reltab[rl].sec = shdr[i].sh_info;
            error = vn_rdwr(UIO_READ, vp, (void *)ef->reltab[rl].rel,
                shdr[i].sh_size, shdr[i].sh_offset,
                UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
            if (error)
                goto out;
            if (resid != 0) {
                error = EINVAL;
                goto out;
            }
            rl++;
            break;
        case SHT_RELA:
            ef->relatab[ra].rela = kmalloc(shdr[i].sh_size, M_LINKER,
                M_WAITOK);
            ef->relatab[ra].nrela =
                shdr[i].sh_size / sizeof(Elf_Rela);
            ef->relatab[ra].sec = shdr[i].sh_info;
            error = vn_rdwr(UIO_READ, vp,
                (void *)ef->relatab[ra].rela,
                shdr[i].sh_size, shdr[i].sh_offset,
                UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
            if (error)
                goto out;
            if (resid != 0) {
                error = EINVAL;
                goto out;
            }
            ra++;
            break;
        }
    }
    if (pb != ef->nprogtab)
        panic("lost progbits");
    if (rl != ef->nreltab)
        panic("lost reltab");
    if (ra != ef->nrelatab)
        panic("lost relatab");
    if (mapbase != (vm_offset_t) ef->address + mapsize)
        panic("mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n",
            mapbase, ef->address, mapsize,
            (vm_offset_t) ef->address + mapsize);

    /* Local intra-module relocations */
    link_elf_obj_reloc_local(lf);

    /* Pull in dependencies */
    error = linker_load_dependencies(lf);
    if (error)
        goto out;

    /* External relocations */
    error = relocate_file(lf);
    if (error)
        goto out;

    *result = lf;

out:
    if (error && lf)
        linker_file_unload(lf /*, LINKER_UNLOAD_FORCE */);
    if (hdr)
        kfree(hdr, M_LINKER);
    vn_unlock(vp);
    vn_close(vp, FREAD);

    return error;
}
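/*
 * Illustrative note (an assumption consistent with the ELF specification,
 * which requires sh_addralign to be a power of two): the sizing and load
 * loops above, like this helper, round the running offset up to a section's
 * alignment with the usual mask idiom before adding the section size, and
 * likewise assume a non-zero alignment.
 */
static size_t
example_align_up(size_t off, size_t align)
{
    size_t mask = align - 1;

    return ((off + mask) & ~mask);
}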
/*
 * vm_contig_pg_kmap:
 *
 * Map a previously allocated (vm_contig_pg_alloc) range of pages from
 * vm_page_array[] into the KVA.  Once mapped, the pages are part of
 * the kernel, and are to be freed with kmem_free(&kernel_map, addr, size).
 *
 * No requirements.
 */
vm_offset_t
vm_contig_pg_kmap(int start, u_long size, vm_map_t map, int flags)
{
    vm_offset_t addr, tmp_addr;
    vm_page_t pga = vm_page_array;
    int i, count;

    size = round_page(size);
    if (size == 0)
        panic("vm_contig_pg_kmap: size must not be 0");
    crit_enter();
    lwkt_gettoken(&vm_token);

    /*
     * We've found a contiguous chunk that meets our requirements.
     * Allocate KVM, and assign phys pages and return a kernel VM
     * pointer.
     */
    count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
    vm_map_lock(map);
    if (vm_map_findspace(map, vm_map_min(map), size, PAGE_SIZE, 0, &addr) !=
        KERN_SUCCESS) {
        /*
         * XXX We almost never run out of kernel virtual
         * space, so we don't make the allocated memory
         * above available.
         */
        vm_map_unlock(map);
        vm_map_entry_release(count);
        lwkt_reltoken(&vm_token);
        crit_exit();
        return (0);
    }

    /*
     * kernel_object maps 1:1 to kernel_map.
     */
    vm_object_hold(&kernel_object);
    vm_object_reference(&kernel_object);
    vm_map_insert(map, &count,
        &kernel_object, addr, addr, addr + size,
        VM_MAPTYPE_NORMAL,
        VM_PROT_ALL, VM_PROT_ALL,
        0);
    vm_map_unlock(map);
    vm_map_entry_release(count);

    tmp_addr = addr;
    for (i = start; i < (start + size / PAGE_SIZE); i++) {
        vm_page_t m = &pga[i];
        vm_page_insert(m, &kernel_object, OFF_TO_IDX(tmp_addr));
        if ((flags & M_ZERO) && !(m->flags & PG_ZERO))
            pmap_zero_page(VM_PAGE_TO_PHYS(m));
        m->flags = 0;
        tmp_addr += PAGE_SIZE;
    }
    vm_map_wire(map, addr, addr + size, 0);

    vm_object_drop(&kernel_object);
    lwkt_reltoken(&vm_token);
    crit_exit();
    return (addr);
}
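/*
 * Illustrative sketch (hypothetical caller; the vm_contig_pg_alloc()
 * signature shown here is an assumption based on the comment above, not a
 * verified interface): pair the allocator with vm_contig_pg_kmap() and
 * release the mapping later with kmem_free(&kernel_map, addr, size).
 */
static vm_offset_t
example_contig_kmap(u_long size)
{
    int start;

    start = vm_contig_pg_alloc(size, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0,
        M_WAITOK);
    if (start < 0)
        return (0);
    return (vm_contig_pg_kmap(start, size, &kernel_map, M_ZERO));
}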
/*
 * Destroy old address space, and allocate a new stack.
 *	The new stack is only sgrowsiz large because it is grown
 *	automatically on a page fault.
 */
int
exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv)
{
    int error;
    struct proc *p = imgp->proc;
    struct vmspace *vmspace = p->p_vmspace;
    vm_object_t obj;
    struct rlimit rlim_stack;
    vm_offset_t sv_minuser, stack_addr;
    vm_map_t map;
    u_long ssiz;

    imgp->vmspace_destroyed = 1;
    imgp->sysent = sv;

    /* May be called with Giant held */
    EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp);

    /*
     * Blow away entire process VM, if address space not shared,
     * otherwise, create a new VM space so that other threads are
     * not disrupted
     */
    map = &vmspace->vm_map;
    if (map_at_zero)
        sv_minuser = sv->sv_minuser;
    else
        sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
    if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
        vm_map_max(map) == sv->sv_maxuser &&
        cpu_exec_vmspace_reuse(p, map)) {
        shmexit(vmspace);
        pmap_remove_pages(vmspace_pmap(vmspace));
        vm_map_remove(map, vm_map_min(map), vm_map_max(map));
        /*
         * An exec terminates mlockall(MCL_FUTURE), ASLR state
         * must be re-evaluated.
         */
        vm_map_lock(map);
        vm_map_modflags(map, 0, MAP_WIREFUTURE | MAP_ASLR |
            MAP_ASLR_IGNSTART);
        vm_map_unlock(map);
    } else {
        error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
        if (error)
            return (error);
        vmspace = p->p_vmspace;
        map = &vmspace->vm_map;
    }
    map->flags |= imgp->map_flags;

    /* Map a shared page */
    obj = sv->sv_shared_page_obj;
    if (obj != NULL) {
        vm_object_reference(obj);
        error = vm_map_fixed(map, obj, 0,
            sv->sv_shared_page_base, sv->sv_shared_page_len,
            VM_PROT_READ | VM_PROT_EXECUTE,
            VM_PROT_READ | VM_PROT_EXECUTE,
            MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
        if (error != KERN_SUCCESS) {
            vm_object_deallocate(obj);
            return (vm_mmap_to_errno(error));
        }
    }

    /* Allocate a new stack */
    if (imgp->stack_sz != 0) {
        ssiz = trunc_page(imgp->stack_sz);
        PROC_LOCK(p);
        lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack);
        PROC_UNLOCK(p);
        if (ssiz > rlim_stack.rlim_max)
            ssiz = rlim_stack.rlim_max;
        if (ssiz > rlim_stack.rlim_cur) {
            rlim_stack.rlim_cur = ssiz;
            kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack);
        }
    } else if (sv->sv_maxssiz != NULL) {
        ssiz = *sv->sv_maxssiz;
    } else {
        ssiz = maxssiz;
    }
    stack_addr = sv->sv_usrstack - ssiz;
    error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
        obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
        sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
    if (error != KERN_SUCCESS)
        return (vm_mmap_to_errno(error));

    /*
     * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they
     * are still used to enforce the stack rlimit on the process stack.
     */
    vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
    vmspace->vm_maxsaddr = (char *)stack_addr;

    return (0);
}
DECLHIDDEN(int) rtR0MemObjNativeMapKernel(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJ pMemToMap, void *pvFixed,
                                          size_t uAlignment, unsigned fProt, size_t offSub, size_t cbSub)
{
//  AssertMsgReturn(!offSub && !cbSub, ("%#x %#x\n", offSub, cbSub), VERR_NOT_SUPPORTED);
    AssertMsgReturn(pvFixed == (void *)-1, ("%p\n", pvFixed), VERR_NOT_SUPPORTED);

    /*
     * Check that the specified alignment is supported.
     */
    if (uAlignment > PAGE_SIZE)
        return VERR_NOT_SUPPORTED;

    int                rc;
    PRTR0MEMOBJFREEBSD pMemToMapFreeBSD = (PRTR0MEMOBJFREEBSD)pMemToMap;

    /* calc protection */
    vm_prot_t ProtectionFlags = 0;
    if ((fProt & RTMEM_PROT_NONE) == RTMEM_PROT_NONE)
        ProtectionFlags = VM_PROT_NONE;
    if ((fProt & RTMEM_PROT_READ) == RTMEM_PROT_READ)
        ProtectionFlags |= VM_PROT_READ;
    if ((fProt & RTMEM_PROT_WRITE) == RTMEM_PROT_WRITE)
        ProtectionFlags |= VM_PROT_WRITE;
    if ((fProt & RTMEM_PROT_EXEC) == RTMEM_PROT_EXEC)
        ProtectionFlags |= VM_PROT_EXECUTE;

    vm_offset_t Addr = vm_map_min(kernel_map);
    if (cbSub == 0)
        cbSub = pMemToMap->cb - offSub;

    vm_object_reference(pMemToMapFreeBSD->pObject);
    rc = vm_map_find(kernel_map,                /* Map to insert the object in */
                     pMemToMapFreeBSD->pObject, /* Object to map */
                     offSub,                    /* Start offset in the object */
                     &Addr,                     /* Start address IN/OUT */
                     cbSub,                     /* Size of the mapping */
#if __FreeBSD_version >= 1000055
                     0,                         /* Upper bound of mapping */
#endif
                     VMFS_ANY_SPACE,            /* Whether a suitable address should be searched for first */
                     ProtectionFlags,           /* protection flags */
                     VM_PROT_ALL,               /* Maximum protection flags */
                     0);                        /* copy-on-write and similar flags */

    if (rc == KERN_SUCCESS)
    {
        rc = vm_map_wire(kernel_map, Addr, Addr + cbSub, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
        AssertMsg(rc == KERN_SUCCESS, ("%#x\n", rc));

        PRTR0MEMOBJFREEBSD pMemFreeBSD = (PRTR0MEMOBJFREEBSD)rtR0MemObjNew(sizeof(RTR0MEMOBJFREEBSD),
                                                                           RTR0MEMOBJTYPE_MAPPING,
                                                                           (void *)Addr, cbSub);
        if (pMemFreeBSD)
        {
            Assert((vm_offset_t)pMemFreeBSD->Core.pv == Addr);
            pMemFreeBSD->Core.u.Mapping.R0Process = NIL_RTR0PROCESS;
            *ppMem = &pMemFreeBSD->Core;
            return VINF_SUCCESS;
        }

        rc = vm_map_remove(kernel_map, Addr, Addr + cbSub);
        AssertMsg(rc == KERN_SUCCESS, ("Deleting mapping failed\n"));
    }
    else
        vm_object_deallocate(pMemToMapFreeBSD->pObject);

    return VERR_NO_MEMORY;
}
DECLHIDDEN(int) rtR0MemObjNativeMapUser(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJ pMemToMap, RTR3PTR R3PtrFixed,
                                        size_t uAlignment, unsigned fProt, RTR0PROCESS R0Process)
{
    /*
     * Check for unsupported stuff.
     */
    AssertMsgReturn(R0Process == RTR0ProcHandleSelf(), ("%p != %p\n", R0Process, RTR0ProcHandleSelf()), VERR_NOT_SUPPORTED);
    if (uAlignment > PAGE_SIZE)
        return VERR_NOT_SUPPORTED;

    int                rc;
    PRTR0MEMOBJFREEBSD pMemToMapFreeBSD = (PRTR0MEMOBJFREEBSD)pMemToMap;
    struct proc       *pProc            = (struct proc *)R0Process;
    struct vm_map     *pProcMap         = &pProc->p_vmspace->vm_map;

    /* calc protection */
    vm_prot_t ProtectionFlags = 0;
    if ((fProt & RTMEM_PROT_NONE) == RTMEM_PROT_NONE)
        ProtectionFlags = VM_PROT_NONE;
    if ((fProt & RTMEM_PROT_READ) == RTMEM_PROT_READ)
        ProtectionFlags |= VM_PROT_READ;
    if ((fProt & RTMEM_PROT_WRITE) == RTMEM_PROT_WRITE)
        ProtectionFlags |= VM_PROT_WRITE;
    if ((fProt & RTMEM_PROT_EXEC) == RTMEM_PROT_EXEC)
        ProtectionFlags |= VM_PROT_EXECUTE;

    /* calc mapping address */
    vm_offset_t AddrR3;
    if (R3PtrFixed == (RTR3PTR)-1)
    {
        /** @todo: is this needed?. */
        PROC_LOCK(pProc);
        AddrR3 = round_page((vm_offset_t)pProc->p_vmspace->vm_daddr + lim_max(pProc, RLIMIT_DATA));
        PROC_UNLOCK(pProc);
    }
    else
        AddrR3 = (vm_offset_t)R3PtrFixed;

    /* Insert the pObject in the map. */
    vm_object_reference(pMemToMapFreeBSD->pObject);
    rc = vm_map_find(pProcMap,                  /* Map to insert the object in */
                     pMemToMapFreeBSD->pObject, /* Object to map */
                     0,                         /* Start offset in the object */
                     &AddrR3,                   /* Start address IN/OUT */
                     pMemToMap->cb,             /* Size of the mapping */
#if __FreeBSD_version >= 1000055
                     0,                         /* Upper bound of the mapping */
#endif
                     R3PtrFixed == (RTR3PTR)-1 ? VMFS_ANY_SPACE : VMFS_NO_SPACE,
                                                /* Whether a suitable address should be searched for first */
                     ProtectionFlags,           /* protection flags */
                     VM_PROT_ALL,               /* Maximum protection flags */
                     0);                        /* copy-on-write and similar flags */

    if (rc == KERN_SUCCESS)
    {
        rc = vm_map_wire(pProcMap, AddrR3, AddrR3 + pMemToMap->cb, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
        AssertMsg(rc == KERN_SUCCESS, ("%#x\n", rc));

        rc = vm_map_inherit(pProcMap, AddrR3, AddrR3 + pMemToMap->cb, VM_INHERIT_SHARE);
        AssertMsg(rc == KERN_SUCCESS, ("%#x\n", rc));

        /*
         * Create a mapping object for it.
         */
        PRTR0MEMOBJFREEBSD pMemFreeBSD = (PRTR0MEMOBJFREEBSD)rtR0MemObjNew(sizeof(RTR0MEMOBJFREEBSD),
                                                                           RTR0MEMOBJTYPE_MAPPING,
                                                                           (void *)AddrR3, pMemToMap->cb);
        if (pMemFreeBSD)
        {
            Assert((vm_offset_t)pMemFreeBSD->Core.pv == AddrR3);
            pMemFreeBSD->Core.u.Mapping.R0Process = R0Process;
            *ppMem = &pMemFreeBSD->Core;
            return VINF_SUCCESS;
        }

        rc = vm_map_remove(pProcMap, AddrR3, AddrR3 + pMemToMap->cb);
        AssertMsg(rc == KERN_SUCCESS, ("Deleting mapping failed\n"));
    }
    else
        vm_object_deallocate(pMemToMapFreeBSD->pObject);

    return VERR_NO_MEMORY;
}
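/*
 * Illustrative sketch (a hypothetical refactoring, not part of the original
 * source): the two mapping routines above derive the VM protection flags
 * from fProt with the same four tests, which could be shared as a helper.
 */
static vm_prot_t rtR0MemObjExampleProt(unsigned fProt)
{
    vm_prot_t ProtectionFlags = VM_PROT_NONE;

    if ((fProt & RTMEM_PROT_READ) == RTMEM_PROT_READ)
        ProtectionFlags |= VM_PROT_READ;
    if ((fProt & RTMEM_PROT_WRITE) == RTMEM_PROT_WRITE)
        ProtectionFlags |= VM_PROT_WRITE;
    if ((fProt & RTMEM_PROT_EXEC) == RTMEM_PROT_EXEC)
        ProtectionFlags |= VM_PROT_EXECUTE;
    return ProtectionFlags;
}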
kern_return_t
kernel_memory_allocate(
    register vm_map_t       map,
    register vm_offset_t    *addrp,
    register vm_size_t      size,
    register vm_offset_t    mask,
    int                     flags)
{
    vm_object_t         object;
    vm_object_offset_t  offset;
    vm_object_offset_t  pg_offset;
    vm_map_entry_t      entry;
    vm_map_offset_t     map_addr, fill_start;
    vm_map_offset_t     map_mask;
    vm_map_size_t       map_size, fill_size;
    kern_return_t       kr;
    vm_page_t           mem;
    vm_page_t           guard_page_list = NULL;
    vm_page_t           wired_page_list = NULL;
    int                 guard_page_count = 0;
    int                 wired_page_count = 0;
    int                 i;
    int                 vm_alloc_flags;

    if (! vm_kernel_ready) {
        panic("kernel_memory_allocate: VM is not ready");
    }

    if (size == 0) {
        *addrp = 0;
        return KERN_INVALID_ARGUMENT;
    }
    map_size = vm_map_round_page(size);
    map_mask = (vm_map_offset_t) mask;
    vm_alloc_flags = 0;

    /*
     * limit the size of a single extent of wired memory
     * to try and limit the damage to the system if
     * too many pages get wired down
     */
    if (map_size > (1 << 30)) {
        return KERN_RESOURCE_SHORTAGE;
    }

    /*
     * Guard pages:
     *
     * Guard pages are implemented as fictitious pages.  By placing guard
     * pages on either end of a stack, they can help detect cases where a
     * thread walks off either end of its stack.  They are allocated and
     * set up here and attempts to access those pages are trapped in
     * vm_fault_page().
     *
     * The map_size we were passed may include extra space for
     * guard pages.  If those were requested, then back it out of fill_size
     * since vm_map_find_space() takes just the actual size not including
     * guard pages.  Similarly, fill_start indicates where the actual pages
     * will begin in the range.
     */
    fill_start = 0;
    fill_size = map_size;
    if (flags & KMA_GUARD_FIRST) {
        vm_alloc_flags |= VM_FLAGS_GUARD_BEFORE;
        fill_start += PAGE_SIZE_64;
        fill_size -= PAGE_SIZE_64;
        if (map_size < fill_start + fill_size) {
            /* no space for a guard page */
            *addrp = 0;
            return KERN_INVALID_ARGUMENT;
        }
        guard_page_count++;
    }
    if (flags & KMA_GUARD_LAST) {
        vm_alloc_flags |= VM_FLAGS_GUARD_AFTER;
        fill_size -= PAGE_SIZE_64;
        if (map_size <= fill_start + fill_size) {
            /* no space for a guard page */
            *addrp = 0;
            return KERN_INVALID_ARGUMENT;
        }
        guard_page_count++;
    }
    wired_page_count = (int) (fill_size / PAGE_SIZE_64);
    assert(wired_page_count * PAGE_SIZE_64 == fill_size);

    for (i = 0; i < guard_page_count; i++) {
        for (;;) {
            mem = vm_page_grab_guard();

            if (mem != VM_PAGE_NULL)
                break;
            if (flags & KMA_NOPAGEWAIT) {
                kr = KERN_RESOURCE_SHORTAGE;
                goto out;
            }
            vm_page_more_fictitious();
        }
        mem->pageq.next = (queue_entry_t)guard_page_list;
        guard_page_list = mem;
    }

    for (i = 0; i < wired_page_count; i++) {
        uint64_t unavailable;

        for (;;) {
            if (flags & KMA_LOMEM)
                mem = vm_page_grablo();
            else
                mem = vm_page_grab();

            if (mem != VM_PAGE_NULL)
                break;

            if (flags & KMA_NOPAGEWAIT) {
                kr = KERN_RESOURCE_SHORTAGE;
                goto out;
            }
            if ((flags & KMA_LOMEM) && (vm_lopage_needed == TRUE)) {
                kr = KERN_RESOURCE_SHORTAGE;
                goto out;
            }
            unavailable = (vm_page_wire_count +
                vm_page_free_target) * PAGE_SIZE;

            if (unavailable > max_mem ||
                map_size > (max_mem - unavailable)) {
                kr = KERN_RESOURCE_SHORTAGE;
                goto out;
            }
            VM_PAGE_WAIT();
        }
        mem->pageq.next = (queue_entry_t)wired_page_list;
        wired_page_list = mem;
    }

    /*
     * Allocate a new object (if necessary).  We must do this before
     * locking the map, or risk deadlock with the default pager.
     */
    if ((flags & KMA_KOBJECT) != 0) {
        object = kernel_object;
        vm_object_reference(object);
    } else {
        object = vm_object_allocate(map_size);
    }

    kr = vm_map_find_space(map, &map_addr,
        fill_size, map_mask,
        vm_alloc_flags, &entry);
    if (KERN_SUCCESS != kr) {
        vm_object_deallocate(object);
        goto out;
    }

    entry->object.vm_object = object;
    entry->offset = offset = (object == kernel_object) ? map_addr : 0;

    entry->wired_count++;

    if (flags & KMA_PERMANENT)
        entry->permanent = TRUE;

    if (object != kernel_object)
        vm_object_reference(object);

    vm_object_lock(object);
    vm_map_unlock(map);

    pg_offset = 0;

    if (fill_start) {
        if (guard_page_list == NULL)
            panic("kernel_memory_allocate: guard_page_list == NULL");

        mem = guard_page_list;
        guard_page_list = (vm_page_t)mem->pageq.next;
        mem->pageq.next = NULL;

        vm_page_insert(mem, object, offset + pg_offset);

        mem->busy = FALSE;
        pg_offset += PAGE_SIZE_64;
    }
    for (pg_offset = fill_start; pg_offset < fill_start + fill_size;
        pg_offset += PAGE_SIZE_64) {
        if (wired_page_list == NULL)
            panic("kernel_memory_allocate: wired_page_list == NULL");

        mem = wired_page_list;
        wired_page_list = (vm_page_t)mem->pageq.next;
        mem->pageq.next = NULL;
        mem->wire_count++;

        vm_page_insert(mem, object, offset + pg_offset);

        mem->busy = FALSE;
        mem->pmapped = TRUE;
        mem->wpmapped = TRUE;

        PMAP_ENTER(kernel_pmap, map_addr + pg_offset, mem,
            VM_PROT_READ | VM_PROT_WRITE,
            object->wimg_bits & VM_WIMG_MASK, TRUE);

        if (flags & KMA_NOENCRYPT) {
            bzero(CAST_DOWN(void *, (map_addr + pg_offset)), PAGE_SIZE);

            pmap_set_noencrypt(mem->phys_page);
        }
    }
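/*
 * Illustrative sketch (hypothetical caller, not from the original source):
 * allocating a kernel stack with guard pages on both ends.  Per the comment
 * above, the requested size must already include the space for the guard
 * pages, which are then backed out of the wired fill range.
 */
kern_return_t
example_alloc_guarded_stack(vm_offset_t *addrp, vm_size_t stack_size)
{
    return kernel_memory_allocate(kernel_map, addrp,
        stack_size + 2 * PAGE_SIZE, 0,
        KMA_GUARD_FIRST | KMA_GUARD_LAST | KMA_KOBJECT);
}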
static void
mac_proc_vm_revoke_recurse(struct thread *td, struct ucred *cred,
    struct vm_map *map)
{
    vm_map_entry_t vme;
    int vfslocked, result;
    vm_prot_t revokeperms;
    vm_object_t backing_object, object;
    vm_ooffset_t offset;
    struct vnode *vp;
    struct mount *mp;

    if (!mac_mmap_revocation)
        return;

    vm_map_lock(map);
    for (vme = map->header.next; vme != &map->header; vme = vme->next) {
        if (vme->eflags & MAP_ENTRY_IS_SUB_MAP) {
            mac_proc_vm_revoke_recurse(td, cred,
                vme->object.sub_map);
            continue;
        }
        /*
         * Skip over entries that obviously are not shared.
         */
        if (vme->eflags & (MAP_ENTRY_COW | MAP_ENTRY_NOSYNC) ||
            !vme->max_protection)
            continue;
        /*
         * Drill down to the deepest backing object.
         */
        offset = vme->offset;
        object = vme->object.vm_object;
        if (object == NULL)
            continue;
        VM_OBJECT_LOCK(object);
        while ((backing_object = object->backing_object) != NULL) {
            VM_OBJECT_LOCK(backing_object);
            offset += object->backing_object_offset;
            VM_OBJECT_UNLOCK(object);
            object = backing_object;
        }
        VM_OBJECT_UNLOCK(object);
        /*
         * At the moment, vm_maps and objects aren't considered by
         * the MAC system, so only things with backing by a normal
         * object (read: vnodes) are checked.
         */
        if (object->type != OBJT_VNODE)
            continue;
        vp = (struct vnode *)object->handle;
        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
        result = vme->max_protection;
        mac_vnode_check_mmap_downgrade(cred, vp, &result);
        VOP_UNLOCK(vp, 0);
        /*
         * Determine which of the currently allowed maximum
         * protections a policy now requires us to revoke.
         */
        revokeperms = vme->max_protection & ~result;
        if (!revokeperms) {
            VFS_UNLOCK_GIANT(vfslocked);
            continue;
        }
        printf("pid %ld: revoking %s perms from %#lx:%ld "
            "(max %s/cur %s)\n", (long)td->td_proc->p_pid,
            prot2str(revokeperms), (u_long)vme->start,
            (long)(vme->end - vme->start),
            prot2str(vme->max_protection),
            prot2str(vme->protection));
        /*
         * This is the really simple case: if a map has more
         * max_protection than is allowed, but it's not being
         * actually used (that is, the current protection is still
         * allowed), we can just wipe it out and do nothing more.
         */
        if ((vme->protection & revokeperms) == 0) {
            vme->max_protection -= revokeperms;
        } else {
            if (revokeperms & VM_PROT_WRITE) {
                /*
                 * In the more complicated case, flush out all
                 * pending changes to the object then turn it
                 * copy-on-write.
                 */
                vm_object_reference(object);
                (void) vn_start_write(vp, &mp, V_WAIT);
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
                VM_OBJECT_LOCK(object);
                vm_object_page_clean(object, offset, offset +
                    vme->end - vme->start, OBJPC_SYNC);
                VM_OBJECT_UNLOCK(object);
                VOP_UNLOCK(vp, 0);
                vn_finished_write(mp);
                vm_object_deallocate(object);
                /*
                 * Why bother if there's no read permissions
                 * anymore?  For the rest, we need to leave
                 * the write permissions on for COW, or
                 * remove them entirely if configured to.
                 */
                if (!mac_mmap_revocation_via_cow) {
                    vme->max_protection &= ~VM_PROT_WRITE;
                    vme->protection &= ~VM_PROT_WRITE;
                }
                if ((revokeperms & VM_PROT_READ) == 0)
                    vme->eflags |= MAP_ENTRY_COW |
                        MAP_ENTRY_NEEDS_COPY;
            }
            if (revokeperms & VM_PROT_EXECUTE) {
                vme->max_protection &= ~VM_PROT_EXECUTE;
                vme->protection &= ~VM_PROT_EXECUTE;
            }
            if (revokeperms & VM_PROT_READ) {
                vme->max_protection = 0;
                vme->protection = 0;
            }
            pmap_protect(map->pmap, vme->start, vme->end,
                vme->protection & ~revokeperms);
            vm_map_simplify_entry(map, vme);
        }
        VFS_UNLOCK_GIANT(vfslocked);
    }
    vm_map_unlock(map);
}
kern_return_t
projected_buffer_allocate(
    vm_map_t     map,
    vm_size_t    size,
    int          persistence,
    vm_offset_t  *kernel_p,
    vm_offset_t  *user_p,
    vm_prot_t    protection,
    vm_inherit_t inheritance)  /* Currently only VM_INHERIT_NONE supported */
{
    vm_object_t object;
    vm_map_entry_t u_entry, k_entry;
    vm_offset_t addr;
    vm_size_t r_size;
    kern_return_t kr;

    if (map == VM_MAP_NULL || map == kernel_map)
        return(KERN_INVALID_ARGUMENT);

    /*
     * Allocate a new object.
     */
    size = round_page(size);
    object = vm_object_allocate(size);

    vm_map_lock(kernel_map);
    kr = vm_map_find_entry(kernel_map, &addr, size, (vm_offset_t) 0,
        VM_OBJECT_NULL, &k_entry);
    if (kr != KERN_SUCCESS) {
        vm_map_unlock(kernel_map);
        vm_object_deallocate(object);
        return kr;
    }

    k_entry->object.vm_object = object;
    if (!persistence)
        k_entry->projected_on = (vm_map_entry_t) -1;
        /* Mark entry so as to automatically deallocate it when
           last corresponding user entry is deallocated */
    vm_map_unlock(kernel_map);
    *kernel_p = addr;

    vm_map_lock(map);
    kr = vm_map_find_entry(map, &addr, size, (vm_offset_t) 0,
        VM_OBJECT_NULL, &u_entry);
    if (kr != KERN_SUCCESS) {
        vm_map_unlock(map);
        vm_map_lock(kernel_map);
        vm_map_entry_delete(kernel_map, k_entry);
        vm_map_unlock(kernel_map);
        vm_object_deallocate(object);
        return kr;
    }

    u_entry->object.vm_object = object;
    vm_object_reference(object);
    u_entry->projected_on = k_entry;
    /* Creates coupling with kernel mapping of the buffer, and
       also guarantees that user cannot directly manipulate
       buffer VM entry */
    u_entry->protection = protection;
    u_entry->max_protection = protection;
    u_entry->inheritance = inheritance;
    vm_map_unlock(map);
    *user_p = addr;

    /*
     * Allocate wired-down memory in the object,
     * and enter it in the kernel pmap.
     */
    kmem_alloc_pages(object, 0,
        *kernel_p, *kernel_p + size,
        VM_PROT_READ | VM_PROT_WRITE);
    memset((void*) *kernel_p, 0, size);     /* Zero fill */

    /* Set up physical mappings for user pmap */
    pmap_pageable(map->pmap, *user_p, *user_p + size, FALSE);
    for (r_size = 0; r_size < size; r_size += PAGE_SIZE) {
        addr = pmap_extract(kernel_pmap, *kernel_p + r_size);
        pmap_enter(map->pmap, *user_p + r_size, addr, protection, TRUE);
    }

    return(KERN_SUCCESS);
}
kern_return_t
kmem_alloc_aligned(
    vm_map_t    map,
    vm_offset_t *addrp,
    vm_size_t   size)
{
    vm_map_entry_t entry;
    vm_offset_t offset;
    vm_offset_t addr;
    unsigned int attempts;
    kern_return_t kr;

    if ((size & (size - 1)) != 0)
        panic("kmem_alloc_aligned");

    /*
     * Use the kernel object for wired-down kernel pages.
     * Assume that no region of the kernel object is
     * referenced more than once.  We want vm_map_find_entry
     * to extend an existing entry if possible.
     */
    size = round_page(size);

    attempts = 0;

retry:
    vm_map_lock(map);
    kr = vm_map_find_entry(map, &addr, size, size - 1,
        kernel_object, &entry);
    if (kr != KERN_SUCCESS) {
        vm_map_unlock(map);

        if (attempts == 0) {
            attempts++;
            slab_collect();
            goto retry;
        }

        printf_once("no more room for kmem_alloc_aligned in %p\n", map);
        return kr;
    }

    /*
     * Since we didn't know where the new region would
     * start, we couldn't supply the correct offset into
     * the kernel object.  We only initialize the entry
     * if we aren't extending an existing entry.
     */
    offset = addr - VM_MIN_KERNEL_ADDRESS;

    if (entry->object.vm_object == VM_OBJECT_NULL) {
        vm_object_reference(kernel_object);

        entry->object.vm_object = kernel_object;
        entry->offset = offset;
    }

    /*
     * Since we have not given out this address yet,
     * it is safe to unlock the map.
     */
    vm_map_unlock(map);

    /*
     * Allocate wired-down memory in the kernel_object,
     * for this entry, and enter it in the kernel pmap.
     */
    kmem_alloc_pages(kernel_object, offset,
        addr, addr + size,
        VM_PROT_DEFAULT);

    /*
     * Return the memory, not zeroed.
     */
    *addrp = addr;
    return KERN_SUCCESS;
}
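/*
 * Illustrative sketch (hypothetical helper, not kernel API):
 * kmem_alloc_aligned() panics unless the size is a power of two, since the
 * size doubles as the alignment mask passed to vm_map_find_entry().  A
 * caller with an arbitrary request must round up first, e.g. with:
 */
static vm_size_t
example_ceil_pow2(vm_size_t size)
{
    vm_size_t p = PAGE_SIZE;

    while (p < size)
        p <<= 1;
    return p;
}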
/*
 * kmem_realloc:
 *
 * Reallocate wired-down memory in the kernel's address map
 * or a submap.  Newly allocated pages are not zeroed.
 * This can only be used on regions allocated with kmem_alloc.
 *
 * If successful, the pages in the old region are mapped twice.
 * The old region is unchanged.  Use kmem_free to get rid of it.
 */
kern_return_t
kmem_realloc(
    vm_map_t    map,
    vm_offset_t oldaddr,
    vm_size_t   oldsize,
    vm_offset_t *newaddrp,
    vm_size_t   newsize)
{
    vm_offset_t oldmin, oldmax;
    vm_offset_t newaddr;
    vm_object_t object;
    vm_map_entry_t oldentry, newentry;
    unsigned int attempts;
    kern_return_t kr;

    oldmin = trunc_page(oldaddr);
    oldmax = round_page(oldaddr + oldsize);
    oldsize = oldmax - oldmin;
    newsize = round_page(newsize);

    /*
     * Find space for the new region.
     */
    attempts = 0;

retry:
    vm_map_lock(map);
    kr = vm_map_find_entry(map, &newaddr, newsize, (vm_offset_t) 0,
        VM_OBJECT_NULL, &newentry);
    if (kr != KERN_SUCCESS) {
        vm_map_unlock(map);

        if (attempts == 0) {
            attempts++;
            slab_collect();
            goto retry;
        }

        printf_once("no more room for kmem_realloc in %p\n", map);
        return kr;
    }

    /*
     * Find the VM object backing the old region.
     */
    if (!vm_map_lookup_entry(map, oldmin, &oldentry))
        panic("kmem_realloc");
    object = oldentry->object.vm_object;

    /*
     * Increase the size of the object and
     * fill in the new region.
     */
    vm_object_reference(object);
    vm_object_lock(object);
    if (object->size != oldsize)
        panic("kmem_realloc");
    object->size = newsize;
    vm_object_unlock(object);

    newentry->object.vm_object = object;
    newentry->offset = 0;

    /*
     * Since we have not given out this address yet,
     * it is safe to unlock the map.  We are trusting
     * that nobody will play with either region.
     */
    vm_map_unlock(map);

    /*
     * Remap the pages in the old region and
     * allocate more pages for the new region.
     */
    kmem_remap_pages(object, 0,
        newaddr, newaddr + oldsize,
        VM_PROT_DEFAULT);
    kmem_alloc_pages(object, oldsize,
        newaddr + oldsize, newaddr + newsize,
        VM_PROT_DEFAULT);

    *newaddrp = newaddr;
    return KERN_SUCCESS;
}
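/*
 * Illustrative sketch (hypothetical caller): per the contract documented
 * above, a successful kmem_realloc() leaves the old region mapped, so the
 * caller releases it with kmem_free() after switching to the new mapping.
 */
kern_return_t
example_grow_region(vm_offset_t *addrp, vm_size_t oldsize, vm_size_t newsize)
{
    vm_offset_t newaddr;
    kern_return_t kr;

    kr = kmem_realloc(kernel_map, *addrp, oldsize, &newaddr, newsize);
    if (kr != KERN_SUCCESS)
        return kr;

    kmem_free(kernel_map, *addrp, oldsize);     /* drop the old mapping */
    *addrp = newaddr;
    return KERN_SUCCESS;
}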
kern_return_t
projected_buffer_map(
    vm_map_t     map,
    vm_offset_t  kernel_addr,
    vm_size_t    size,
    vm_offset_t  *user_p,
    vm_prot_t    protection,
    vm_inherit_t inheritance)  /* Currently only VM_INHERIT_NONE supported */
{
    vm_map_entry_t u_entry, k_entry;
    vm_offset_t physical_addr, user_addr;
    vm_size_t r_size;
    kern_return_t kr;

    /*
     * Find entry in kernel map
     */
    size = round_page(size);
    if (map == VM_MAP_NULL || map == kernel_map ||
        !vm_map_lookup_entry(kernel_map, kernel_addr, &k_entry) ||
        kernel_addr + size > k_entry->vme_end)
        return(KERN_INVALID_ARGUMENT);

    /*
     * Create entry in user task
     */
    vm_map_lock(map);
    kr = vm_map_find_entry(map, &user_addr, size, (vm_offset_t) 0,
        VM_OBJECT_NULL, &u_entry);
    if (kr != KERN_SUCCESS) {
        vm_map_unlock(map);
        return kr;
    }

    u_entry->object.vm_object = k_entry->object.vm_object;
    vm_object_reference(k_entry->object.vm_object);
    u_entry->offset = kernel_addr - k_entry->vme_start + k_entry->offset;
    u_entry->projected_on = k_entry;
    /* Creates coupling with kernel mapping of the buffer, and
       also guarantees that user cannot directly manipulate
       buffer VM entry */
    u_entry->protection = protection;
    u_entry->max_protection = protection;
    u_entry->inheritance = inheritance;
    u_entry->wired_count = k_entry->wired_count;
    vm_map_unlock(map);
    *user_p = user_addr;

    /* Set up physical mappings for user pmap */
    pmap_pageable(map->pmap, user_addr, user_addr + size,
        !k_entry->wired_count);
    for (r_size = 0; r_size < size; r_size += PAGE_SIZE) {
        physical_addr = pmap_extract(kernel_pmap, kernel_addr + r_size);
        pmap_enter(map->pmap, user_addr + r_size, physical_addr,
            protection, k_entry->wired_count);
    }

    return(KERN_SUCCESS);
}
int
proc_rwmem(struct proc *p, struct uio *uio)
{
    struct vmspace *vm;
    vm_map_t map;
    vm_object_t object = NULL;
    vm_offset_t pageno = 0;     /* page number */
    vm_prot_t reqprot;
    vm_offset_t kva;
    int error, writing;

    GIANT_REQUIRED;

    /*
     * if the vmspace is in the midst of being deallocated or the
     * process is exiting, don't try to grab anything.  The page table
     * usage in that process can be messed up.
     */
    vm = p->p_vmspace;
    if ((p->p_flag & P_WEXIT))
        return (EFAULT);
    if (vm->vm_refcnt < 1)
        return (EFAULT);
    ++vm->vm_refcnt;
    /*
     * The map we want...
     */
    map = &vm->vm_map;

    writing = uio->uio_rw == UIO_WRITE;
    reqprot = writing ? (VM_PROT_WRITE | VM_PROT_OVERRIDE_WRITE) :
        VM_PROT_READ;

    kva = kmem_alloc_pageable(kernel_map, PAGE_SIZE);

    /*
     * Only map in one page at a time.  We don't have to, but it
     * makes things easier.  This way is trivial - right?
     */
    do {
        vm_map_t tmap;
        vm_offset_t uva;
        int page_offset;        /* offset into page */
        vm_map_entry_t out_entry;
        vm_prot_t out_prot;
        boolean_t wired;
        vm_pindex_t pindex;
        u_int len;
        vm_page_t m;

        object = NULL;

        uva = (vm_offset_t)uio->uio_offset;

        /*
         * Get the page number of this segment.
         */
        pageno = trunc_page(uva);
        page_offset = uva - pageno;

        /*
         * How many bytes to copy
         */
        len = min(PAGE_SIZE - page_offset, uio->uio_resid);

        /*
         * Fault the page on behalf of the process
         */
        error = vm_fault(map, pageno, reqprot, VM_FAULT_NORMAL);
        if (error) {
            error = EFAULT;
            break;
        }

        /*
         * Now we need to get the page.  out_entry, out_prot, wired,
         * and single_use aren't used.  One would think the vm code
         * would be a *bit* nicer...  We use tmap because
         * vm_map_lookup() can change the map argument.
         */
        tmap = map;
        error = vm_map_lookup(&tmap, pageno, reqprot, &out_entry,
            &object, &pindex, &out_prot, &wired);

        if (error) {
            error = EFAULT;

            /*
             * Make sure that there is no residue in 'object' from
             * an error return on vm_map_lookup.
             */
            object = NULL;

            break;
        }

        m = vm_page_lookup(object, pindex);

        /* Allow fallback to backing objects if we are reading */

        while (m == NULL && !writing && object->backing_object) {

            pindex += OFF_TO_IDX(object->backing_object_offset);
            object = object->backing_object;

            m = vm_page_lookup(object, pindex);
        }

        if (m == NULL) {
            error = EFAULT;

            /*
             * Make sure that there is no residue in 'object' from
             * an error return on vm_map_lookup.
             */
            object = NULL;

            vm_map_lookup_done(tmap, out_entry);

            break;
        }

        /*
         * Wire the page into memory
         */
        vm_page_lock_queues();
        vm_page_wire(m);
        vm_page_unlock_queues();

        /*
         * We're done with tmap now.
         * But reference the object first, so that we won't lose
         * it.
         */
        vm_object_reference(object);
        vm_map_lookup_done(tmap, out_entry);

        pmap_qenter(kva, &m, 1);

        /*
         * Now do the i/o move.
         */
        error = uiomove((caddr_t)(kva + page_offset), len, uio);

        pmap_qremove(kva, 1);

        /*
         * release the page and the object
         */
        vm_page_lock_queues();
        vm_page_unwire(m, 1);
        vm_page_unlock_queues();
        vm_object_deallocate(object);

        object = NULL;

    } while (error == 0 && uio->uio_resid > 0);

    if (object)
        vm_object_deallocate(object);

    kmem_free(kernel_map, kva, PAGE_SIZE);
    vmspace_free(vm);
    return (error);
}
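/*
 * Illustrative sketch (hypothetical caller in the style of the ptrace(2)
 * handlers that use proc_rwmem()): reading one word from another process by
 * wrapping a kernel buffer in a UIO_SYSSPACE uio.
 */
static int
example_read_word(struct proc *p, vm_offset_t va, int *valp)
{
    struct iovec iov;
    struct uio uio;

    iov.iov_base = (caddr_t)valp;
    iov.iov_len = sizeof(*valp);
    uio.uio_iov = &iov;
    uio.uio_iovcnt = 1;
    uio.uio_offset = (off_t)va;
    uio.uio_resid = sizeof(*valp);
    uio.uio_segflg = UIO_SYSSPACE;
    uio.uio_rw = UIO_READ;
    uio.uio_td = curthread;
    return (proc_rwmem(p, &uio));
}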