void
drm_gem_object_release(struct drm_gem_object *obj)
{

    /*
     * obj->vm_obj can be NULL for private gem objects.
     */
    vm_object_deallocate(obj->vm_obj);
}
/*
 * MPSAFE
 */
static vm_object_t
phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
    vm_ooffset_t foff, struct ucred *cred)
{
    vm_object_t object, object1;
    vm_pindex_t pindex;

    /*
     * Offset should be page aligned.
     */
    if (foff & PAGE_MASK)
        return (NULL);

    pindex = OFF_TO_IDX(foff + PAGE_MASK + size);

    if (handle != NULL) {
        mtx_lock(&phys_pager_mtx);
        /*
         * Look up pager, creating as necessary.
         */
        object1 = NULL;
        object = vm_pager_object_lookup(&phys_pager_object_list, handle);
        if (object == NULL) {
            /*
             * Allocate object and associate it with the pager.
             */
            mtx_unlock(&phys_pager_mtx);
            object1 = vm_object_allocate(OBJT_PHYS, pindex);
            mtx_lock(&phys_pager_mtx);
            object = vm_pager_object_lookup(&phys_pager_object_list,
                handle);
            if (object != NULL) {
                /*
                 * We raced with other thread while
                 * allocating object.
                 */
                if (pindex > object->size)
                    object->size = pindex;
            } else {
                object = object1;
                object1 = NULL;
                object->handle = handle;
                TAILQ_INSERT_TAIL(&phys_pager_object_list, object,
                    pager_object_list);
            }
        } else {
            if (pindex > object->size)
                object->size = pindex;
        }
        mtx_unlock(&phys_pager_mtx);
        vm_object_deallocate(object1);
    } else {
        object = vm_object_allocate(OBJT_PHYS, pindex);
    }
    return (object);
}
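The lookup path above drops phys_pager_mtx around vm_object_allocate() (which may sleep), re-checks the list after reacquiring the lock, and discards the speculative object with vm_object_deallocate() if another thread won the race. Below is a minimal userspace sketch of the same "allocate outside the lock, re-check under the lock, discard the loser" idiom; the names (cache_lookup_or_create, struct entry) and the pthread/calloc primitives are illustrative stand-ins, not the kernel API.

#include <pthread.h>
#include <stdlib.h>

struct entry {
    const void *handle;
    struct entry *next;
};

static pthread_mutex_t cache_mtx = PTHREAD_MUTEX_INITIALIZER;
static struct entry *cache_head;

static struct entry *
cache_lookup(const void *handle)
{
    struct entry *e;

    for (e = cache_head; e != NULL; e = e->next)
        if (e->handle == handle)
            return (e);
    return (NULL);
}

struct entry *
cache_lookup_or_create(const void *handle)
{
    struct entry *e, *candidate = NULL;

    pthread_mutex_lock(&cache_mtx);
    e = cache_lookup(handle);
    if (e == NULL) {
        /* Drop the lock across the (possibly slow) allocation. */
        pthread_mutex_unlock(&cache_mtx);
        candidate = calloc(1, sizeof(*candidate));
        pthread_mutex_lock(&cache_mtx);
        /* Re-check: another thread may have created the entry. */
        e = cache_lookup(handle);
        if (e == NULL && candidate != NULL) {
            candidate->handle = handle;
            candidate->next = cache_head;
            cache_head = candidate;
            e = candidate;
            candidate = NULL;
        }
    }
    pthread_mutex_unlock(&cache_mtx);
    /* The loser's speculative allocation is simply discarded. */
    free(candidate);
    return (e);
}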
static void
shm_drop(struct shmfd *shmfd)
{

    if (refcount_release(&shmfd->shm_refs)) {
#ifdef MAC
        mac_posixshm_destroy(shmfd);
#endif
        vm_object_deallocate(shmfd->shm_object);
        free(shmfd, M_SHMFD);
    }
}
/*
 * Helper routines to allow the backing object of a shared memory file
 * descriptor to be mapped in the kernel.
 */
int
shm_map(struct file *fp, size_t size, off_t offset, void **memp)
{
    struct shmfd *shmfd;
    vm_offset_t kva, ofs;
    vm_object_t obj;
    int rv;

    if (fp->f_type != DTYPE_SHM)
        return (EINVAL);
    shmfd = fp->f_data;
    obj = shmfd->shm_object;
    VM_OBJECT_LOCK(obj);
    /*
     * XXXRW: This validation is probably insufficient, and subject to
     * sign errors.  It should be fixed.
     */
    if (offset >= shmfd->shm_size ||
        offset + size > round_page(shmfd->shm_size)) {
        VM_OBJECT_UNLOCK(obj);
        return (EINVAL);
    }

    shmfd->shm_kmappings++;
    vm_object_reference_locked(obj);
    VM_OBJECT_UNLOCK(obj);

    /* Map the object into the kernel_map and wire it. */
    kva = vm_map_min(kernel_map);
    ofs = offset & PAGE_MASK;
    offset = trunc_page(offset);
    size = round_page(size + ofs);
    rv = vm_map_find(kernel_map, obj, offset, &kva, size,
        VMFS_ALIGNED_SPACE, VM_PROT_READ | VM_PROT_WRITE,
        VM_PROT_READ | VM_PROT_WRITE, 0);
    if (rv == KERN_SUCCESS) {
        rv = vm_map_wire(kernel_map, kva, kva + size,
            VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
        if (rv == KERN_SUCCESS) {
            *memp = (void *)(kva + ofs);
            return (0);
        }
        vm_map_remove(kernel_map, kva, kva + size);
    } else
        vm_object_deallocate(obj);

    /* On failure, drop our mapping reference. */
    VM_OBJECT_LOCK(obj);
    shmfd->shm_kmappings--;
    VM_OBJECT_UNLOCK(obj);

    return (vm_mmap_to_errno(rv));
}
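shm_map() accepts an offset that need not be page aligned: it splits off the sub-page remainder, maps from the preceding page boundary, grows the length to cover the remainder, and returns the kernel address re-adjusted by it. The following worked example, plain userspace C with a hypothetical 4 KiB page size and sample numbers, just traces that arithmetic; it is not part of the kernel code.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE       4096UL
#define PAGE_MASK       (PAGE_SIZE - 1)
#define trunc_page(x)   ((x) & ~PAGE_MASK)
#define round_page(x)   (((x) + PAGE_MASK) & ~PAGE_MASK)

int
main(void)
{
    uint64_t offset = 10000;            /* caller-requested offset */
    uint64_t size = 6000;               /* caller-requested length */
    uint64_t ofs, map_offset, map_size;

    ofs = offset & PAGE_MASK;           /* 10000 % 4096 = 1808 */
    map_offset = trunc_page(offset);    /* 8192: page-aligned start */
    map_size = round_page(size + ofs);  /* round_page(7808) = 8192 */

    printf("map bytes [%llu, %llu), hand back kva + %llu\n",
        (unsigned long long)map_offset,
        (unsigned long long)(map_offset + map_size),
        (unsigned long long)ofs);
    return (0);
}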
DECLHIDDEN(int) rtR0MemAllocEx(size_t cb, uint32_t fFlags, PRTMEMHDR *ppHdr)
{
    size_t cbAllocated = cb;
    PRTMEMHDR pHdr = NULL;

#ifdef RT_ARCH_AMD64
    /*
     * Things are a bit more complicated on AMD64 for executable memory
     * because we need to be in the ~2GB..~0 range for code.
     */
    if (fFlags & RTMEMHDR_FLAG_EXEC)
    {
        if (fFlags & RTMEMHDR_FLAG_ANY_CTX)
            return VERR_NOT_SUPPORTED;

# ifdef USE_KMEM_ALLOC_PROT
        pHdr = (PRTMEMHDR)kmem_alloc_prot(kernel_map, cb + sizeof(*pHdr),
                                          VM_PROT_ALL, VM_PROT_ALL, KERNBASE);
# else
        vm_object_t pVmObject = NULL;
        vm_offset_t Addr = KERNBASE;
        cbAllocated = RT_ALIGN_Z(cb + sizeof(*pHdr), PAGE_SIZE);

        pVmObject = vm_object_allocate(OBJT_DEFAULT, cbAllocated >> PAGE_SHIFT);
        if (!pVmObject)
            return VERR_NO_EXEC_MEMORY;

        /* Addr contains a start address vm_map_find will start searching for suitable space at. */
        int rc = vm_map_find(kernel_map, pVmObject, 0, &Addr,
                             cbAllocated, TRUE, VM_PROT_ALL, VM_PROT_ALL, 0);
        if (rc == KERN_SUCCESS)
        {
            rc = vm_map_wire(kernel_map, Addr, Addr + cbAllocated,
                             VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
            if (rc == KERN_SUCCESS)
            {
                pHdr = (PRTMEMHDR)Addr;

                if (fFlags & RTMEMHDR_FLAG_ZEROED)
                    bzero(pHdr, cbAllocated);
            }
            else
                vm_map_remove(kernel_map, Addr, Addr + cbAllocated);
        }
        else
            vm_object_deallocate(pVmObject);
# endif
    }
    else
#endif
    {
static int rtR0MemObjFreeBSDAllocPhysPages(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJTYPE enmType,
                                           size_t cb,
                                           RTHCPHYS PhysHighest, size_t uAlignment,
                                           bool fContiguous, int rcNoMem)
{
    uint32_t   cPages = atop(cb);
    vm_paddr_t VmPhysAddrHigh;

    /* create the object. */
    PRTR0MEMOBJFREEBSD pMemFreeBSD = (PRTR0MEMOBJFREEBSD)rtR0MemObjNew(sizeof(*pMemFreeBSD),
                                                                       enmType, NULL, cb);
    if (!pMemFreeBSD)
        return VERR_NO_MEMORY;

    pMemFreeBSD->pObject = vm_object_allocate(OBJT_PHYS, atop(cb));

    if (PhysHighest != NIL_RTHCPHYS)
        VmPhysAddrHigh = PhysHighest;
    else
        VmPhysAddrHigh = ~(vm_paddr_t)0;

    int rc = rtR0MemObjFreeBSDPhysAllocHelper(pMemFreeBSD->pObject, cPages, VmPhysAddrHigh,
                                              uAlignment, fContiguous, true, rcNoMem);
    if (RT_SUCCESS(rc))
    {
        if (fContiguous)
        {
            Assert(enmType == RTR0MEMOBJTYPE_PHYS);
#if __FreeBSD_version >= 1000030
            VM_OBJECT_WLOCK(pMemFreeBSD->pObject);
#else
            VM_OBJECT_LOCK(pMemFreeBSD->pObject);
#endif
            pMemFreeBSD->Core.u.Phys.PhysBase = VM_PAGE_TO_PHYS(vm_page_find_least(pMemFreeBSD->pObject, 0));
#if __FreeBSD_version >= 1000030
            VM_OBJECT_WUNLOCK(pMemFreeBSD->pObject);
#else
            VM_OBJECT_UNLOCK(pMemFreeBSD->pObject);
#endif
            pMemFreeBSD->Core.u.Phys.fAllocated = true;
        }

        *ppMem = &pMemFreeBSD->Core;
    }
    else
    {
        vm_object_deallocate(pMemFreeBSD->pObject);
        rtR0MemObjDelete(&pMemFreeBSD->Core);
    }

    return rc;
}
vm_object_t
vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len,
    vm_paddr_t hpa)
{
    int error;
    vm_object_t obj;
    struct sglist *sg;

    sg = sglist_alloc(1, M_WAITOK);
    error = sglist_append_phys(sg, hpa, len);
    KASSERT(error == 0, ("error %d appending physaddr to sglist", error));

    obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL);
    if (obj != NULL) {
        /*
         * VT-x ignores the MTRR settings when figuring out the
         * memory type for translations obtained through EPT.
         *
         * Therefore we explicitly force the pages provided by
         * this object to be mapped as uncacheable.
         */
        VM_OBJECT_WLOCK(obj);
        error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE);
        VM_OBJECT_WUNLOCK(obj);
        if (error != KERN_SUCCESS) {
            panic("vmm_mmio_alloc: vm_object_set_memattr error %d",
                error);
        }
        error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
            VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0);
        if (error != KERN_SUCCESS) {
            vm_object_deallocate(obj);
            obj = NULL;
        }
    }

    /*
     * Drop the reference on the sglist.
     *
     * If the scatter/gather object was successfully allocated then it
     * has incremented the reference count on the sglist. Dropping the
     * initial reference count ensures that the sglist will be freed
     * when the object is deallocated.
     *
     * If the object could not be allocated then we end up freeing the
     * sglist.
     */
    sglist_free(sg);

    return (obj);
}
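The closing comment describes a reference handoff: the pager object takes its own reference on the sglist, so vmm_mmio_alloc() drops its initial reference with sglist_free() unconditionally, leaving the object as the owner on success and freeing the list on failure. A minimal sketch of that ownership convention, with hypothetical names (struct buf, buf_hold, consumer_attach) rather than the sglist API:

#include <stdatomic.h>
#include <stdlib.h>

struct buf {
    atomic_int refs;
};

static struct buf *
buf_alloc(void)
{
    struct buf *b = calloc(1, sizeof(*b));

    if (b != NULL)
        atomic_init(&b->refs, 1);       /* creator's initial reference */
    return (b);
}

static void
buf_hold(struct buf *b)
{
    atomic_fetch_add(&b->refs, 1);
}

static void
buf_drop(struct buf *b)
{
    if (atomic_fetch_sub(&b->refs, 1) == 1)
        free(b);
}

/* A consumer that either takes its own reference or fails. */
static int
consumer_attach(struct buf *b, struct buf **slot)
{
    if (slot == NULL)
        return (-1);
    buf_hold(b);
    *slot = b;
    return (0);
}

int
create_and_hand_off(struct buf **slot)
{
    struct buf *b = buf_alloc();
    int error;

    if (b == NULL)
        return (-1);
    error = consumer_attach(b, slot);
    /*
     * Drop the initial reference unconditionally: on success the
     * consumer keeps the buffer alive, on failure this frees it.
     */
    buf_drop(b);
    return (error);
}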
int
ttm_tt_swapin(struct ttm_tt *ttm)
{
    vm_object_t obj;
    vm_page_t from_page, to_page;
    int i, ret, rv;

    obj = ttm->swap_storage;

    VM_OBJECT_LOCK(obj);
    vm_object_pip_add(obj, 1);
    for (i = 0; i < ttm->num_pages; ++i) {
        from_page = vm_page_grab(obj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
        if (from_page->valid != VM_PAGE_BITS_ALL) {
            if (vm_pager_has_page(obj, i)) {
                rv = vm_pager_get_page(obj, &from_page, 1);
                if (rv != VM_PAGER_OK) {
                    vm_page_free(from_page);
                    ret = -EIO;
                    goto err_ret;
                }
            } else {
                vm_page_zero_invalid(from_page, TRUE);
            }
        }
        to_page = ttm->pages[i];
        if (unlikely(to_page == NULL)) {
            ret = -ENOMEM;
            vm_page_wakeup(from_page);
            goto err_ret;
        }
        pmap_copy_page(VM_PAGE_TO_PHYS(from_page),
            VM_PAGE_TO_PHYS(to_page));
        vm_page_wakeup(from_page);
    }
    vm_object_pip_wakeup(obj);
    VM_OBJECT_UNLOCK(obj);

    if (!(ttm->page_flags & TTM_PAGE_FLAG_PERSISTENT_SWAP))
        vm_object_deallocate(obj);
    ttm->swap_storage = NULL;
    ttm->page_flags &= ~TTM_PAGE_FLAG_SWAPPED;
    return (0);

err_ret:
    vm_object_pip_wakeup(obj);
    VM_OBJECT_UNLOCK(obj);
    return (ret);
}
static int rtR0MemObjFreeBSDAllocHelper(PRTR0MEMOBJFREEBSD pMemFreeBSD, bool fExecutable,
                                        vm_paddr_t VmPhysAddrHigh, bool fContiguous, int rcNoMem)
{
    vm_offset_t MapAddress = vm_map_min(kernel_map);
    size_t      cPages = atop(pMemFreeBSD->Core.cb);
    int         rc;

    pMemFreeBSD->pObject = vm_object_allocate(OBJT_PHYS, cPages);

    /* No additional object reference for auto-deallocation upon unmapping. */
#if __FreeBSD_version >= 1000055
    rc = vm_map_find(kernel_map, pMemFreeBSD->pObject, 0,
                     &MapAddress, pMemFreeBSD->Core.cb, 0, VMFS_ANY_SPACE,
                     fExecutable ? VM_PROT_ALL : VM_PROT_RW, VM_PROT_ALL, 0);
#else
    rc = vm_map_find(kernel_map, pMemFreeBSD->pObject, 0,
                     &MapAddress, pMemFreeBSD->Core.cb, VMFS_ANY_SPACE,
                     fExecutable ? VM_PROT_ALL : VM_PROT_RW, VM_PROT_ALL, 0);
#endif

    if (rc == KERN_SUCCESS)
    {
        rc = rtR0MemObjFreeBSDPhysAllocHelper(pMemFreeBSD->pObject, cPages,
                                              VmPhysAddrHigh, PAGE_SIZE, fContiguous,
                                              false, rcNoMem);
        if (RT_SUCCESS(rc))
        {
            vm_map_wire(kernel_map, MapAddress, MapAddress + pMemFreeBSD->Core.cb,
                        VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);

            /* Store start address */
            pMemFreeBSD->Core.pv = (void *)MapAddress;
            return VINF_SUCCESS;
        }

        vm_map_remove(kernel_map, MapAddress, MapAddress + pMemFreeBSD->Core.cb);
    }
    else
    {
        rc = rcNoMem; /** @todo fix translation (borrow from darwin) */
        vm_object_deallocate(pMemFreeBSD->pObject);
    }

    rtR0MemObjDelete(&pMemFreeBSD->Core);
    return rc;
}
void
efi_destroy_1t1_map(void)
{
    vm_page_t m;

    if (obj_1t1_pt != NULL) {
        VM_OBJECT_RLOCK(obj_1t1_pt);
        TAILQ_FOREACH(m, &obj_1t1_pt->memq, listq)
            m->wire_count = 0;
        vm_wire_sub(obj_1t1_pt->resident_page_count);
        VM_OBJECT_RUNLOCK(obj_1t1_pt);
        vm_object_deallocate(obj_1t1_pt);
    }

    obj_1t1_pt = NULL;
    efi_pml4 = NULL;
    efi_pml4_page = NULL;
}
static int
spigen_close(struct cdev *cdev, int fflag, int devtype, struct thread *td)
{
    device_t dev = cdev->si_drv1;
    struct spigen_softc *sc = device_get_softc(dev);

    mtx_lock(&sc->sc_mtx);
    if (sc->sc_mmap_buffer != NULL) {
        pmap_qremove(sc->sc_mmap_kvaddr,
            sc->sc_mmap_buffer_size / PAGE_SIZE);
        kva_free(sc->sc_mmap_kvaddr, sc->sc_mmap_buffer_size);
        sc->sc_mmap_kvaddr = 0;
        vm_object_deallocate(sc->sc_mmap_buffer);
        sc->sc_mmap_buffer = NULL;
        sc->sc_mmap_buffer_size = 0;
    }
    mtx_unlock(&sc->sc_mtx);
    return (0);
}
struct drm_gem_object *
drm_gem_object_alloc(struct drm_device *dev, size_t size)
{
    struct drm_gem_object *obj;

    obj = malloc(sizeof(*obj), DRM_MEM_DRIVER, M_WAITOK | M_ZERO);

    if (drm_gem_object_init(dev, obj, size) != 0)
        goto free;

    if (dev->driver->gem_init_object != NULL &&
        dev->driver->gem_init_object(obj) != 0)
        goto dealloc;

    return (obj);

dealloc:
    vm_object_deallocate(obj->vm_obj);
free:
    free(obj, DRM_MEM_DRIVER);
    return (NULL);
}
void
efi_destroy_1t1_map(void)
{
    vm_page_t m;

    if (obj_1t1_pt != NULL) {
        VM_OBJECT_RLOCK(obj_1t1_pt);
        TAILQ_FOREACH(m, &obj_1t1_pt->memq, listq)
            m->wire_count = 0;
        atomic_subtract_int(&vm_cnt.v_wire_count,
            obj_1t1_pt->resident_page_count);
        VM_OBJECT_RUNLOCK(obj_1t1_pt);
        vm_object_deallocate(obj_1t1_pt);
    }

    obj_1t1_pt = NULL;
    efi_l0 = NULL;
    efi_l0_page = NULL;
}
static void
pscnv_gem_pager_dtor(void *handle)
{
    struct drm_gem_object *gem_obj = handle;
    struct pscnv_bo *bo = gem_obj->driver_private;
    struct drm_device *dev = gem_obj->dev;
    vm_object_t devobj;

    DRM_LOCK(dev);
    devobj = cdev_pager_lookup(handle);

    if (devobj != NULL) {
        vm_size_t page_count = OFF_TO_IDX(bo->size);
        vm_page_t m;
        int i;

        VM_OBJECT_LOCK(devobj);
        for (i = 0; i < page_count; i++) {
            m = vm_page_lookup(devobj, i);
            if (!m)
                continue;
            if (pscnv_mem_debug > 0)
                NV_WARN(dev, "Freeing %010llx + %08llx (%p\n",
                    bo->start, i * PAGE_SIZE, m);
            cdev_pager_free_page(devobj, m);
        }
        VM_OBJECT_UNLOCK(devobj);
        vm_object_deallocate(devobj);
    } else {
        DRM_UNLOCK(dev);
        NV_ERROR(dev, "Could not find handle %p bo %p\n", handle, bo);
        return;
    }

    if (pscnv_mem_debug > 0)
        NV_WARN(dev, "Freed %010llx (%p)\n", bo->start, bo);

    //kfree(bo->fake_pages);
    if (bo->chan)
        pscnv_chan_unref(bo->chan);
    else
        drm_gem_object_unreference_unlocked(gem_obj);
    DRM_UNLOCK(dev);
}
void
ttm_tt_destroy(struct ttm_tt *ttm)
{
    if (unlikely(ttm == NULL))
        return;

    if (ttm->state == tt_bound) {
        ttm_tt_unbind(ttm);
    }

    if (likely(ttm->pages != NULL)) {
        ttm->bdev->driver->ttm_tt_unpopulate(ttm);
    }

    if (!(ttm->page_flags & TTM_PAGE_FLAG_PERSISTENT_SWAP) &&
        ttm->swap_storage)
        vm_object_deallocate(ttm->swap_storage);

    ttm->swap_storage = NULL;
    ttm->func->destroy(ttm);
}
void
ttm_bo_release_mmap(struct ttm_buffer_object *bo)
{
    vm_object_t vm_obj;
    vm_page_t m;
    int i;

    vm_obj = cdev_pager_lookup(bo);
    if (vm_obj == NULL)
        return;

    VM_OBJECT_WLOCK(vm_obj);
    for (i = 0; i < bo->num_pages; i++) {
        m = vm_page_lookup_busy_wait(vm_obj, i, TRUE, "ttm_unm");
        if (m == NULL)
            continue;
        cdev_pager_free_page(vm_obj, m);
    }
    VM_OBJECT_WUNLOCK(vm_obj);

    vm_object_deallocate(vm_obj);
}
static void
unlock_and_deallocate(struct faultstate *fs)
{

    vm_object_pip_wakeup(fs->object);
    VM_OBJECT_WUNLOCK(fs->object);
    if (fs->object != fs->first_object) {
        VM_OBJECT_WLOCK(fs->first_object);
        vm_page_lock(fs->first_m);
        vm_page_free(fs->first_m);
        vm_page_unlock(fs->first_m);
        vm_object_pip_wakeup(fs->first_object);
        VM_OBJECT_WUNLOCK(fs->first_object);
        fs->first_m = NULL;
    }
    vm_object_deallocate(fs->first_object);
    unlock_map(fs);
    if (fs->vp != NULL) {
        vput(fs->vp);
        fs->vp = NULL;
    }
}
static void
link_elf_unload_file(linker_file_t file)
{
    elf_file_t ef = file->priv;

    if (ef) {
#ifdef SPARSE_MAPPING
        if (ef->object) {
            vm_map_remove(&kernel_map, (vm_offset_t) ef->address,
                (vm_offset_t) ef->address +
                (ef->object->size << PAGE_SHIFT));
            vm_object_deallocate(ef->object);
        }
#else
        if (ef->address)
            kfree(ef->address, M_LINKER);
#endif
        if (ef->symbase)
            kfree(ef->symbase, M_LINKER);
        if (ef->strbase)
            kfree(ef->strbase, M_LINKER);
        kfree(ef, M_LINKER);
    }
}
vm_object_t
vmm_mem_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
{
    int error;
    vm_object_t obj;

    if (gpa & PAGE_MASK)
        panic("vmm_mem_alloc: invalid gpa %#lx", gpa);

    if (len == 0 || (len & PAGE_MASK) != 0)
        panic("vmm_mem_alloc: invalid allocation size %lu", len);

    obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
    if (obj != NULL) {
        error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
            VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0);
        if (error != KERN_SUCCESS) {
            vm_object_deallocate(obj);
            obj = NULL;
        }
    }

    return (obj);
}
/* * Routine: * vm_fault_copy_entry * Function: * Create new shadow object backing dst_entry with private copy of * all underlying pages. When src_entry is equal to dst_entry, * function implements COW for wired-down map entry. Otherwise, * it forks wired entry into dst_map. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, vm_map_entry_t dst_entry, vm_map_entry_t src_entry, vm_ooffset_t *fork_charge) { vm_object_t backing_object, dst_object, object, src_object; vm_pindex_t dst_pindex, pindex, src_pindex; vm_prot_t access, prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; boolean_t src_readonly, upgrade; #ifdef lint src_map++; #endif /* lint */ upgrade = src_entry == dst_entry; src_object = src_entry->object.vm_object; src_pindex = OFF_TO_IDX(src_entry->offset); src_readonly = (src_entry->protection & VM_PROT_WRITE) == 0; /* * Create the top-level object for the destination entry. (Doesn't * actually shadow anything - we copy the pages directly.) */ dst_object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(dst_entry->end - dst_entry->start)); #if VM_NRESERVLEVEL > 0 dst_object->flags |= OBJ_COLORED; dst_object->pg_color = atop(dst_entry->start); #endif VM_OBJECT_WLOCK(dst_object); KASSERT(upgrade || dst_entry->object.vm_object == NULL, ("vm_fault_copy_entry: vm_object not NULL")); dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; dst_object->charge = dst_entry->end - dst_entry->start; if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, ("vm_fault_copy_entry: leaked swp charge")); dst_object->cred = curthread->td_ucred; crhold(dst_object->cred); *fork_charge += dst_object->charge; } else { dst_object->cred = dst_entry->cred; dst_entry->cred = NULL; } access = prot = dst_entry->protection; /* * If not an upgrade, then enter the mappings in the pmap as * read and/or execute accesses. Otherwise, enter them as * write accesses. * * A writeable large page mapping is only created if all of * the constituent small page mappings are modified. Marking * PTEs as modified on inception allows promotion to happen * without taking potentially large number of soft faults. */ if (!upgrade) access &= ~VM_PROT_WRITE; /* * Loop through all of the virtual pages within the entry's * range, copying each page from the source object to the * destination object. Since the source is wired, those pages * must exist. In contrast, the destination is pageable. * Since the destination object does share any backing storage * with the source object, all of its pages must be dirtied, * regardless of whether they can be written. */ for (vaddr = dst_entry->start, dst_pindex = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_pindex++) { /* * Allocate a page in the destination object. */ do { dst_m = vm_page_alloc(dst_object, dst_pindex, VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_OBJECT_WUNLOCK(dst_object); VM_WAIT; VM_OBJECT_WLOCK(dst_object); } } while (dst_m == NULL); /* * Find the page in the source object, and copy it in. * (Because the source is wired down, the page will be in * memory.) */ VM_OBJECT_WLOCK(src_object); object = src_object; pindex = src_pindex + dst_pindex; while ((src_m = vm_page_lookup(object, pindex)) == NULL && src_readonly && (backing_object = object->backing_object) != NULL) { /* * Allow fallback to backing objects if we are reading. 
*/ VM_OBJECT_WLOCK(backing_object); pindex += OFF_TO_IDX(object->backing_object_offset); VM_OBJECT_WUNLOCK(object); object = backing_object; } if (src_m == NULL) panic("vm_fault_copy_wired: page missing"); pmap_copy_page(src_m, dst_m); VM_OBJECT_WUNLOCK(object); dst_m->valid = VM_PAGE_BITS_ALL; dst_m->dirty = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(dst_object); /* * Enter it in the pmap. If a wired, copy-on-write * mapping is being replaced by a write-enabled * mapping, then wire that new mapping. */ pmap_enter(dst_map->pmap, vaddr, access, dst_m, prot, upgrade); /* * Mark it no longer busy, and put it on the active list. */ VM_OBJECT_WLOCK(dst_object); if (upgrade) { vm_page_lock(src_m); vm_page_unwire(src_m, 0); vm_page_unlock(src_m); vm_page_lock(dst_m); vm_page_wire(dst_m); vm_page_unlock(dst_m); } else { vm_page_lock(dst_m); vm_page_activate(dst_m); vm_page_unlock(dst_m); } vm_page_wakeup(dst_m); } VM_OBJECT_WUNLOCK(dst_object); if (upgrade) { dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY); vm_object_deallocate(src_object); } }
static void
os_balloonobject_delete(void)
{
    vm_object_deallocate(global_state.vmobject);
}
static int link_elf_obj_load_file(const char *filename, linker_file_t * result) { struct nlookupdata nd; struct thread *td = curthread; /* XXX */ struct proc *p = td->td_proc; char *pathname; struct vnode *vp; Elf_Ehdr *hdr; Elf_Shdr *shdr; Elf_Sym *es; int nbytes, i, j; vm_offset_t mapbase; size_t mapsize; int error = 0; int resid; elf_file_t ef; linker_file_t lf; int symtabindex; int symstrindex; int shstrindex; int nsym; int pb, rl, ra; int alignmask; /* XXX Hack for firmware loading where p == NULL */ if (p == NULL) { p = &proc0; } KKASSERT(p != NULL); if (p->p_ucred == NULL) { kprintf("link_elf_obj_load_file: cannot load '%s' from filesystem" " this early\n", filename); return ENOENT; } shdr = NULL; lf = NULL; mapsize = 0; hdr = NULL; pathname = linker_search_path(filename); if (pathname == NULL) return ENOENT; error = nlookup_init(&nd, pathname, UIO_SYSSPACE, NLC_FOLLOW | NLC_LOCKVP); if (error == 0) error = vn_open(&nd, NULL, FREAD, 0); kfree(pathname, M_LINKER); if (error) { nlookup_done(&nd); return error; } vp = nd.nl_open_vp; nd.nl_open_vp = NULL; nlookup_done(&nd); /* * Read the elf header from the file. */ hdr = kmalloc(sizeof(*hdr), M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, vp, (void *)hdr, sizeof(*hdr), 0, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid != 0) { error = ENOEXEC; goto out; } if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_obj_error(filename, "Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_obj_error(filename, "Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_REL) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_obj_error(filename, "Unsupported machine"); error = ENOEXEC; goto out; } ef = kmalloc(sizeof(struct elf_file), M_LINKER, M_WAITOK | M_ZERO); lf = linker_make_file(filename, ef, &link_elf_obj_file_ops); if (lf == NULL) { kfree(ef, M_LINKER); error = ENOMEM; goto out; } ef->nprogtab = 0; ef->e_shdr = NULL; ef->nreltab = 0; ef->nrelatab = 0; /* Allocate and read in the section header */ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0 || hdr->e_shentsize != sizeof(Elf_Shdr)) { error = ENOEXEC; goto out; } shdr = kmalloc(nbytes, M_LINKER, M_WAITOK); ef->e_shdr = shdr; error = vn_rdwr(UIO_READ, vp, (caddr_t) shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid) { error = ENOEXEC; goto out; } /* Scan the section header for information and table sizing. 
*/ nsym = 0; symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: ef->nprogtab++; break; case SHT_SYMTAB: nsym++; symtabindex = i; symstrindex = shdr[i].sh_link; break; case SHT_REL: ef->nreltab++; break; case SHT_RELA: ef->nrelatab++; break; case SHT_STRTAB: break; } } if (ef->nprogtab == 0) { link_elf_obj_error(filename, "file has no contents"); error = ENOEXEC; goto out; } if (nsym != 1) { /* Only allow one symbol table for now */ link_elf_obj_error(filename, "file has no valid symbol table"); error = ENOEXEC; goto out; } if (symstrindex < 0 || symstrindex > hdr->e_shnum || shdr[symstrindex].sh_type != SHT_STRTAB) { link_elf_obj_error(filename, "file has invalid symbol strings"); error = ENOEXEC; goto out; } /* Allocate space for tracking the load chunks */ if (ef->nprogtab != 0) ef->progtab = kmalloc(ef->nprogtab * sizeof(*ef->progtab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nreltab != 0) ef->reltab = kmalloc(ef->nreltab * sizeof(*ef->reltab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nrelatab != 0) ef->relatab = kmalloc(ef->nrelatab * sizeof(*ef->relatab), M_LINKER, M_WAITOK | M_ZERO); if ((ef->nprogtab != 0 && ef->progtab == NULL) || (ef->nreltab != 0 && ef->reltab == NULL) || (ef->nrelatab != 0 && ef->relatab == NULL)) { error = ENOMEM; goto out; } if (symtabindex == -1) panic("lost symbol table index"); /* Allocate space for and load the symbol table */ ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym); ef->ddbsymtab = kmalloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, vp, (void *)ef->ddbsymtab, shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid != 0) { error = EINVAL; goto out; } if (symstrindex == -1) panic("lost symbol string index"); /* Allocate space for and load the symbol strings */ ef->ddbstrcnt = shdr[symstrindex].sh_size; ef->ddbstrtab = kmalloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, vp, ef->ddbstrtab, shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid != 0) { error = EINVAL; goto out; } /* Do we have a string table for the section names? */ shstrindex = -1; if (hdr->e_shstrndx != 0 && shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) { shstrindex = hdr->e_shstrndx; ef->shstrcnt = shdr[shstrindex].sh_size; ef->shstrtab = kmalloc(shdr[shstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, vp, ef->shstrtab, shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid != 0) { error = EINVAL; goto out; } } /* Size up code/data(progbits) and bss(nobits). */ alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: alignmask = shdr[i].sh_addralign - 1; mapsize += alignmask; mapsize &= ~alignmask; mapsize += shdr[i].sh_size; break; } } /* * We know how much space we need for the text/data/bss/etc. 
This * stuff needs to be in a single chunk so that profiling etc can get * the bounds and gdb can associate offsets with modules */ ef->object = vm_object_allocate(OBJT_DEFAULT, round_page(mapsize) >> PAGE_SHIFT); if (ef->object == NULL) { error = ENOMEM; goto out; } vm_object_hold(ef->object); vm_object_reference_locked(ef->object); ef->address = (caddr_t) vm_map_min(&kernel_map); ef->bytes = 0; /* * In order to satisfy x86_64's architectural requirements on the * location of code and data in the kernel's address space, request a * mapping that is above the kernel. * * vkernel64's text+data is outside the managed VM space entirely. */ #if defined(__x86_64__) && defined(_KERNEL_VIRTUAL) error = vkernel_module_memory_alloc(&mapbase, round_page(mapsize)); vm_object_drop(ef->object); #else mapbase = KERNBASE; error = vm_map_find(&kernel_map, ef->object, NULL, 0, &mapbase, round_page(mapsize), PAGE_SIZE, TRUE, VM_MAPTYPE_NORMAL, VM_PROT_ALL, VM_PROT_ALL, FALSE); vm_object_drop(ef->object); if (error) { vm_object_deallocate(ef->object); ef->object = NULL; goto out; } /* Wire the pages */ error = vm_map_wire(&kernel_map, mapbase, mapbase + round_page(mapsize), 0); #endif if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } /* Inform the kld system about the situation */ lf->address = ef->address = (caddr_t) mapbase; lf->size = round_page(mapsize); ef->bytes = mapsize; /* * Now load code/data(progbits), zero bss(nobits), allocate space for * and load relocs */ pb = 0; rl = 0; ra = 0; alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: alignmask = shdr[i].sh_addralign - 1; mapbase += alignmask; mapbase &= ~alignmask; if (ef->shstrtab && shdr[i].sh_name != 0) ef->progtab[pb].name = ef->shstrtab + shdr[i].sh_name; else if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<<PROGBITS>>"; else ef->progtab[pb].name = "<<NOBITS>>"; #if 0 if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, "set_pcpu")) ef->progtab[pb].addr = dpcpu_alloc(shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) ef->progtab[pb].addr = vnet_data_alloc(shdr[i].sh_size); #endif else #endif ef->progtab[pb].addr = (void *)(uintptr_t) mapbase; if (ef->progtab[pb].addr == NULL) { error = ENOSPC; goto out; } ef->progtab[pb].size = shdr[i].sh_size; ef->progtab[pb].sec = i; if (shdr[i].sh_type == SHT_PROGBITS) { error = vn_rdwr(UIO_READ, vp, ef->progtab[pb].addr, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid != 0) { error = EINVAL; goto out; } #if 0 /* Initialize the per-cpu or vnet area. */ if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, "set_pcpu")) dpcpu_copy(ef->progtab[pb].addr, shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) vnet_data_copy(ef->progtab[pb].addr, shdr[i].sh_size); #endif #endif } else bzero(ef->progtab[pb].addr, shdr[i].sh_size); /* Update all symbol values with the offset. 
*/ for (j = 0; j < ef->ddbsymcnt; j++) { es = &ef->ddbsymtab[j]; if (es->st_shndx != i) continue; es->st_value += (Elf_Addr) ef->progtab[pb].addr; } mapbase += shdr[i].sh_size; pb++; break; case SHT_REL: ef->reltab[rl].rel = kmalloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel); ef->reltab[rl].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, vp, (void *)ef->reltab[rl].rel, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid != 0) { error = EINVAL; goto out; } rl++; break; case SHT_RELA: ef->relatab[ra].rela = kmalloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela); ef->relatab[ra].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, vp, (void *)ef->relatab[ra].rela, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid); if (error) goto out; if (resid != 0) { error = EINVAL; goto out; } ra++; break; } } if (pb != ef->nprogtab) panic("lost progbits"); if (rl != ef->nreltab) panic("lost reltab"); if (ra != ef->nrelatab) panic("lost relatab"); if (mapbase != (vm_offset_t) ef->address + mapsize) panic("mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)", mapbase, ef->address, mapsize, (vm_offset_t) ef->address + mapsize); /* Local intra-module relocations */ link_elf_obj_reloc_local(lf); /* Pull in dependencies */ error = linker_load_dependencies(lf); if (error) goto out; /* External relocations */ error = relocate_file(lf); if (error) goto out; *result = lf; out: if (error && lf) linker_file_unload(lf /*, LINKER_UNLOAD_FORCE */); if (hdr) kfree(hdr, M_LINKER); vn_unlock(vp); vn_close(vp, FREAD, NULL); return error; }
static void link_elf_obj_unload_file(linker_file_t file) { elf_file_t ef = file->priv; int i; if (ef->progtab) { for (i = 0; i < ef->nprogtab; i++) { if (ef->progtab[i].size == 0) continue; if (ef->progtab[i].name == NULL) continue; #if 0 if (!strcmp(ef->progtab[i].name, "set_pcpu")) dpcpu_free(ef->progtab[i].addr, ef->progtab[i].size); #ifdef VIMAGE else if (!strcmp(ef->progtab[i].name, VNET_SETNAME)) vnet_data_free(ef->progtab[i].addr, ef->progtab[i].size); #endif #endif } } if (ef->preloaded) { if (ef->reltab) kfree(ef->reltab, M_LINKER); if (ef->relatab) kfree(ef->relatab, M_LINKER); if (ef->progtab) kfree(ef->progtab, M_LINKER); if (ef->ctftab) kfree(ef->ctftab, M_LINKER); if (ef->ctfoff) kfree(ef->ctfoff, M_LINKER); if (ef->typoff) kfree(ef->typoff, M_LINKER); if (file->filename != NULL) preload_delete_name(file->filename); kfree(ef, M_LINKER); /* XXX reclaim module memory? */ return; } for (i = 0; i < ef->nreltab; i++) if (ef->reltab[i].rel) kfree(ef->reltab[i].rel, M_LINKER); for (i = 0; i < ef->nrelatab; i++) if (ef->relatab[i].rela) kfree(ef->relatab[i].rela, M_LINKER); if (ef->reltab) kfree(ef->reltab, M_LINKER); if (ef->relatab) kfree(ef->relatab, M_LINKER); if (ef->progtab) kfree(ef->progtab, M_LINKER); if (ef->object) { #if defined(__x86_64__) && defined(_KERNEL_VIRTUAL) vkernel_module_memory_free((vm_offset_t)ef->address, ef->bytes); #else vm_map_remove(&kernel_map, (vm_offset_t) ef->address, (vm_offset_t) ef->address + (ef->object->size << PAGE_SHIFT)); #endif vm_object_deallocate(ef->object); ef->object = NULL; } if (ef->e_shdr) kfree(ef->e_shdr, M_LINKER); if (ef->ddbsymtab) kfree(ef->ddbsymtab, M_LINKER); if (ef->ddbstrtab) kfree(ef->ddbstrtab, M_LINKER); if (ef->shstrtab) kfree(ef->shstrtab, M_LINKER); if (ef->ctftab) kfree(ef->ctftab, M_LINKER); if (ef->ctfoff) kfree(ef->ctfoff, M_LINKER); if (ef->typoff) kfree(ef->typoff, M_LINKER); kfree(ef, M_LINKER); }
vm_object_t
cdev_pager_allocate(void *handle, enum obj_type tp, struct cdev_pager_ops *ops,
    vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred)
{
    vm_object_t object, object1;
    vm_pindex_t pindex;
    u_short color;

    if (tp != OBJT_DEVICE && tp != OBJT_MGTDEVICE)
        return (NULL);

    /*
     * Offset should be page aligned.
     */
    if (foff & PAGE_MASK)
        return (NULL);

    size = round_page(size);
    pindex = OFF_TO_IDX(foff + size);

    if (ops->cdev_pg_ctor(handle, size, prot, foff, cred, &color) != 0)
        return (NULL);
    mtx_lock(&dev_pager_mtx);

    /*
     * Look up pager, creating as necessary.
     */
    object1 = NULL;
    object = vm_pager_object_lookup(&dev_pager_object_list, handle);
    if (object == NULL) {
        /*
         * Allocate object and associate it with the pager.  Initialize
         * the object's pg_color based upon the physical address of the
         * device's memory.
         */
        mtx_unlock(&dev_pager_mtx);
        object1 = vm_object_allocate(tp, pindex);
        object1->flags |= OBJ_COLORED;
        object1->pg_color = color;
        object1->handle = handle;
        object1->un_pager.devp.ops = ops;
        object1->un_pager.devp.dev = handle;
        TAILQ_INIT(&object1->un_pager.devp.devp_pglist);
        mtx_lock(&dev_pager_mtx);
        object = vm_pager_object_lookup(&dev_pager_object_list, handle);
        if (object != NULL) {
            /*
             * We raced with other thread while allocating object.
             */
            if (pindex > object->size)
                object->size = pindex;
        } else {
            object = object1;
            object1 = NULL;
            object->handle = handle;
            TAILQ_INSERT_TAIL(&dev_pager_object_list, object,
                pager_object_list);
            KASSERT(object->type == tp,
                ("Inconsistent device pager type %p %d", object, tp));
        }
    } else {
        if (pindex > object->size)
            object->size = pindex;
    }
    mtx_unlock(&dev_pager_mtx);
    if (object1 != NULL) {
        object1->handle = object1;
        mtx_lock(&dev_pager_mtx);
        TAILQ_INSERT_TAIL(&dev_pager_object_list, object1,
            pager_object_list);
        mtx_unlock(&dev_pager_mtx);
        vm_object_deallocate(object1);
    }
    return (object);
}
int acpi_sleep_machdep(struct acpi_softc *sc, int state) { ACPI_STATUS status; vm_offset_t oldphys; struct pmap *pm; vm_page_t page; static vm_page_t opage = NULL; int ret = 0; int pteobj_allocated = 0; u_long ef; struct proc *p; if (sc->acpi_wakeaddr == 0) { return (0); } AcpiSetFirmwareWakingVector(sc->acpi_wakephys); ef = read_eflags(); disable_intr(); /* Create Identity Mapping */ if ((p = curproc) == NULL) p = &proc0; pm = vmspace_pmap(p->p_vmspace); if (pm->pm_pteobj == NULL) { pm->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, PTDPTDI + 1); pteobj_allocated = 1; } oldphys = pmap_extract(pm, sc->acpi_wakephys); if (oldphys) { opage = PHYS_TO_VM_PAGE(oldphys); } page = PHYS_TO_VM_PAGE(sc->acpi_wakephys); pmap_enter(pm, sc->acpi_wakephys, page, VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE, 1); ret_addr = 0; if (acpi_savecpu()) { /* Execute Sleep */ p_gdt = (struct region_descriptor *)(sc->acpi_wakeaddr + physical_gdt); p_gdt->rd_limit = r_gdt.rd_limit; p_gdt->rd_base = vtophys(r_gdt.rd_base); WAKECODE_FIXUP(physical_esp, u_int32_t, vtophys(r_esp)); WAKECODE_FIXUP(previous_cr0, u_int32_t, r_cr0); WAKECODE_FIXUP(previous_cr2, u_int32_t, r_cr2); WAKECODE_FIXUP(previous_cr3, u_int32_t, r_cr3); WAKECODE_FIXUP(previous_cr4, u_int32_t, r_cr4); WAKECODE_FIXUP(previous_tr, u_int16_t, r_tr); WAKECODE_BCOPY(previous_gdt, struct region_descriptor, r_gdt); WAKECODE_FIXUP(previous_ldt, u_int16_t, r_ldt); WAKECODE_BCOPY(previous_idt, struct region_descriptor, r_idt); WAKECODE_FIXUP(where_to_recover, void, acpi_restorecpu); WAKECODE_FIXUP(previous_ds, u_int16_t, r_ds); WAKECODE_FIXUP(previous_es, u_int16_t, r_es); WAKECODE_FIXUP(previous_fs, u_int16_t, r_fs); WAKECODE_FIXUP(previous_gs, u_int16_t, r_gs); WAKECODE_FIXUP(previous_ss, u_int16_t, r_ss); if (acpi_get_verbose(sc)) { acpi_printcpu(); } wbinvd(); if (state == ACPI_STATE_S4 && sc->acpi_s4bios) { status = AcpiEnterSleepStateS4Bios(); } else { status = AcpiEnterSleepState(state); } if (status != AE_OK) { device_printf(sc->acpi_dev, "AcpiEnterSleepState failed - %s\n", AcpiFormatException(status)); ret = -1; goto out; } for (;;) ; } else { /* Execute Wakeup */ #if 0 initializecpu(); #endif icu_reinit(); if (acpi_get_verbose(sc)) { acpi_savecpu(); acpi_printcpu(); } } out: vm_page_lock_queues(); pmap_remove(pm, sc->acpi_wakephys, sc->acpi_wakephys + PAGE_SIZE); vm_page_unlock_queues(); if (opage) { pmap_enter(pm, sc->acpi_wakephys, page, VM_PROT_READ | VM_PROT_WRITE, 0); } if (pteobj_allocated) { vm_object_deallocate(pm->pm_pteobj); pm->pm_pteobj = NULL; } write_eflags(ef); return (ret); }
/* * Internal version of mmap. * Currently used by mmap, exec, and sys5 shared memory. * Handle is either a vnode pointer or NULL for MAP_ANON. * * No requirements */ int vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff) { boolean_t fitit; vm_object_t object; vm_offset_t eaddr; vm_size_t esize; vm_size_t align; int (*uksmap)(cdev_t dev, vm_page_t fake); struct vnode *vp; struct thread *td = curthread; struct proc *p; int rv = KERN_SUCCESS; off_t objsize; int docow; int error; if (size == 0) return (0); objsize = round_page(size); if (objsize < size) return (EINVAL); size = objsize; lwkt_gettoken(&map->token); /* * XXX messy code, fixme * * NOTE: Overflow checks require discrete statements or GCC4 * will optimize it out. */ if ((p = curproc) != NULL && map == &p->p_vmspace->vm_map) { esize = map->size + size; /* workaround gcc4 opt */ if (esize < map->size || esize > p->p_rlimit[RLIMIT_VMEM].rlim_cur) { lwkt_reltoken(&map->token); return(ENOMEM); } } /* * We currently can only deal with page aligned file offsets. * The check is here rather than in the syscall because the * kernel calls this function internally for other mmaping * operations (such as in exec) and non-aligned offsets will * cause pmap inconsistencies...so we want to be sure to * disallow this in all cases. * * NOTE: Overflow checks require discrete statements or GCC4 * will optimize it out. */ if (foff & PAGE_MASK) { lwkt_reltoken(&map->token); return (EINVAL); } /* * Handle alignment. For large memory maps it is possible * that the MMU can optimize the page table so align anything * that is a multiple of SEG_SIZE to SEG_SIZE. * * Also align any large mapping (bigger than 16x SG_SIZE) to a * SEG_SIZE address boundary. */ if (flags & MAP_SIZEALIGN) { align = size; if ((align ^ (align - 1)) != (align << 1) - 1) { lwkt_reltoken(&map->token); return (EINVAL); } } else if ((flags & MAP_FIXED) == 0 && ((size & SEG_MASK) == 0 || size > SEG_SIZE * 16)) { align = SEG_SIZE; } else { align = PAGE_SIZE; } if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) { fitit = TRUE; *addr = round_page(*addr); } else { if (*addr != trunc_page(*addr)) { lwkt_reltoken(&map->token); return (EINVAL); } eaddr = *addr + size; if (eaddr < *addr) { lwkt_reltoken(&map->token); return (EINVAL); } fitit = FALSE; if ((flags & MAP_TRYFIXED) == 0) vm_map_remove(map, *addr, *addr + size); } uksmap = NULL; /* * Lookup/allocate object. */ if (flags & MAP_ANON) { /* * Unnamed anonymous regions always start at 0. */ if (handle) { /* * Default memory object */ object = default_pager_alloc(handle, objsize, prot, foff); if (object == NULL) { lwkt_reltoken(&map->token); return(ENOMEM); } docow = MAP_PREFAULT_PARTIAL; } else { /* * Implicit single instance of a default memory * object, so we don't need a VM object yet. */ foff = 0; object = NULL; docow = 0; } vp = NULL; } else { vp = (struct vnode *)handle; /* * Non-anonymous mappings of VCHR (aka not /dev/zero) * cannot specify MAP_STACK or MAP_VPAGETABLE. */ if (vp->v_type == VCHR) { if (flags & (MAP_STACK | MAP_VPAGETABLE)) { lwkt_reltoken(&map->token); return(EINVAL); } } if (vp->v_type == VCHR && vp->v_rdev->si_ops->d_uksmap) { /* * Device mappings without a VM object, typically * sharing permanently allocated kernel memory or * process-context-specific (per-process) data. * * Force them to be shared. 
*/ uksmap = vp->v_rdev->si_ops->d_uksmap; object = NULL; docow = MAP_PREFAULT_PARTIAL; flags &= ~(MAP_PRIVATE|MAP_COPY); flags |= MAP_SHARED; } else if (vp->v_type == VCHR) { /* * Device mappings (device size unknown?). * Force them to be shared. */ error = dev_dmmap_single(vp->v_rdev, &foff, objsize, &object, prot, NULL); if (error == ENODEV) { handle = (void *)(intptr_t)vp->v_rdev; object = dev_pager_alloc(handle, objsize, prot, foff); if (object == NULL) { lwkt_reltoken(&map->token); return(EINVAL); } } else if (error) { lwkt_reltoken(&map->token); return(error); } docow = MAP_PREFAULT_PARTIAL; flags &= ~(MAP_PRIVATE|MAP_COPY); flags |= MAP_SHARED; } else { /* * Regular file mapping (typically). The attribute * check is for the link count test only. mmapable * vnodes must already have a VM object assigned. */ struct vattr vat; int error; error = VOP_GETATTR(vp, &vat); if (error) { lwkt_reltoken(&map->token); return (error); } docow = MAP_PREFAULT_PARTIAL; object = vnode_pager_reference(vp); if (object == NULL && vp->v_type == VREG) { lwkt_reltoken(&map->token); kprintf("Warning: cannot mmap vnode %p, no " "object\n", vp); return(EINVAL); } /* * If it is a regular file without any references * we do not need to sync it. */ if (vp->v_type == VREG && vat.va_nlink == 0) { flags |= MAP_NOSYNC; } } } /* * Deal with the adjusted flags */ if ((flags & (MAP_ANON|MAP_SHARED)) == 0) docow |= MAP_COPY_ON_WRITE; if (flags & MAP_NOSYNC) docow |= MAP_DISABLE_SYNCER; if (flags & MAP_NOCORE) docow |= MAP_DISABLE_COREDUMP; #if defined(VM_PROT_READ_IS_EXEC) if (prot & VM_PROT_READ) prot |= VM_PROT_EXECUTE; if (maxprot & VM_PROT_READ) maxprot |= VM_PROT_EXECUTE; #endif /* * This may place the area in its own page directory if (size) is * large enough, otherwise it typically returns its argument. * * (object can be NULL) */ if (fitit) { *addr = pmap_addr_hint(object, *addr, size); } /* * Stack mappings need special attention. * * Mappings that use virtual page tables will default to storing * the page table at offset 0. */ if (uksmap) { rv = vm_map_find(map, uksmap, vp->v_rdev, foff, addr, size, align, fitit, VM_MAPTYPE_UKSMAP, prot, maxprot, docow); } else if (flags & MAP_STACK) { rv = vm_map_stack(map, *addr, size, flags, prot, maxprot, docow); } else if (flags & MAP_VPAGETABLE) { rv = vm_map_find(map, object, NULL, foff, addr, size, align, fitit, VM_MAPTYPE_VPAGETABLE, prot, maxprot, docow); } else { rv = vm_map_find(map, object, NULL, foff, addr, size, align, fitit, VM_MAPTYPE_NORMAL, prot, maxprot, docow); } if (rv != KERN_SUCCESS) { /* * Lose the object reference. Will destroy the * object if it's an unnamed anonymous mapping * or named anonymous without other references. * * (NOTE: object can be NULL) */ vm_object_deallocate(object); goto out; } /* * Shared memory is also shared with children. */ if (flags & (MAP_SHARED|MAP_INHERIT)) { rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE); if (rv != KERN_SUCCESS) { vm_map_remove(map, *addr, *addr + size); goto out; } } /* If a process has marked all future mappings for wiring, do so */ if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE)) vm_map_unwire(map, *addr, *addr + size, FALSE); /* * Set the access time on the vnode */ if (vp != NULL) vn_mark_atime(vp, td); out: lwkt_reltoken(&map->token); switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: return (ENOMEM); case KERN_PROTECTION_FAILURE: return (EACCES); default: return (EINVAL); } }
int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { vm_prot_t prot; long ahead, behind; int alloc_req, era, faultcount, nera, reqpage, result; boolean_t growstack, is_first_object_locked, wired; int map_generation; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ_MAX]; int hardfault; struct faultstate fs; struct vnode *vp; int locked, error; hardfault = 0; growstack = TRUE; PCPU_INC(cnt.v_vm_faults); fs.vp = NULL; faultcount = reqpage = 0; RetryFault:; /* * Find the backing store object and offset into it to begin the * search. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { if (growstack && result == KERN_INVALID_ADDRESS && map != kernel_map) { result = vm_map_growstack(curproc, vaddr); if (result != KERN_SUCCESS) return (KERN_FAILURE); growstack = FALSE; goto RetryFault; } return (result); } map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { panic("vm_fault: fault on nofault entry, addr: %lx", (u_long)vaddr); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. This must be done after * obtaining the vnode lock in order to avoid possible deadlocks. */ VM_OBJECT_WLOCK(fs.first_object); vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.lookup_still_valid = TRUE; if (wired) fault_type = prot | (fault_type & VM_PROT_COPY); fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { /* * If the object is dead, we stop here */ if (fs.object->flags & OBJ_DEAD) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { /* * check for page-based copy on write. * We check fs.object == fs.first_object so * as to ensure the legacy COW mechanism is * used when the page in question is part of * a shadow object. Otherwise, vm_page_cowfault() * removes the page from the backing object, * which is not what we want. */ vm_page_lock(fs.m); if ((fs.m->cow) && (fault_type & VM_PROT_WRITE) && (fs.object == fs.first_object)) { vm_page_cowfault(fs.m); unlock_and_deallocate(&fs); goto RetryFault; } /* * Wait/Retry if the page is busy. We have to do this * if the page is busy via either VPO_BUSY or * vm_page_t->busy because the vm_pager may be using * vm_page_t->busy for pageouts ( and even pageins if * it is the vnode pager ), and we could end up trying * to pagein and pageout the same page simultaneously. * * We can theoretically allow the busy case on a read * fault if the page is marked valid, but since such * pages are typically already pmap'd, putting that * special case in might be more effort then it is * worth. We cannot under any circumstances mess * around with a vm_page_t->busy page except, perhaps, * to pmap it. */ if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. 
*/ vm_page_aflag_set(fs.m, PGA_REFERENCED); vm_page_unlock(fs.m); if (fs.object != fs.first_object) { if (!VM_OBJECT_TRYWLOCK( fs.first_object)) { VM_OBJECT_WUNLOCK(fs.object); VM_OBJECT_WLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.object); } vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); vm_object_pip_wakeup(fs.first_object); VM_OBJECT_WUNLOCK(fs.first_object); fs.first_m = NULL; } unlock_map(&fs); if (fs.m == vm_page_lookup(fs.object, fs.pindex)) { vm_page_sleep_if_busy(fs.m, TRUE, "vmpfw"); } vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); PCPU_INC(cnt.v_intrans); vm_object_deallocate(fs.first_object); goto RetryFault; } vm_page_remque(fs.m); vm_page_unlock(fs.m); /* * Mark page busy for other processes, and the * pagedaemon. If it still isn't completely valid * (readable), jump to readrest, else break-out ( we * found the page ). */ vm_page_busy(fs.m); if (fs.m->valid != VM_PAGE_BITS_ALL) goto readrest; break; } /* * Page is not resident, If this is the search termination * or the pager might contain the page, allocate a new page. */ if (TRYPAGER || fs.object == fs.first_object) { if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At * worst, the P_KILLED might be not observed * there, and allocation can fail, causing * restart and new reading of the p_flag. */ fs.m = NULL; if (!vm_page_count_severe() || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 if ((fs.object->flags & OBJ_COLORED) == 0) { fs.object->flags |= OBJ_COLORED; fs.object->pg_color = atop(vaddr) - fs.pindex; } #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs.object->type != OBJT_VNODE && fs.object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; fs.m = vm_page_alloc(fs.object, fs.pindex, alloc_req); } if (fs.m == NULL) { unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; } else if (fs.m->valid == VM_PAGE_BITS_ALL) break; } readrest: /* * We have found a valid page or we have allocated a new page. * The page thus may not be valid or may not be entirely * valid. * * Attempt to fault-in the page if there is a chance that the * pager has it, and potentially fault in additional pages * at the same time. */ if (TRYPAGER) { int rv; u_char behavior = vm_map_entry_behavior(fs.entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { behind = 0; ahead = atop(fs.entry->end - vaddr) - 1; if (ahead > VM_FAULT_READ_AHEAD_MAX) ahead = VM_FAULT_READ_AHEAD_MAX; if (fs.pindex == fs.entry->next_read) vm_fault_cache_behind(&fs, VM_FAULT_READ_MAX); } else { /* * If this is a sequential page fault, then * arithmetically increase the number of pages * in the read-ahead window. Otherwise, reset * the read-ahead window to its smallest size. 
*/ behind = atop(vaddr - fs.entry->start); if (behind > VM_FAULT_READ_BEHIND) behind = VM_FAULT_READ_BEHIND; ahead = atop(fs.entry->end - vaddr) - 1; era = fs.entry->read_ahead; if (fs.pindex == fs.entry->next_read) { nera = era + behind; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; behind = 0; if (ahead > nera) ahead = nera; if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_cache_behind(&fs, VM_FAULT_CACHE_BEHIND); } else if (ahead > VM_FAULT_READ_AHEAD_MIN) ahead = VM_FAULT_READ_AHEAD_MIN; if (era != ahead) fs.entry->read_ahead = ahead; } /* * Call the pager to retrieve the data, if any, after * releasing the lock on the map. We hold a ref on * fs.object and the pages are VPO_BUSY'd. */ unlock_map(&fs); if (fs.object->type == OBJT_VNODE) { vp = fs.object->handle; if (vp == fs.vp) goto vnode_locked; else if (fs.vp != NULL) { vput(fs.vp); fs.vp = NULL; } locked = VOP_ISLOCKED(vp); if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* Do not sleep for vnode lock while fs.m is busy */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error != 0) { vhold(vp); release_page(&fs); unlock_and_deallocate(&fs); error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); vdrop(vp); fs.vp = vp; KASSERT(error == 0, ("vm_fault: vget failed")); goto RetryFault; } fs.vp = vp; } vnode_locked: KASSERT(fs.vp == NULL || !fs.map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * now we find out if any other pages should be paged * in at this time this routine checks to see if the * pages surrounding this fault reside in the same * object as the page for this fault. If they do, * then they are faulted in also into the object. The * array "marray" returned contains an array of * vm_page_t structs where one of them is the * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. * * fs.m plus the additional pages are VPO_BUSY'd. */ faultcount = vm_fault_additional_pages( fs.m, behind, ahead, marray, &reqpage); rv = faultcount ? vm_pager_get_pages(fs.object, marray, faultcount, reqpage) : VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. Leave it busy while we play * with it. */ /* * Relookup in case pager changed page. Pager * is responsible for disposition of old page * if moved. */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (!fs.m) { unlock_and_deallocate(&fs); goto RetryFault; } hardfault++; break; /* break to PAGE HAS BEEN FOUND */ } /* * Remove the bogus page (which does not exist at this * object/offset); before doing so, we must get back * our object lock to preserve our invariant. * * Also wake up any other process that may want to bring * in this page. * * If this is the top-level object, we must leave the * busy page to prevent another process from rushing * past us, and inserting the page in that object at * the same time that we are. */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * Data outside the range of the pager or an I/O error */ /* * XXX - the check for kernel_map is a kludge to work * around having the machine panic on a kernel space * fault w/ I/O error. */ if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); return ((rv == VM_PAGER_ERROR) ? 
KERN_FAILURE : KERN_PROTECTION_FAILURE); } if (fs.object != fs.first_object) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! */ } } /* * We get here if the object has default pager (or unwiring) * or the pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; VM_OBJECT_WLOCK(fs.object); } fs.first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs.m->flags & PG_ZERO) == 0) { pmap_zero_page(fs.m); } else { PCPU_INC(cnt.v_ozfod); } PCPU_INC(cnt.v_zfod); fs.m->valid = VM_PAGE_BITS_ALL; break; /* break to PAGE HAS BEEN FOUND */ } else { KASSERT(fs.object != next_object, ("object loop %p", next_object)); VM_OBJECT_WLOCK(next_object); vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = next_object; } } KASSERT((fs.m->oflags & VPO_BUSY) != 0, ("vm_fault: not busy after main loop")); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { /* * This allows pages to be virtually copied from a * backing_object into the first_object, where the * backing object has no other refs to it, and cannot * gain any more refs. Instead of a bcopy, we just * move the page from the backing object to the * first object. Note that we must mark the page * dirty in the first object so that it will go out * to swap when needed. */ is_first_object_locked = FALSE; if ( /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * No one else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ ((fs.object->type == OBJT_DEFAULT) || (fs.object->type == OBJT_SWAP)) && (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) && /* * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { /* * get rid of the unnecessary page */ vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); /* * grab the page and put it into the * process'es object. The page is * automatically made dirty. */ vm_page_lock(fs.m); vm_page_rename(fs.m, fs.first_object, fs.first_pindex); vm_page_unlock(fs.m); vm_page_busy(fs.m); fs.first_m = fs.m; fs.m = NULL; PCPU_INC(cnt.v_cow_optim); } else { /* * Oh, well, lets copy it. 
*/ pmap_copy_page(fs.m, fs.first_m); fs.first_m->valid = VM_PAGE_BITS_ALL; if (wired && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) { vm_page_lock(fs.first_m); vm_page_wire(fs.first_m); vm_page_unlock(fs.first_m); vm_page_lock(fs.m); vm_page_unwire(fs.m, FALSE); vm_page_unlock(fs.m); } /* * We no longer need the old page or object. */ release_page(&fs); } /* * fs.object != fs.first_object due to above * conditional */ vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); /* * Only use the new page below... */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; if (!is_first_object_locked) VM_OBJECT_WLOCK(fs.object); PCPU_INC(cnt.v_cow_faults); curthread->td_cow++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } fs.lookup_still_valid = TRUE; if (fs.map->timestamp != map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); /* * If we don't need the page any longer, put it on the inactive * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) goto RetryFault; return (result); } if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } } /* * If the page was filled by a pager, update the map entry's * last read offset. Since the pager does not return the * actual set of pages that it read, this update is based on * the requested set. Typically, the requested and actual * sets are the same. * * XXX The following assignment modifies the map * without holding a write lock on it. */ if (hardfault) fs.entry->next_read = fs.pindex + faultcount - reqpage; if ((prot & VM_PROT_WRITE) != 0 || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_object_set_writeable_dirty(fs.object); /* * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if (fs.entry->eflags & MAP_ENTRY_NOSYNC) { if (fs.m->dirty == 0) fs.m->oflags |= VPO_NOSYNC; } else { fs.m->oflags &= ~VPO_NOSYNC; } /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also tell the backing pager, if any, that it should remove * any swap backing since the page is now dirty. 
*/ if (((fault_type & VM_PROT_WRITE) != 0 && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_page_dirty(fs.m); vm_pager_page_unswapped(fs.m); } } /* * Page had better still be busy */ KASSERT(fs.m->oflags & VPO_BUSY, ("vm_fault: page %p not busy!", fs.m)); /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ KASSERT(fs.m->valid == VM_PAGE_BITS_ALL, ("vm_fault: page %p partially invalid", fs.m)); VM_OBJECT_WUNLOCK(fs.object); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired); if ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 && wired == 0) vm_fault_prefault(fs.map->pmap, vaddr, fs.entry); VM_OBJECT_WLOCK(fs.object); vm_page_lock(fs.m); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if (fault_flags & VM_FAULT_CHANGE_WIRING) { if (wired) vm_page_wire(fs.m); else vm_page_unwire(fs.m, 1); } else vm_page_activate(fs.m); if (m_hold != NULL) { *m_hold = fs.m; vm_page_hold(fs.m); } vm_page_unlock(fs.m); vm_page_wakeup(fs.m); /* * Unlock everything, and return */ unlock_and_deallocate(&fs); if (hardfault) { PCPU_INC(cnt.v_io_faults); curthread->td_ru.ru_majflt++; } else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); }
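The tail of vm_fault() above is the kernel side of two very common events: a first touch of anonymous memory (the zero-fill branch) and a write that must be resolved by copy-on-write, both of which end in pmap_enter() and a bump of the thread's soft/hard fault counters (ru_minflt/ru_majflt). The following is a minimal userland sketch, not part of the kernel sources collected here, that provokes zero-fill faults and reads the counters back through getrusage(2); it assumes a FreeBSD (or other POSIX) userland and a 4 KB page size.

/*
 * Illustrative userland sketch: touch freshly mapped anonymous pages so
 * that vm_fault() resolves them via the zero-fill path, then read the
 * soft-fault counter that the fault handler increments (ru_minflt).
 */
#include <sys/mman.h>
#include <sys/resource.h>
#include <stdio.h>

int
main(void)
{
	struct rusage ru0, ru1;
	const size_t pgsz = 4096;		/* assumption: 4 KB pages */
	const size_t len = 16 * pgsz;
	char *p;

	getrusage(RUSAGE_SELF, &ru0);

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return (1);
	}

	/* The first store to each page takes a zero-fill soft fault. */
	for (size_t off = 0; off < len; off += pgsz)
		p[off] = 1;

	getrusage(RUSAGE_SELF, &ru1);
	printf("soft faults taken: %ld\n", ru1.ru_minflt - ru0.ru_minflt);

	munmap(p, len);
	return (0);
}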
/* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *oldcred; struct uidinfo *euip = NULL; register_t *stack_base; int error, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts = NULL, *newsigacts = NULL; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *oldtextvp = NULL, *newtextvp; int credential_changing; int textset; #ifdef MAC struct label *interpvplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ bzero(imgp, sizeof(*imgp)); imgp->proc = p; imgp->attr = &attr; imgp->args = args; oldcred = p->p_ucred; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif /* * Translate the file name. namei() returns a vnode pointer * in ni_vp among other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ if (args->fname != NULL) { NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); } SDT_PROBE1(proc, , , exec, args->fname); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif error = namei(&nd); if (error) goto exec_fail; newtextvp = nd.ni_vp; imgp->vp = newtextvp; } else { AUDIT_ARG_FD(args->fd); /* * Descriptors opened only with O_EXEC or O_RDONLY are allowed. */ error = fgetvp_exec(td, args->fd, &cap_fexecve_rights, &newtextvp); if (error) goto exec_fail; vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = VOP_IS_TEXT(imgp->vp); VOP_SET_TEXT(imgp->vp); error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; imgp->proc->p_fctl0 = 0; /* * Implement image setuid/setgid. * * Determine new credentials before attempting image activators * so that it can be used by process_exec handlers to determine * credential/setid changes. 
* * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * We disable setuid/setgid/etc in capability mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = 0; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp); credential_changing |= will_transition; #endif /* Don't inherit PROC_PDEATHSIG_CTL value if setuid/setgid. */ if (credential_changing) imgp->proc->p_pdeathsig = 0; if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { imgp->credential_setid = true; VOP_UNLOCK(imgp->vp, 0); imgp->newcred = crdup(oldcred); if (attr.va_mode & S_ISUID) { euip = uifind(attr.va_uid); change_euid(imgp->newcred, euip); } vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (attr.va_mode & S_ISGID) change_egid(imgp->newcred, attr.va_gid); /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } else { /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { VOP_UNLOCK(imgp->vp, 0); imgp->newcred = crdup(oldcred); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } } /* The new credentials are installed into the process later. */ /* * Do the best to calculate the full path to the image file. */ if (args->fname != NULL && args->fname[0] == '/') imgp->execpath = args->fname; else { VOP_UNLOCK(imgp->vp, 0); if (vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0) imgp->execpath = args->fname; vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); } /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. 
*/ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) VOP_UNSET_TEXT(imgp->vp); error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ VOP_UNSET_TEXT(imgp->vp); /* free name buffer and old vnode */ if (args->fname != NULL) NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC mac_execve_interpreter_enter(newtextvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td); imgp->opened = 0; } vput(newtextvp); vm_object_deallocate(imgp->object); imgp->object = NULL; imgp->credential_setid = false; if (imgp->newcred != NULL) { crfree(imgp->newcred); imgp->newcred = NULL; } imgp->execpath = NULL; free(imgp->freepath, M_TEMP); imgp->freepath = NULL; /* set new name to that of the interpreter */ NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); args->fname = imgp->interpreter_name; goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp, 0); if (disallow_high_osrel && P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { error = ENOEXEC; uprintf("Osrel %d for image %s too high\n", p->p_osrel, imgp->execpath != NULL ? imgp->execpath : "<unresolved>"); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* ABI enforces the use of Capsicum. Switch into capabilities mode. */ if (SV_PROC_FLAG(p, SV_CAPSICUM)) sys_cap_enter(td, NULL); /* * Copy out strings (args and env) and initialize stack base. */ stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); /* * Stack setup. */ error = (*p->p_sysent->sv_fixup)(&stack_base, imgp); if (error != 0) { vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } if (args->fdp != NULL) { /* Install a brand new file descriptor table. */ fdinstall_remapped(td, args->fdp); args->fdp = NULL; } else { /* * Keep on using the existing file descriptor table. For * security and other reasons, the file descriptor table * cannot be shared after an exec. */ fdunshare(td); /* close files on exec */ fdcloseexec(td); } /* * Malloc things before we need locks. */ i = exec_args_get_begin_envv(imgp->args) - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. 
*/ if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); if (oldsigacts) p->p_sigacts = newsigacts; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) p->p_flag2 &= ~P2_NOTRACE; if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); /* STOPs are no longer ignored, arrange for AST */ signotify(td); } /* * Implement image setuid/setgid installation. */ if (imgp->credential_setid) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracecred != NULL && priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED)) ktrprocexec(p, &tracecred, &tracevp); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * Both fdsetugidsafety() and fdcheckstd() may call functions * taking sleepable locks, so temporarily drop our locks. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); fdsetugidsafety(td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto exec_fail_dealloc; PROC_LOCK(p); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, imgp->newcred, imgp->vp, interpvplabel, imgp); } #endif } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; } /* * Set the new credentials. */ if (imgp->newcred != NULL) { proc_set_cred(p, imgp->newcred); crfree(oldcred); oldcred = NULL; } /* * Store the vp for use in procfs. This vnode was referenced by namei * or fgetvp_exec. */ oldtextvp = p->p_textvp; p->p_textvp = newtextvp; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; PROC_UNLOCK(p); #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. 
*/ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { VOP_UNLOCK(imgp->vp, 0); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } #endif /* Set values passed into the program in registers. */ (*p->p_sysent->sv_setregs)(td, imgp, (u_long)(uintptr_t)stack_base); vfs_mark_atime(imgp->vp, td->td_ucred); SDT_PROBE1(proc, , , exec__success, args->fname); exec_fail_dealloc: if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (args->fname) NDFREE(&nd, NDF_ONLY_PNBUF); if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); if (error != 0) vput(imgp->vp); else VOP_UNLOCK(imgp->vp, 0); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { if (p->p_ptevents & PTRACE_EXEC) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_EXEC) td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); } /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); } else { exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); SDT_PROBE1(proc, , , exec__failure, error); } if (imgp->newcred != NULL && oldcred != NULL) crfree(imgp->newcred); #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif exec_free_args(args); /* * Handle deferred decrement of ref counts. */ if (oldtextvp != NULL) vrele(oldtextvp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); if (euip != NULL) uifree(euip); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(td, 0, SIGABRT); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif /* * We don't want cpu_set_syscall_retval() to overwrite any of * the register values put in place by exec_setregs(). * Implementations of cpu_set_syscall_retval() will leave * registers unmodified when returning EJUSTRETURN. */ return (error == 0 ? EJUSTRETURN : error); }
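do_execve() above services both the path-based and the descriptor-based exec flavours; in the latter case (args->fname == NULL) the vnode comes from fgetvp_exec(), which only accepts descriptors opened O_EXEC or O_RDONLY. A small userland counterpart, not part of the sources above and assuming a FreeBSD userland with fexecve(2) and O_EXEC, looks like this:

/*
 * Illustrative sketch of the descriptor-based exec path: open the image
 * with O_EXEC (matching the fgetvp_exec() check in the kernel) and exec
 * it in a child with fexecve(2). /bin/echo is only an example image.
 */
#include <sys/types.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

extern char **environ;

int
main(void)
{
	char *argv[] = { "echo", "exec via descriptor", NULL };
	pid_t pid;
	int fd, status;

	fd = open("/bin/echo", O_EXEC);
	if (fd == -1) {
		perror("open");
		return (1);
	}
	pid = fork();
	if (pid == 0) {
		fexecve(fd, argv, environ);
		perror("fexecve");		/* reached only on failure */
		_exit(127);
	}
	waitpid(pid, &status, 0);
	close(fd);
	return (WIFEXITED(status) ? WEXITSTATUS(status) : 1);
}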
static int link_elf_load_file(linker_class_t cls, const char *filename, linker_file_t *result) { struct nameidata nd; struct thread *td = curthread; /* XXX */ Elf_Ehdr *hdr; Elf_Shdr *shdr; Elf_Sym *es; int nbytes, i, j; vm_offset_t mapbase; size_t mapsize; int error = 0; ssize_t resid; int flags; elf_file_t ef; linker_file_t lf; int symtabindex; int symstrindex; int shstrindex; int nsym; int pb, rl, ra; int alignmask; shdr = NULL; lf = NULL; mapsize = 0; hdr = NULL; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return error; NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) { error = ENOEXEC; goto out; } #ifdef MAC error = mac_kld_check_load(td->td_ucred, nd.ni_vp); if (error) { goto out; } #endif /* Read the elf header from the file. */ hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)hdr, sizeof(*hdr), 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = ENOEXEC; goto out; } if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { link_elf_error(filename, "Unsupported file layout"); error = ENOEXEC; goto out; } if (hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_version != EV_CURRENT) { link_elf_error(filename, "Unsupported file version"); error = ENOEXEC; goto out; } if (hdr->e_type != ET_REL) { error = ENOSYS; goto out; } if (hdr->e_machine != ELF_TARG_MACH) { link_elf_error(filename, "Unsupported machine"); error = ENOEXEC; goto out; } lf = linker_make_file(filename, &link_elf_class); if (!lf) { error = ENOMEM; goto out; } ef = (elf_file_t) lf; ef->nprogtab = 0; ef->e_shdr = 0; ef->nreltab = 0; ef->nrelatab = 0; /* Allocate and read in the section header */ nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0 || hdr->e_shentsize != sizeof(Elf_Shdr)) { error = ENOEXEC; goto out; } shdr = malloc(nbytes, M_LINKER, M_WAITOK); ef->e_shdr = shdr; error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid) { error = ENOEXEC; goto out; } /* Scan the section header for information and table sizing. 
*/ nsym = 0; symtabindex = -1; symstrindex = -1; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: ef->nprogtab++; break; case SHT_SYMTAB: nsym++; symtabindex = i; symstrindex = shdr[i].sh_link; break; case SHT_REL: ef->nreltab++; break; case SHT_RELA: ef->nrelatab++; break; case SHT_STRTAB: break; } } if (ef->nprogtab == 0) { link_elf_error(filename, "file has no contents"); error = ENOEXEC; goto out; } if (nsym != 1) { /* Only allow one symbol table for now */ link_elf_error(filename, "file has no valid symbol table"); error = ENOEXEC; goto out; } if (symstrindex < 0 || symstrindex > hdr->e_shnum || shdr[symstrindex].sh_type != SHT_STRTAB) { link_elf_error(filename, "file has invalid symbol strings"); error = ENOEXEC; goto out; } /* Allocate space for tracking the load chunks */ if (ef->nprogtab != 0) ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nreltab != 0) ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab), M_LINKER, M_WAITOK | M_ZERO); if (ef->nrelatab != 0) ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab), M_LINKER, M_WAITOK | M_ZERO); if (symtabindex == -1) panic("lost symbol table index"); /* Allocate space for and load the symbol table */ ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym); ef->ddbsymtab = malloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->ddbsymtab, shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } if (symstrindex == -1) panic("lost symbol string index"); /* Allocate space for and load the symbol strings */ ef->ddbstrcnt = shdr[symstrindex].sh_size; ef->ddbstrtab = malloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, ef->ddbstrtab, shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Do we have a string table for the section names? */ shstrindex = -1; if (hdr->e_shstrndx != 0 && shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) { shstrindex = hdr->e_shstrndx; ef->shstrcnt = shdr[shstrindex].sh_size; ef->shstrtab = malloc(shdr[shstrindex].sh_size, M_LINKER, M_WAITOK); error = vn_rdwr(UIO_READ, nd.ni_vp, ef->shstrtab, shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } } /* Size up code/data(progbits) and bss(nobits). */ alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: alignmask = shdr[i].sh_addralign - 1; mapsize += alignmask; mapsize &= ~alignmask; mapsize += shdr[i].sh_size; break; } } /* * We know how much space we need for the text/data/bss/etc. * This stuff needs to be in a single chunk so that profiling etc * can get the bounds and gdb can associate offsets with modules */ ef->object = vm_object_allocate(OBJT_DEFAULT, round_page(mapsize) >> PAGE_SHIFT); if (ef->object == NULL) { error = ENOMEM; goto out; } ef->address = (caddr_t) vm_map_min(kernel_map); /* * In order to satisfy amd64's architectural requirements on the * location of code and data in the kernel's address space, request a * mapping that is above the kernel. 
*/ #ifdef __amd64__ mapbase = KERNBASE; #else mapbase = VM_MIN_KERNEL_ADDRESS; #endif error = vm_map_find(kernel_map, ef->object, 0, &mapbase, round_page(mapsize), 0, VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { vm_object_deallocate(ef->object); ef->object = 0; goto out; } /* Wire the pages */ error = vm_map_wire(kernel_map, mapbase, mapbase + round_page(mapsize), VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); if (error != KERN_SUCCESS) { error = ENOMEM; goto out; } /* Inform the kld system about the situation */ lf->address = ef->address = (caddr_t)mapbase; lf->size = mapsize; /* * Now load code/data(progbits), zero bss(nobits), allocate space for * and load relocs */ pb = 0; rl = 0; ra = 0; alignmask = 0; for (i = 0; i < hdr->e_shnum; i++) { if (shdr[i].sh_size == 0) continue; switch (shdr[i].sh_type) { case SHT_PROGBITS: case SHT_NOBITS: alignmask = shdr[i].sh_addralign - 1; mapbase += alignmask; mapbase &= ~alignmask; if (ef->shstrtab && shdr[i].sh_name != 0) ef->progtab[pb].name = ef->shstrtab + shdr[i].sh_name; else if (shdr[i].sh_type == SHT_PROGBITS) ef->progtab[pb].name = "<<PROGBITS>>"; else ef->progtab[pb].name = "<<NOBITS>>"; if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) ef->progtab[pb].addr = dpcpu_alloc(shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].name != NULL && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) ef->progtab[pb].addr = vnet_data_alloc(shdr[i].sh_size); #endif else ef->progtab[pb].addr = (void *)(uintptr_t)mapbase; if (ef->progtab[pb].addr == NULL) { error = ENOSPC; goto out; } ef->progtab[pb].size = shdr[i].sh_size; ef->progtab[pb].sec = i; if (shdr[i].sh_type == SHT_PROGBITS) { error = vn_rdwr(UIO_READ, nd.ni_vp, ef->progtab[pb].addr, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } /* Initialize the per-cpu or vnet area. */ if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) dpcpu_copy(ef->progtab[pb].addr, shdr[i].sh_size); #ifdef VIMAGE else if (ef->progtab[pb].addr != (void *)mapbase && !strcmp(ef->progtab[pb].name, VNET_SETNAME)) vnet_data_copy(ef->progtab[pb].addr, shdr[i].sh_size); #endif } else bzero(ef->progtab[pb].addr, shdr[i].sh_size); /* Update all symbol values with the offset. 
*/ for (j = 0; j < ef->ddbsymcnt; j++) { es = &ef->ddbsymtab[j]; if (es->st_shndx != i) continue; es->st_value += (Elf_Addr)ef->progtab[pb].addr; } mapbase += shdr[i].sh_size; pb++; break; case SHT_REL: ef->reltab[rl].rel = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel); ef->reltab[rl].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->reltab[rl].rel, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } rl++; break; case SHT_RELA: ef->relatab[ra].rela = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela); ef->relatab[ra].sec = shdr[i].sh_info; error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->relatab[ra].rela, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) goto out; if (resid != 0){ error = EINVAL; goto out; } ra++; break; } } if (pb != ef->nprogtab) panic("lost progbits"); if (rl != ef->nreltab) panic("lost reltab"); if (ra != ef->nrelatab) panic("lost relatab"); if (mapbase != (vm_offset_t)ef->address + mapsize) panic("mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n", (u_long)mapbase, ef->address, (u_long)mapsize, (u_long)(vm_offset_t)ef->address + mapsize); /* Local intra-module relocations */ link_elf_reloc_local(lf); /* Pull in dependencies */ VOP_UNLOCK(nd.ni_vp, 0); error = linker_load_dependencies(lf); vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY); if (error) goto out; /* External relocations */ error = relocate_file(ef); if (error) goto out; /* Notify MD code that a module is being loaded. */ error = elf_cpu_load_file(lf); if (error) goto out; *result = lf; out: VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (error && lf) linker_file_unload(lf, LINKER_UNLOAD_FORCE); if (hdr) free(hdr, M_LINKER); return error; }
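From userland the routine above is reached through the kld interface: kldload(2) hands the path to the kernel linker, which selects a linker class and, for relocatable ELF object modules, ends up in a loader such as link_elf_load_file(). A minimal sketch follows, assuming a FreeBSD system, sufficient privilege to load modules, and an example module path.

/*
 * Illustrative userland sketch: load and unload a kernel module via
 * kldload(2)/kldunload(2). The module path is only an example.
 */
#include <sys/param.h>
#include <sys/linker.h>
#include <stdio.h>

int
main(void)
{
	int fileid;

	fileid = kldload("/boot/kernel/cd9660.ko");
	if (fileid == -1) {
		perror("kldload");
		return (1);
	}
	printf("loaded as kld id %d\n", fileid);

	if (kldunload(fileid) == -1) {
		perror("kldunload");
		return (1);
	}
	return (0);
}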
/* * Destroy old address space, and allocate a new stack. * The new stack is only sgrowsiz large because it is grown * automatically on a page fault. */ int exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv) { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_object_t obj; struct rlimit rlim_stack; vm_offset_t sv_minuser, stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* May be called with Giant held */ EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser && cpu_exec_vmspace_reuse(p, map)) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); /* * An exec terminates mlockall(MCL_FUTURE), ASLR state * must be re-evaluated. */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE | MAP_ASLR | MAP_ASLR_IGNSTART); vm_map_unlock(map); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } map->flags |= imgp->map_flags; /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj != NULL) { vm_object_reference(obj); error = vm_map_fixed(map, obj, 0, sv->sv_shared_page_base, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); if (error != KERN_SUCCESS) { vm_object_deallocate(obj); return (vm_mmap_to_errno(error)); } } /* Allocate a new stack */ if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error != KERN_SUCCESS) return (vm_mmap_to_errno(error)); /* * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they * are still used to enforce the stack rlimit on the process stack. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)stack_addr; return (0); }
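exec_new_vmspace() consults RLIMIT_STACK only when the image supplies its own stack size (imgp->stack_sz != 0); otherwise the reservation comes from sv_maxssiz or the maxssiz tunable, and the resource limit is enforced later as the stack grows. The limit itself survives exec, which the following userland sketch (illustrative only, assuming /bin/sh is available) makes visible by lowering the soft limit and then exec'ing a shell that prints it.

/*
 * Illustrative sketch: RLIMIT_STACK set before exec is inherited by the
 * new image; "ulimit -s" in the exec'ed shell reports the lowered value.
 */
#include <sys/resource.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_STACK, &rl) == -1) {
		perror("getrlimit");
		return (1);
	}
	printf("stack soft %ju hard %ju\n",
	    (uintmax_t)rl.rlim_cur, (uintmax_t)rl.rlim_max);

	rl.rlim_cur /= 2;			/* lower the soft limit */
	if (setrlimit(RLIMIT_STACK, &rl) == -1) {
		perror("setrlimit");
		return (1);
	}

	execl("/bin/sh", "sh", "-c", "ulimit -s", (char *)NULL);
	perror("execl");
	return (1);
}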