Example No. 1
void
drm_gem_object_release(struct drm_gem_object *obj)
{

    /*
     * obj->vm_obj can be NULL for private gem objects.
     */
    vm_object_deallocate(obj->vm_obj);
}
Example No. 2
/*
 * MPSAFE
 */
static vm_object_t
phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
    vm_ooffset_t foff, struct ucred *cred)
{
	vm_object_t object, object1;
	vm_pindex_t pindex;

	/*
	 * Offset should be page aligned.
	 */
	if (foff & PAGE_MASK)
		return (NULL);

	pindex = OFF_TO_IDX(foff + PAGE_MASK + size);

	if (handle != NULL) {
		mtx_lock(&phys_pager_mtx);
		/*
		 * Look up pager, creating as necessary.
		 */
		object1 = NULL;
		object = vm_pager_object_lookup(&phys_pager_object_list, handle);
		if (object == NULL) {
			/*
			 * Allocate object and associate it with the pager.
			 */
			mtx_unlock(&phys_pager_mtx);
			object1 = vm_object_allocate(OBJT_PHYS, pindex);
			mtx_lock(&phys_pager_mtx);
			object = vm_pager_object_lookup(&phys_pager_object_list,
			    handle);
			if (object != NULL) {
				/*
				 * We raced with other thread while
				 * allocating object.
				 */
				if (pindex > object->size)
					object->size = pindex;
			} else {
				object = object1;
				object1 = NULL;
				object->handle = handle;
				TAILQ_INSERT_TAIL(&phys_pager_object_list, object,
				    pager_object_list);
			}
		} else {
			if (pindex > object->size)
				object->size = pindex;
		}
		mtx_unlock(&phys_pager_mtx);
		vm_object_deallocate(object1);
	} else {
		object = vm_object_allocate(OBJT_PHYS, pindex);
	}

	return (object);
}
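The lookup-or-create dance above is the part worth internalizing: the candidate object is allocated with phys_pager_mtx dropped, the list is re-checked after re-locking, and the loser of the race is released with vm_object_deallocate(), which, as the unconditional call on object1 shows, also tolerates NULL. Below is a minimal sketch of the same pattern against a hypothetical example_mtx/example_obj pair (the usual sys/mutex.h and vm/vm_object.h headers are assumed, and reference handling for the returned object is omitted); it is illustrative only, not code from any of the sources quoted here.

static struct mtx example_mtx;		/* assumed mtx_init()ed elsewhere */
static vm_object_t example_obj;

static vm_object_t
example_lookup_or_create(vm_pindex_t pindex)
{
	vm_object_t candidate = NULL;

	mtx_lock(&example_mtx);
	if (example_obj == NULL) {
		/* Allocate without holding the mutex, then re-check. */
		mtx_unlock(&example_mtx);
		candidate = vm_object_allocate(OBJT_PHYS, pindex);
		mtx_lock(&example_mtx);
		if (example_obj == NULL) {
			example_obj = candidate;	/* we won the race */
			candidate = NULL;
		}
	}
	mtx_unlock(&example_mtx);
	/* Like object1 above, candidate may be NULL here; the call is safe. */
	vm_object_deallocate(candidate);
	return (example_obj);
}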
Example No. 3
static void
shm_drop(struct shmfd *shmfd)
{

	if (refcount_release(&shmfd->shm_refs)) {
#ifdef MAC
		mac_posixshm_destroy(shmfd);
#endif
		vm_object_deallocate(shmfd->shm_object);
		free(shmfd, M_SHMFD);
	}
}
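shm_drop() shows the canonical shape for a reference-counted descriptor that owns a VM object: only the release of the last reference triggers vm_object_deallocate() and the free of the wrapper. A bare-bones sketch of that hold/drop pairing follows, using a hypothetical struct example_desc and the generic M_TEMP malloc type as stand-ins (refcount_acquire()/refcount_release() are the sys/refcount.h primitives used above).

struct example_desc {
	vm_object_t	ed_object;	/* backing VM object */
	u_int		ed_refs;	/* refcount_init()ed to 1 at creation */
};

static void
example_hold(struct example_desc *ed)
{
	refcount_acquire(&ed->ed_refs);
}

static void
example_drop(struct example_desc *ed)
{
	/* Only the final reference tears down the object and the wrapper. */
	if (refcount_release(&ed->ed_refs)) {
		vm_object_deallocate(ed->ed_object);
		free(ed, M_TEMP);
	}
}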
Example No. 4
/*
 * Helper routines to allow the backing object of a shared memory file
 * descriptor to be mapped in the kernel.
 */
int
shm_map(struct file *fp, size_t size, off_t offset, void **memp)
{
	struct shmfd *shmfd;
	vm_offset_t kva, ofs;
	vm_object_t obj;
	int rv;

	if (fp->f_type != DTYPE_SHM)
		return (EINVAL);
	shmfd = fp->f_data;
	obj = shmfd->shm_object;
	VM_OBJECT_LOCK(obj);
	/*
	 * XXXRW: This validation is probably insufficient, and subject to
	 * sign errors.  It should be fixed.
	 */
	if (offset >= shmfd->shm_size ||
	    offset + size > round_page(shmfd->shm_size)) {
		VM_OBJECT_UNLOCK(obj);
		return (EINVAL);
	}

	shmfd->shm_kmappings++;
	vm_object_reference_locked(obj);
	VM_OBJECT_UNLOCK(obj);

	/* Map the object into the kernel_map and wire it. */
	kva = vm_map_min(kernel_map);
	ofs = offset & PAGE_MASK;
	offset = trunc_page(offset);
	size = round_page(size + ofs);
	rv = vm_map_find(kernel_map, obj, offset, &kva, size,
	    VMFS_ALIGNED_SPACE, VM_PROT_READ | VM_PROT_WRITE,
	    VM_PROT_READ | VM_PROT_WRITE, 0);
	if (rv == KERN_SUCCESS) {
		rv = vm_map_wire(kernel_map, kva, kva + size,
		    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
		if (rv == KERN_SUCCESS) {
			*memp = (void *)(kva + ofs);
			return (0);
		}
		vm_map_remove(kernel_map, kva, kva + size);
	} else
		vm_object_deallocate(obj);

	/* On failure, drop our mapping reference. */
	VM_OBJECT_LOCK(obj);
	shmfd->shm_kmappings--;
	VM_OBJECT_UNLOCK(obj);

	return (vm_mmap_to_errno(rv));
}
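The reference dance in shm_map() is deliberate: vm_object_reference_locked() takes an extra reference because a successful vm_map_find() hands the caller's reference over to the new map entry, while the shmfd keeps its own. The matching teardown is then just a vm_map_remove() over the wired range. The sketch below is a hypothetical counterpart, not the real shm_unmap() (which also validates the map entry and drops shm_kmappings):

static int
example_shm_unmap(void *mem, size_t size)
{
	vm_offset_t kva, end;

	kva = trunc_page((vm_offset_t)mem);
	end = round_page((vm_offset_t)mem + size);
	/*
	 * Removing the entry releases the reference that vm_map_find()
	 * consumed above; no explicit vm_object_deallocate() is needed here.
	 */
	return (vm_mmap_to_errno(vm_map_remove(kernel_map, kva, end)));
}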
Example No. 5
DECLHIDDEN(int) rtR0MemAllocEx(size_t cb, uint32_t fFlags, PRTMEMHDR *ppHdr)
{
    size_t      cbAllocated = cb;
    PRTMEMHDR   pHdr        = NULL;

#ifdef RT_ARCH_AMD64
    /*
     * Things are a bit more complicated on AMD64 for executable memory
     * because we need to be in the ~2GB..~0 range for code.
     */
    if (fFlags & RTMEMHDR_FLAG_EXEC)
    {
        if (fFlags & RTMEMHDR_FLAG_ANY_CTX)
            return VERR_NOT_SUPPORTED;

# ifdef USE_KMEM_ALLOC_PROT
        pHdr = (PRTMEMHDR)kmem_alloc_prot(kernel_map, cb + sizeof(*pHdr),
                                          VM_PROT_ALL, VM_PROT_ALL, KERNBASE);
# else
        vm_object_t pVmObject = NULL;
        vm_offset_t Addr = KERNBASE;
        cbAllocated = RT_ALIGN_Z(cb + sizeof(*pHdr), PAGE_SIZE);

        pVmObject = vm_object_allocate(OBJT_DEFAULT, cbAllocated >> PAGE_SHIFT);
        if (!pVmObject)
            return VERR_NO_EXEC_MEMORY;

        /* Addr contains a start address vm_map_find will start searching for suitable space at. */
        int rc = vm_map_find(kernel_map, pVmObject, 0, &Addr,
                             cbAllocated, TRUE, VM_PROT_ALL, VM_PROT_ALL, 0);
        if (rc == KERN_SUCCESS)
        {
            rc = vm_map_wire(kernel_map, Addr, Addr + cbAllocated,
                             VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
            if (rc == KERN_SUCCESS)
            {
                pHdr = (PRTMEMHDR)Addr;

                if (fFlags & RTMEMHDR_FLAG_ZEROED)
                    bzero(pHdr, cbAllocated);
            }
            else
                vm_map_remove(kernel_map,
                              Addr,
                              Addr + cbAllocated);
        }
        else
            vm_object_deallocate(pVmObject);
# endif
    }
    else
#endif
    {
Example No. 6
static int rtR0MemObjFreeBSDAllocPhysPages(PPRTR0MEMOBJINTERNAL ppMem, RTR0MEMOBJTYPE enmType,
                                           size_t cb,
                                           RTHCPHYS PhysHighest, size_t uAlignment,
                                           bool fContiguous, int rcNoMem)
{
    uint32_t   cPages = atop(cb);
    vm_paddr_t VmPhysAddrHigh;

    /* create the object. */
    PRTR0MEMOBJFREEBSD pMemFreeBSD = (PRTR0MEMOBJFREEBSD)rtR0MemObjNew(sizeof(*pMemFreeBSD),
                                                                       enmType, NULL, cb);
    if (!pMemFreeBSD)
        return VERR_NO_MEMORY;

    pMemFreeBSD->pObject = vm_object_allocate(OBJT_PHYS, atop(cb));

    if (PhysHighest != NIL_RTHCPHYS)
        VmPhysAddrHigh = PhysHighest;
    else
        VmPhysAddrHigh = ~(vm_paddr_t)0;

    int rc = rtR0MemObjFreeBSDPhysAllocHelper(pMemFreeBSD->pObject, cPages, VmPhysAddrHigh,
                                              uAlignment, fContiguous, true, rcNoMem);
    if (RT_SUCCESS(rc))
    {
        if (fContiguous)
        {
            Assert(enmType == RTR0MEMOBJTYPE_PHYS);
#if __FreeBSD_version >= 1000030
            VM_OBJECT_WLOCK(pMemFreeBSD->pObject);
#else
            VM_OBJECT_LOCK(pMemFreeBSD->pObject);
#endif
            pMemFreeBSD->Core.u.Phys.PhysBase = VM_PAGE_TO_PHYS(vm_page_find_least(pMemFreeBSD->pObject, 0));
#if __FreeBSD_version >= 1000030
            VM_OBJECT_WUNLOCK(pMemFreeBSD->pObject);
#else
            VM_OBJECT_UNLOCK(pMemFreeBSD->pObject);
#endif
            pMemFreeBSD->Core.u.Phys.fAllocated = true;
        }

        *ppMem = &pMemFreeBSD->Core;
    }
    else
    {
        vm_object_deallocate(pMemFreeBSD->pObject);
        rtR0MemObjDelete(&pMemFreeBSD->Core);
    }

    return rc;
}
Example No. 7
vm_object_t
vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len,
	       vm_paddr_t hpa)
{
	int error;
	vm_object_t obj;
	struct sglist *sg;

	sg = sglist_alloc(1, M_WAITOK);
	error = sglist_append_phys(sg, hpa, len);
	KASSERT(error == 0, ("error %d appending physaddr to sglist", error));

	obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL);
	if (obj != NULL) {
		/*
		 * VT-x ignores the MTRR settings when figuring out the
		 * memory type for translations obtained through EPT.
		 *
		 * Therefore we explicitly force the pages provided by
		 * this object to be mapped as uncacheable.
		 */
		VM_OBJECT_WLOCK(obj);
		error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE);
		VM_OBJECT_WUNLOCK(obj);
		if (error != KERN_SUCCESS) {
			panic("vmm_mmio_alloc: vm_object_set_memattr error %d",
				error);
		}
		error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
				    VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0);
		if (error != KERN_SUCCESS) {
			vm_object_deallocate(obj);
			obj = NULL;
		}
	}

	/*
	 * Drop the reference on the sglist.
	 *
	 * If the scatter/gather object was successfully allocated then it
	 * has incremented the reference count on the sglist. Dropping the
	 * initial reference count ensures that the sglist will be freed
	 * when the object is deallocated.
	 * 
	 * If the object could not be allocated then we end up freeing the
	 * sglist.
	 */
	sglist_free(sg);

	return (obj);
}
Example No. 8
int ttm_tt_swapin(struct ttm_tt *ttm)
{
	vm_object_t obj;
	vm_page_t from_page, to_page;
	int i, ret, rv;

	obj = ttm->swap_storage;

	VM_OBJECT_LOCK(obj);
	vm_object_pip_add(obj, 1);
	for (i = 0; i < ttm->num_pages; ++i) {
		from_page = vm_page_grab(obj, i, VM_ALLOC_NORMAL |
						 VM_ALLOC_RETRY);
		if (from_page->valid != VM_PAGE_BITS_ALL) {
			if (vm_pager_has_page(obj, i)) {
				rv = vm_pager_get_page(obj, &from_page, 1);
				if (rv != VM_PAGER_OK) {
					vm_page_free(from_page);
					ret = -EIO;
					goto err_ret;
				}
			} else {
				vm_page_zero_invalid(from_page, TRUE);
			}
		}
		to_page = ttm->pages[i];
		if (unlikely(to_page == NULL)) {
			ret = -ENOMEM;
			vm_page_wakeup(from_page);
			goto err_ret;
		}
		pmap_copy_page(VM_PAGE_TO_PHYS(from_page),
			       VM_PAGE_TO_PHYS(to_page));
		vm_page_wakeup(from_page);
	}
	vm_object_pip_wakeup(obj);
	VM_OBJECT_UNLOCK(obj);

	if (!(ttm->page_flags & TTM_PAGE_FLAG_PERSISTENT_SWAP))
		vm_object_deallocate(obj);
	ttm->swap_storage = NULL;
	ttm->page_flags &= ~TTM_PAGE_FLAG_SWAPPED;
	return (0);

err_ret:
	vm_object_pip_wakeup(obj);
	VM_OBJECT_UNLOCK(obj);
	return (ret);
}
Example No. 9
static int rtR0MemObjFreeBSDAllocHelper(PRTR0MEMOBJFREEBSD pMemFreeBSD, bool fExecutable,
                                        vm_paddr_t VmPhysAddrHigh, bool fContiguous, int rcNoMem)
{
    vm_offset_t MapAddress = vm_map_min(kernel_map);
    size_t      cPages = atop(pMemFreeBSD->Core.cb);
    int         rc;

    pMemFreeBSD->pObject = vm_object_allocate(OBJT_PHYS, cPages);

    /* No additional object reference for auto-deallocation upon unmapping. */
#if __FreeBSD_version >= 1000055
    rc = vm_map_find(kernel_map, pMemFreeBSD->pObject, 0,
                     &MapAddress, pMemFreeBSD->Core.cb, 0, VMFS_ANY_SPACE,
                     fExecutable ? VM_PROT_ALL : VM_PROT_RW, VM_PROT_ALL, 0);
#else
    rc = vm_map_find(kernel_map, pMemFreeBSD->pObject, 0,
                     &MapAddress, pMemFreeBSD->Core.cb, VMFS_ANY_SPACE,
                     fExecutable ? VM_PROT_ALL : VM_PROT_RW, VM_PROT_ALL, 0);
#endif

    if (rc == KERN_SUCCESS)
    {
        rc = rtR0MemObjFreeBSDPhysAllocHelper(pMemFreeBSD->pObject, cPages,
                                              VmPhysAddrHigh, PAGE_SIZE, fContiguous,
                                              false, rcNoMem);
        if (RT_SUCCESS(rc))
        {
            vm_map_wire(kernel_map, MapAddress, MapAddress + pMemFreeBSD->Core.cb,
                        VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);

            /* Store start address */
            pMemFreeBSD->Core.pv = (void *)MapAddress;
            return VINF_SUCCESS;
        }

        vm_map_remove(kernel_map, MapAddress, MapAddress + pMemFreeBSD->Core.cb);
    }
    else
    {
        rc = rcNoMem; /** @todo fix translation (borrow from darwin) */
        vm_object_deallocate(pMemFreeBSD->pObject);
    }

    rtR0MemObjDelete(&pMemFreeBSD->Core);
    return rc;
}
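The "no additional object reference" comment above states the ownership rule that runs through most of these examples: on success vm_map_find() takes over the allocation's only reference, so a later vm_map_remove() frees the object automatically, and only the failure path has to call vm_object_deallocate() itself. Reduced to a sketch with hypothetical names (argument order follows the __FreeBSD_version >= 1000055 variant shown above):

static int
example_map_new_object(vm_map_t map, vm_size_t size, vm_offset_t *addrp)
{
	vm_object_t obj;
	vm_offset_t addr;
	int rc;

	obj = vm_object_allocate(OBJT_PHYS, atop(size));
	addr = vm_map_min(map);
	rc = vm_map_find(map, obj, 0, &addr, size, 0, VMFS_ANY_SPACE,
	    VM_PROT_RW, VM_PROT_ALL, 0);
	if (rc != KERN_SUCCESS) {
		/* The map never took the reference; drop it ourselves. */
		vm_object_deallocate(obj);
		return (ENOMEM);
	}
	*addrp = addr;		/* vm_map_remove() will release the object */
	return (0);
}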
Example No. 10
void
efi_destroy_1t1_map(void)
{
	vm_page_t m;

	if (obj_1t1_pt != NULL) {
		VM_OBJECT_RLOCK(obj_1t1_pt);
		TAILQ_FOREACH(m, &obj_1t1_pt->memq, listq)
			m->wire_count = 0;
		vm_wire_sub(obj_1t1_pt->resident_page_count);
		VM_OBJECT_RUNLOCK(obj_1t1_pt);
		vm_object_deallocate(obj_1t1_pt);
	}

	obj_1t1_pt = NULL;
	efi_pml4 = NULL;
	efi_pml4_page = NULL;
}
Example No. 11
static int
spigen_close(struct cdev *cdev, int fflag, int devtype, struct thread *td)
{
    device_t dev = cdev->si_drv1;
    struct spigen_softc *sc = device_get_softc(dev);

    mtx_lock(&sc->sc_mtx);
    if (sc->sc_mmap_buffer != NULL) {
        pmap_qremove(sc->sc_mmap_kvaddr,
                     sc->sc_mmap_buffer_size / PAGE_SIZE);
        kva_free(sc->sc_mmap_kvaddr, sc->sc_mmap_buffer_size);
        sc->sc_mmap_kvaddr = 0;
        vm_object_deallocate(sc->sc_mmap_buffer);
        sc->sc_mmap_buffer = NULL;
        sc->sc_mmap_buffer_size = 0;
    }
    mtx_unlock(&sc->sc_mtx);
    return (0);
}
Example No. 12
struct drm_gem_object *
drm_gem_object_alloc(struct drm_device *dev, size_t size)
{
	struct drm_gem_object *obj;

	obj = malloc(sizeof(*obj), DRM_MEM_DRIVER, M_WAITOK | M_ZERO);
	if (drm_gem_object_init(dev, obj, size) != 0)
		goto free;

	if (dev->driver->gem_init_object != NULL &&
	    dev->driver->gem_init_object(obj) != 0)
		goto dealloc;
	return (obj);
dealloc:
	vm_object_deallocate(obj->vm_obj);
free:
	free(obj, DRM_MEM_DRIVER);
	return (NULL);
}
Example No. 13
void
efi_destroy_1t1_map(void)
{
	vm_page_t m;

	if (obj_1t1_pt != NULL) {
		VM_OBJECT_RLOCK(obj_1t1_pt);
		TAILQ_FOREACH(m, &obj_1t1_pt->memq, listq)
			m->wire_count = 0;
		atomic_subtract_int(&vm_cnt.v_wire_count,
		    obj_1t1_pt->resident_page_count);
		VM_OBJECT_RUNLOCK(obj_1t1_pt);
		vm_object_deallocate(obj_1t1_pt);
	}

	obj_1t1_pt = NULL;
	efi_l0 = NULL;
	efi_l0_page = NULL;
}
Example No. 14
static void
pscnv_gem_pager_dtor(void *handle)
{
	struct drm_gem_object *gem_obj = handle;
	struct pscnv_bo *bo = gem_obj->driver_private;
	struct drm_device *dev = gem_obj->dev;
	vm_object_t devobj;

	DRM_LOCK(dev);
	devobj = cdev_pager_lookup(handle);

	if (devobj != NULL) {
		vm_size_t page_count = OFF_TO_IDX(bo->size);
		vm_page_t m;
		int i;
		VM_OBJECT_LOCK(devobj);
		for (i = 0; i < page_count; i++) {
			m = vm_page_lookup(devobj, i);
			if (!m)
				continue;
			if (pscnv_mem_debug > 0)
				NV_WARN(dev, "Freeing %010llx + %08llx (%p)\n", bo->start, i * PAGE_SIZE, m);
			cdev_pager_free_page(devobj, m);
		}
		VM_OBJECT_UNLOCK(devobj);
		vm_object_deallocate(devobj);
	}
	else {
		DRM_UNLOCK(dev);
		NV_ERROR(dev, "Could not find handle %p bo %p\n", handle, bo);
		return;
	}
	if (pscnv_mem_debug > 0)
		NV_WARN(dev, "Freed %010llx (%p)\n", bo->start, bo);
	//kfree(bo->fake_pages);

	if (bo->chan)
		pscnv_chan_unref(bo->chan);
	else
		drm_gem_object_unreference_unlocked(gem_obj);
	DRM_UNLOCK(dev);
}
Example No. 15
void ttm_tt_destroy(struct ttm_tt *ttm)
{
	if (unlikely(ttm == NULL))
		return;

	if (ttm->state == tt_bound) {
		ttm_tt_unbind(ttm);
	}

	if (likely(ttm->pages != NULL)) {
		ttm->bdev->driver->ttm_tt_unpopulate(ttm);
	}

	if (!(ttm->page_flags & TTM_PAGE_FLAG_PERSISTENT_SWAP) &&
	    ttm->swap_storage)
		vm_object_deallocate(ttm->swap_storage);

	ttm->swap_storage = NULL;
	ttm->func->destroy(ttm);
}
Example No. 16
void
ttm_bo_release_mmap(struct ttm_buffer_object *bo)
{
	vm_object_t vm_obj;
	vm_page_t m;
	int i;

	vm_obj = cdev_pager_lookup(bo);
	if (vm_obj == NULL)
		return;

	VM_OBJECT_WLOCK(vm_obj);
	for (i = 0; i < bo->num_pages; i++) {
		m = vm_page_lookup_busy_wait(vm_obj, i, TRUE, "ttm_unm");
		if (m == NULL)
			continue;
		cdev_pager_free_page(vm_obj, m);
	}
	VM_OBJECT_WUNLOCK(vm_obj);

	vm_object_deallocate(vm_obj);
}
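ttm_bo_release_mmap() (like the pscnv destructor in Example No. 14) relies on cdev_pager_lookup() returning the device pager object with an extra reference taken on the caller's behalf, which is why every successful lookup ends in vm_object_deallocate(). In sketch form, with a hypothetical helper and handle:

static int
example_handle_has_pager(void *handle)
{
	vm_object_t obj;

	obj = cdev_pager_lookup(handle);
	if (obj == NULL)
		return (0);
	/* ... inspect or scrub the object's pages here ... */
	vm_object_deallocate(obj);	/* drop the lookup reference */
	return (1);
}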
Example No. 17
static void
unlock_and_deallocate(struct faultstate *fs)
{

	vm_object_pip_wakeup(fs->object);
	VM_OBJECT_WUNLOCK(fs->object);
	if (fs->object != fs->first_object) {
		VM_OBJECT_WLOCK(fs->first_object);
		vm_page_lock(fs->first_m);
		vm_page_free(fs->first_m);
		vm_page_unlock(fs->first_m);
		vm_object_pip_wakeup(fs->first_object);
		VM_OBJECT_WUNLOCK(fs->first_object);
		fs->first_m = NULL;
	}
	vm_object_deallocate(fs->first_object);
	unlock_map(fs);	
	if (fs->vp != NULL) { 
		vput(fs->vp);
		fs->vp = NULL;
	}
}
Example No. 18
static void
link_elf_unload_file(linker_file_t file)
{
    elf_file_t ef = file->priv;

    if (ef) {
#ifdef SPARSE_MAPPING
	if (ef->object) {
	    vm_map_remove(&kernel_map, (vm_offset_t) ef->address,
			  (vm_offset_t) ef->address
			  + (ef->object->size << PAGE_SHIFT));
	    vm_object_deallocate(ef->object);
	}
#else
	if (ef->address)
	    kfree(ef->address, M_LINKER);
#endif
	if (ef->symbase)
	    kfree(ef->symbase, M_LINKER);
	if (ef->strbase)
	    kfree(ef->strbase, M_LINKER);
	kfree(ef, M_LINKER);
    }
}
Example No. 19
vm_object_t
vmm_mem_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len)
{
	int error;
	vm_object_t obj;

	if (gpa & PAGE_MASK)
		panic("vmm_mem_alloc: invalid gpa %#lx", gpa);

	if (len == 0 || (len & PAGE_MASK) != 0)
		panic("vmm_mem_alloc: invalid allocation size %lu", len);

	obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
	if (obj != NULL) {
		error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0,
				    VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0);
		if (error != KERN_SUCCESS) {
			vm_object_deallocate(obj);
			obj = NULL;
		}
	}

	return (obj);
}
Example No. 20
/*
 *	Routine:
 *		vm_fault_copy_entry
 *	Function:
 *		Create new shadow object backing dst_entry with private copy of
 *		all underlying pages. When src_entry is equal to dst_entry,
 *		function implements COW for wired-down map entry. Otherwise,
 *		it forks wired entry into dst_map.
 *
 *	In/out conditions:
 *		The source and destination maps must be locked for write.
 *		The source map entry must be wired down (or be a sharing map
 *		entry corresponding to a main map entry that is wired down).
 */
void
vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
    vm_map_entry_t dst_entry, vm_map_entry_t src_entry,
    vm_ooffset_t *fork_charge)
{
	vm_object_t backing_object, dst_object, object, src_object;
	vm_pindex_t dst_pindex, pindex, src_pindex;
	vm_prot_t access, prot;
	vm_offset_t vaddr;
	vm_page_t dst_m;
	vm_page_t src_m;
	boolean_t src_readonly, upgrade;

#ifdef	lint
	src_map++;
#endif	/* lint */

	upgrade = src_entry == dst_entry;

	src_object = src_entry->object.vm_object;
	src_pindex = OFF_TO_IDX(src_entry->offset);
	src_readonly = (src_entry->protection & VM_PROT_WRITE) == 0;

	/*
	 * Create the top-level object for the destination entry. (Doesn't
	 * actually shadow anything - we copy the pages directly.)
	 */
	dst_object = vm_object_allocate(OBJT_DEFAULT,
	    OFF_TO_IDX(dst_entry->end - dst_entry->start));
#if VM_NRESERVLEVEL > 0
	dst_object->flags |= OBJ_COLORED;
	dst_object->pg_color = atop(dst_entry->start);
#endif

	VM_OBJECT_WLOCK(dst_object);
	KASSERT(upgrade || dst_entry->object.vm_object == NULL,
	    ("vm_fault_copy_entry: vm_object not NULL"));
	dst_entry->object.vm_object = dst_object;
	dst_entry->offset = 0;
	dst_object->charge = dst_entry->end - dst_entry->start;
	if (fork_charge != NULL) {
		KASSERT(dst_entry->cred == NULL,
		    ("vm_fault_copy_entry: leaked swp charge"));
		dst_object->cred = curthread->td_ucred;
		crhold(dst_object->cred);
		*fork_charge += dst_object->charge;
	} else {
		dst_object->cred = dst_entry->cred;
		dst_entry->cred = NULL;
	}
	access = prot = dst_entry->protection;
	/*
	 * If not an upgrade, then enter the mappings in the pmap as
	 * read and/or execute accesses.  Otherwise, enter them as
	 * write accesses.
	 *
	 * A writeable large page mapping is only created if all of
	 * the constituent small page mappings are modified. Marking
	 * PTEs as modified on inception allows promotion to happen
	 * without taking potentially large number of soft faults.
	 */
	if (!upgrade)
		access &= ~VM_PROT_WRITE;

	/*
	 * Loop through all of the virtual pages within the entry's
	 * range, copying each page from the source object to the
	 * destination object.  Since the source is wired, those pages
	 * must exist.  In contrast, the destination is pageable.
	 * Since the destination object does not share any backing storage
	 * with the source object, all of its pages must be dirtied,
	 * regardless of whether they can be written.
	 */
	for (vaddr = dst_entry->start, dst_pindex = 0;
	    vaddr < dst_entry->end;
	    vaddr += PAGE_SIZE, dst_pindex++) {

		/*
		 * Allocate a page in the destination object.
		 */
		do {
			dst_m = vm_page_alloc(dst_object, dst_pindex,
			    VM_ALLOC_NORMAL);
			if (dst_m == NULL) {
				VM_OBJECT_WUNLOCK(dst_object);
				VM_WAIT;
				VM_OBJECT_WLOCK(dst_object);
			}
		} while (dst_m == NULL);

		/*
		 * Find the page in the source object, and copy it in.
		 * (Because the source is wired down, the page will be in
		 * memory.)
		 */
		VM_OBJECT_WLOCK(src_object);
		object = src_object;
		pindex = src_pindex + dst_pindex;
		while ((src_m = vm_page_lookup(object, pindex)) == NULL &&
		    src_readonly &&
		    (backing_object = object->backing_object) != NULL) {
			/*
			 * Allow fallback to backing objects if we are reading.
			 */
			VM_OBJECT_WLOCK(backing_object);
			pindex += OFF_TO_IDX(object->backing_object_offset);
			VM_OBJECT_WUNLOCK(object);
			object = backing_object;
		}
		if (src_m == NULL)
			panic("vm_fault_copy_wired: page missing");
		pmap_copy_page(src_m, dst_m);
		VM_OBJECT_WUNLOCK(object);
		dst_m->valid = VM_PAGE_BITS_ALL;
		dst_m->dirty = VM_PAGE_BITS_ALL;
		VM_OBJECT_WUNLOCK(dst_object);

		/*
		 * Enter it in the pmap. If a wired, copy-on-write
		 * mapping is being replaced by a write-enabled
		 * mapping, then wire that new mapping.
		 */
		pmap_enter(dst_map->pmap, vaddr, access, dst_m, prot, upgrade);

		/*
		 * Mark it no longer busy, and put it on the active list.
		 */
		VM_OBJECT_WLOCK(dst_object);
		
		if (upgrade) {
			vm_page_lock(src_m);
			vm_page_unwire(src_m, 0);
			vm_page_unlock(src_m);

			vm_page_lock(dst_m);
			vm_page_wire(dst_m);
			vm_page_unlock(dst_m);
		} else {
			vm_page_lock(dst_m);
			vm_page_activate(dst_m);
			vm_page_unlock(dst_m);
		}
		vm_page_wakeup(dst_m);
	}
	VM_OBJECT_WUNLOCK(dst_object);
	if (upgrade) {
		dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY);
		vm_object_deallocate(src_object);
	}
}
Example No. 21
static void
os_balloonobject_delete(void)
{
   vm_object_deallocate(global_state.vmobject);
}
Example No. 22
static int
link_elf_obj_load_file(const char *filename, linker_file_t * result)
{
	struct nlookupdata nd;
	struct thread  *td = curthread;	/* XXX */
	struct proc    *p = td->td_proc;
	char           *pathname;
	struct vnode   *vp;
	Elf_Ehdr       *hdr;
	Elf_Shdr       *shdr;
	Elf_Sym        *es;
	int		nbytes, i, j;
	vm_offset_t	mapbase;
	size_t		mapsize;
	int		error = 0;
	int		resid;
	elf_file_t	ef;
	linker_file_t	lf;
	int		symtabindex;
	int		symstrindex;
	int		shstrindex;
	int		nsym;
	int		pb, rl, ra;
	int		alignmask;

	/* XXX Hack for firmware loading where p == NULL */
	if (p == NULL) {
		p = &proc0;
	}

	KKASSERT(p != NULL);
	if (p->p_ucred == NULL) {
		kprintf("link_elf_obj_load_file: cannot load '%s' from filesystem"
			" this early\n", filename);
		return ENOENT;
	}
	shdr = NULL;
	lf = NULL;
	mapsize = 0;
	hdr = NULL;
	pathname = linker_search_path(filename);
	if (pathname == NULL)
		return ENOENT;

	error = nlookup_init(&nd, pathname, UIO_SYSSPACE, NLC_FOLLOW | NLC_LOCKVP);
	if (error == 0)
		error = vn_open(&nd, NULL, FREAD, 0);
	kfree(pathname, M_LINKER);
	if (error) {
		nlookup_done(&nd);
		return error;
	}
	vp = nd.nl_open_vp;
	nd.nl_open_vp = NULL;
	nlookup_done(&nd);

	/*
	 * Read the elf header from the file.
	 */
	hdr = kmalloc(sizeof(*hdr), M_LINKER, M_WAITOK);
	error = vn_rdwr(UIO_READ, vp, (void *)hdr, sizeof(*hdr), 0,
			UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
	if (error)
		goto out;
	if (resid != 0) {
		error = ENOEXEC;
		goto out;
	}
	if (!IS_ELF(*hdr)) {
		error = ENOEXEC;
		goto out;
	}

	if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS
	    || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
		link_elf_obj_error(filename, "Unsupported file layout");
		error = ENOEXEC;
		goto out;
	}
	if (hdr->e_ident[EI_VERSION] != EV_CURRENT
	    || hdr->e_version != EV_CURRENT) {
		link_elf_obj_error(filename, "Unsupported file version");
		error = ENOEXEC;
		goto out;
	}
	if (hdr->e_type != ET_REL) {
		error = ENOSYS;
		goto out;
	}
	if (hdr->e_machine != ELF_TARG_MACH) {
		link_elf_obj_error(filename, "Unsupported machine");
		error = ENOEXEC;
		goto out;
	}

	ef = kmalloc(sizeof(struct elf_file), M_LINKER, M_WAITOK | M_ZERO);
	lf = linker_make_file(filename, ef, &link_elf_obj_file_ops);
	if (lf == NULL) {
		kfree(ef, M_LINKER);
		error = ENOMEM;
		goto out;
	}
	ef->nprogtab = 0;
	ef->e_shdr = NULL;
	ef->nreltab = 0;
	ef->nrelatab = 0;

	/* Allocate and read in the section header */
	nbytes = hdr->e_shnum * hdr->e_shentsize;
	if (nbytes == 0 || hdr->e_shoff == 0 ||
	    hdr->e_shentsize != sizeof(Elf_Shdr)) {
		error = ENOEXEC;
		goto out;
	}
	shdr = kmalloc(nbytes, M_LINKER, M_WAITOK);
	ef->e_shdr = shdr;
	error = vn_rdwr(UIO_READ, vp, (caddr_t) shdr, nbytes, hdr->e_shoff,
			UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
	if (error)
		goto out;
	if (resid) {
		error = ENOEXEC;
		goto out;
	}
	/* Scan the section header for information and table sizing. */
	nsym = 0;
	symtabindex = -1;
	symstrindex = -1;
	for (i = 0; i < hdr->e_shnum; i++) {
		if (shdr[i].sh_size == 0)
			continue;
		switch (shdr[i].sh_type) {
		case SHT_PROGBITS:
		case SHT_NOBITS:
			ef->nprogtab++;
			break;
		case SHT_SYMTAB:
			nsym++;
			symtabindex = i;
			symstrindex = shdr[i].sh_link;
			break;
		case SHT_REL:
			ef->nreltab++;
			break;
		case SHT_RELA:
			ef->nrelatab++;
			break;
		case SHT_STRTAB:
			break;
		}
	}
	if (ef->nprogtab == 0) {
		link_elf_obj_error(filename, "file has no contents");
		error = ENOEXEC;
		goto out;
	}
	if (nsym != 1) {
		/* Only allow one symbol table for now */
		link_elf_obj_error(filename, "file has no valid symbol table");
		error = ENOEXEC;
		goto out;
	}
	if (symstrindex < 0 || symstrindex > hdr->e_shnum ||
	    shdr[symstrindex].sh_type != SHT_STRTAB) {
		link_elf_obj_error(filename, "file has invalid symbol strings");
		error = ENOEXEC;
		goto out;
	}
	/* Allocate space for tracking the load chunks */
	if (ef->nprogtab != 0)
		ef->progtab = kmalloc(ef->nprogtab * sizeof(*ef->progtab),
				      M_LINKER, M_WAITOK | M_ZERO);
	if (ef->nreltab != 0)
		ef->reltab = kmalloc(ef->nreltab * sizeof(*ef->reltab),
				     M_LINKER, M_WAITOK | M_ZERO);
	if (ef->nrelatab != 0)
		ef->relatab = kmalloc(ef->nrelatab * sizeof(*ef->relatab),
				      M_LINKER, M_WAITOK | M_ZERO);
	if ((ef->nprogtab != 0 && ef->progtab == NULL) ||
	    (ef->nreltab != 0 && ef->reltab == NULL) ||
	    (ef->nrelatab != 0 && ef->relatab == NULL)) {
		error = ENOMEM;
		goto out;
	}
	if (symtabindex == -1)
		panic("lost symbol table index");
	/* Allocate space for and load the symbol table */
	ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym);
	ef->ddbsymtab = kmalloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK);
	error = vn_rdwr(UIO_READ, vp, (void *)ef->ddbsymtab,
			shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset,
			UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
	if (error)
		goto out;
	if (resid != 0) {
		error = EINVAL;
		goto out;
	}
	if (symstrindex == -1)
		panic("lost symbol string index");
	/* Allocate space for and load the symbol strings */
	ef->ddbstrcnt = shdr[symstrindex].sh_size;
	ef->ddbstrtab = kmalloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK);
	error = vn_rdwr(UIO_READ, vp, ef->ddbstrtab,
			shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset,
			UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
	if (error)
		goto out;
	if (resid != 0) {
		error = EINVAL;
		goto out;
	}
	/* Do we have a string table for the section names?  */
	shstrindex = -1;
	if (hdr->e_shstrndx != 0 &&
	    shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) {
		shstrindex = hdr->e_shstrndx;
		ef->shstrcnt = shdr[shstrindex].sh_size;
		ef->shstrtab = kmalloc(shdr[shstrindex].sh_size, M_LINKER,
				       M_WAITOK);
		error = vn_rdwr(UIO_READ, vp, ef->shstrtab,
				shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset,
				UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
		if (error)
			goto out;
		if (resid != 0) {
			error = EINVAL;
			goto out;
		}
	}
	/* Size up code/data(progbits) and bss(nobits). */
	alignmask = 0;
	for (i = 0; i < hdr->e_shnum; i++) {
		if (shdr[i].sh_size == 0)
			continue;
		switch (shdr[i].sh_type) {
		case SHT_PROGBITS:
		case SHT_NOBITS:
			alignmask = shdr[i].sh_addralign - 1;
			mapsize += alignmask;
			mapsize &= ~alignmask;
			mapsize += shdr[i].sh_size;
			break;
		}
	}

	/*
	 * We know how much space we need for the text/data/bss/etc. This
	 * stuff needs to be in a single chunk so that profiling etc can get
	 * the bounds and gdb can associate offsets with modules
	 */
	ef->object = vm_object_allocate(OBJT_DEFAULT,
					round_page(mapsize) >> PAGE_SHIFT);
	if (ef->object == NULL) {
		error = ENOMEM;
		goto out;
	}
	vm_object_hold(ef->object);
	vm_object_reference_locked(ef->object);
	ef->address = (caddr_t) vm_map_min(&kernel_map);
	ef->bytes = 0;

	/*
	 * In order to satisfy x86_64's architectural requirements on the
	 * location of code and data in the kernel's address space, request a
	 * mapping that is above the kernel.
	 *
	 * vkernel64's text+data is outside the managed VM space entirely.
	 */
#if defined(__x86_64__) && defined(_KERNEL_VIRTUAL)
	error = vkernel_module_memory_alloc(&mapbase, round_page(mapsize));
	vm_object_drop(ef->object);
#else
	mapbase = KERNBASE;
	error = vm_map_find(&kernel_map, ef->object, NULL,
			    0, &mapbase, round_page(mapsize),
			    PAGE_SIZE,
			    TRUE, VM_MAPTYPE_NORMAL,
			    VM_PROT_ALL, VM_PROT_ALL, FALSE);
	vm_object_drop(ef->object);
	if (error) {
		vm_object_deallocate(ef->object);
		ef->object = NULL;
		goto out;
	}
	/* Wire the pages */
	error = vm_map_wire(&kernel_map, mapbase,
			    mapbase + round_page(mapsize), 0);
#endif
	if (error != KERN_SUCCESS) {
		error = ENOMEM;
		goto out;
	}
	/* Inform the kld system about the situation */
	lf->address = ef->address = (caddr_t) mapbase;
	lf->size = round_page(mapsize);
	ef->bytes = mapsize;

	/*
	 * Now load code/data(progbits), zero bss(nobits), allocate space for
	 * and load relocs
	 */
	pb = 0;
	rl = 0;
	ra = 0;
	alignmask = 0;
	for (i = 0; i < hdr->e_shnum; i++) {
		if (shdr[i].sh_size == 0)
			continue;
		switch (shdr[i].sh_type) {
		case SHT_PROGBITS:
		case SHT_NOBITS:
			alignmask = shdr[i].sh_addralign - 1;
			mapbase += alignmask;
			mapbase &= ~alignmask;
			if (ef->shstrtab && shdr[i].sh_name != 0)
				ef->progtab[pb].name =
					ef->shstrtab + shdr[i].sh_name;
			else if (shdr[i].sh_type == SHT_PROGBITS)
				ef->progtab[pb].name = "<<PROGBITS>>";
			else
				ef->progtab[pb].name = "<<NOBITS>>";
#if 0
			if (ef->progtab[pb].name != NULL &&
			    !strcmp(ef->progtab[pb].name, "set_pcpu"))
				ef->progtab[pb].addr =
					dpcpu_alloc(shdr[i].sh_size);
#ifdef VIMAGE
			else if (ef->progtab[pb].name != NULL &&
				 !strcmp(ef->progtab[pb].name, VNET_SETNAME))
				ef->progtab[pb].addr =
					vnet_data_alloc(shdr[i].sh_size);
#endif
			else
#endif
				ef->progtab[pb].addr =
					(void *)(uintptr_t) mapbase;
			if (ef->progtab[pb].addr == NULL) {
				error = ENOSPC;
				goto out;
			}
			ef->progtab[pb].size = shdr[i].sh_size;
			ef->progtab[pb].sec = i;
			if (shdr[i].sh_type == SHT_PROGBITS) {
				error = vn_rdwr(UIO_READ, vp,
						ef->progtab[pb].addr,
						shdr[i].sh_size, shdr[i].sh_offset,
						UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred,
						&resid);
				if (error)
					goto out;
				if (resid != 0) {
					error = EINVAL;
					goto out;
				}
#if 0
				/* Initialize the per-cpu or vnet area. */
				if (ef->progtab[pb].addr != (void *)mapbase &&
				    !strcmp(ef->progtab[pb].name, "set_pcpu"))
					dpcpu_copy(ef->progtab[pb].addr,
						   shdr[i].sh_size);
#ifdef VIMAGE
				else if (ef->progtab[pb].addr !=
					 (void *)mapbase &&
					 !strcmp(ef->progtab[pb].name, VNET_SETNAME))
					vnet_data_copy(ef->progtab[pb].addr,
						       shdr[i].sh_size);
#endif
#endif
			} else
				bzero(ef->progtab[pb].addr, shdr[i].sh_size);

			/* Update all symbol values with the offset. */
			for (j = 0; j < ef->ddbsymcnt; j++) {
				es = &ef->ddbsymtab[j];
				if (es->st_shndx != i)
					continue;
				es->st_value += (Elf_Addr) ef->progtab[pb].addr;
			}
			mapbase += shdr[i].sh_size;
			pb++;
			break;
		case SHT_REL:
			ef->reltab[rl].rel = kmalloc(shdr[i].sh_size, M_LINKER, M_WAITOK);
			ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel);
			ef->reltab[rl].sec = shdr[i].sh_info;
			error = vn_rdwr(UIO_READ, vp,
					(void *)ef->reltab[rl].rel,
					shdr[i].sh_size, shdr[i].sh_offset,
					UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
			if (error)
				goto out;
			if (resid != 0) {
				error = EINVAL;
				goto out;
			}
			rl++;
			break;
		case SHT_RELA:
			ef->relatab[ra].rela = kmalloc(shdr[i].sh_size, M_LINKER, M_WAITOK);
			ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela);
			ef->relatab[ra].sec = shdr[i].sh_info;
			error = vn_rdwr(UIO_READ, vp,
					(void *)ef->relatab[ra].rela,
					shdr[i].sh_size, shdr[i].sh_offset,
					UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
			if (error)
				goto out;
			if (resid != 0) {
				error = EINVAL;
				goto out;
			}
			ra++;
			break;
		}
	}
	if (pb != ef->nprogtab)
		panic("lost progbits");
	if (rl != ef->nreltab)
		panic("lost reltab");
	if (ra != ef->nrelatab)
		panic("lost relatab");
	if (mapbase != (vm_offset_t) ef->address + mapsize)
		panic("mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)",
		      mapbase, ef->address, mapsize,
		      (vm_offset_t) ef->address + mapsize);

	/* Local intra-module relocations */
	link_elf_obj_reloc_local(lf);

	/* Pull in dependencies */
	error = linker_load_dependencies(lf);
	if (error)
		goto out;

	/* External relocations */
	error = relocate_file(lf);
	if (error)
		goto out;

	*result = lf;

out:
	if (error && lf)
		linker_file_unload(lf /*, LINKER_UNLOAD_FORCE */);
	if (hdr)
		kfree(hdr, M_LINKER);
	vn_unlock(vp);
	vn_close(vp, FREAD, NULL);

	return error;
}
Example No. 23
static void
link_elf_obj_unload_file(linker_file_t file)
{
	elf_file_t	ef = file->priv;
	int i;

	if (ef->progtab) {
		for (i = 0; i < ef->nprogtab; i++) {
			if (ef->progtab[i].size == 0)
				continue;
			if (ef->progtab[i].name == NULL)
				continue;
#if 0
			if (!strcmp(ef->progtab[i].name, "set_pcpu"))
				dpcpu_free(ef->progtab[i].addr,
				    ef->progtab[i].size);
#ifdef VIMAGE
			else if (!strcmp(ef->progtab[i].name, VNET_SETNAME))
				vnet_data_free(ef->progtab[i].addr,
				    ef->progtab[i].size);
#endif
#endif
		}
	}
	if (ef->preloaded) {
		if (ef->reltab)
			kfree(ef->reltab, M_LINKER);
		if (ef->relatab)
			kfree(ef->relatab, M_LINKER);
		if (ef->progtab)
			kfree(ef->progtab, M_LINKER);
		if (ef->ctftab)
			kfree(ef->ctftab, M_LINKER);
		if (ef->ctfoff)
			kfree(ef->ctfoff, M_LINKER);
		if (ef->typoff)
			kfree(ef->typoff, M_LINKER);
		if (file->filename != NULL)
			preload_delete_name(file->filename);
		kfree(ef, M_LINKER);
		/* XXX reclaim module memory? */
		return;
	}

	for (i = 0; i < ef->nreltab; i++)
		if (ef->reltab[i].rel)
			kfree(ef->reltab[i].rel, M_LINKER);
	for (i = 0; i < ef->nrelatab; i++)
		if (ef->relatab[i].rela)
			kfree(ef->relatab[i].rela, M_LINKER);
	if (ef->reltab)
		kfree(ef->reltab, M_LINKER);
	if (ef->relatab)
		kfree(ef->relatab, M_LINKER);
	if (ef->progtab)
		kfree(ef->progtab, M_LINKER);

	if (ef->object) {
#if defined(__x86_64__) && defined(_KERNEL_VIRTUAL)
		vkernel_module_memory_free((vm_offset_t)ef->address, ef->bytes);
#else
		vm_map_remove(&kernel_map, (vm_offset_t) ef->address,
		    (vm_offset_t) ef->address +
		    (ef->object->size << PAGE_SHIFT));
#endif
		vm_object_deallocate(ef->object);
		ef->object = NULL;
	}
	if (ef->e_shdr)
		kfree(ef->e_shdr, M_LINKER);
	if (ef->ddbsymtab)
		kfree(ef->ddbsymtab, M_LINKER);
	if (ef->ddbstrtab)
		kfree(ef->ddbstrtab, M_LINKER);
	if (ef->shstrtab)
		kfree(ef->shstrtab, M_LINKER);
	if (ef->ctftab)
		kfree(ef->ctftab, M_LINKER);
	if (ef->ctfoff)
		kfree(ef->ctfoff, M_LINKER);
	if (ef->typoff)
		kfree(ef->typoff, M_LINKER);
	kfree(ef, M_LINKER);
}
Example No. 24
vm_object_t
cdev_pager_allocate(void *handle, enum obj_type tp, struct cdev_pager_ops *ops,
    vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred)
{
	vm_object_t object, object1;
	vm_pindex_t pindex;
	u_short color;

	if (tp != OBJT_DEVICE && tp != OBJT_MGTDEVICE)
		return (NULL);

	/*
	 * Offset should be page aligned.
	 */
	if (foff & PAGE_MASK)
		return (NULL);

	size = round_page(size);
	pindex = OFF_TO_IDX(foff + size);

	if (ops->cdev_pg_ctor(handle, size, prot, foff, cred, &color) != 0)
		return (NULL);
	mtx_lock(&dev_pager_mtx);

	/*
	 * Look up pager, creating as necessary.
	 */
	object1 = NULL;
	object = vm_pager_object_lookup(&dev_pager_object_list, handle);
	if (object == NULL) {
		/*
		 * Allocate object and associate it with the pager.  Initialize
		 * the object's pg_color based upon the physical address of the
		 * device's memory.
		 */
		mtx_unlock(&dev_pager_mtx);
		object1 = vm_object_allocate(tp, pindex);
		object1->flags |= OBJ_COLORED;
		object1->pg_color = color;
		object1->handle = handle;
		object1->un_pager.devp.ops = ops;
		object1->un_pager.devp.dev = handle;
		TAILQ_INIT(&object1->un_pager.devp.devp_pglist);
		mtx_lock(&dev_pager_mtx);
		object = vm_pager_object_lookup(&dev_pager_object_list, handle);
		if (object != NULL) {
			/*
			 * We raced with other thread while allocating object.
			 */
			if (pindex > object->size)
				object->size = pindex;
		} else {
			object = object1;
			object1 = NULL;
			object->handle = handle;
			TAILQ_INSERT_TAIL(&dev_pager_object_list, object,
			    pager_object_list);
			KASSERT(object->type == tp,
		("Inconsistent device pager type %p %d", object, tp));
		}
	} else {
		if (pindex > object->size)
			object->size = pindex;
	}
	mtx_unlock(&dev_pager_mtx);
	if (object1 != NULL) {
		object1->handle = object1;
		mtx_lock(&dev_pager_mtx);
		TAILQ_INSERT_TAIL(&dev_pager_object_list, object1,
		    pager_object_list);
		mtx_unlock(&dev_pager_mtx);
		vm_object_deallocate(object1);
	}
	return (object);
}
Example No. 25
int
acpi_sleep_machdep(struct acpi_softc *sc, int state)
{
	ACPI_STATUS		status;
	vm_offset_t		oldphys;
	struct pmap		*pm;
	vm_page_t		page;
	static vm_page_t	opage = NULL;
	int			ret = 0;
	int			pteobj_allocated = 0;
	u_long			ef;
	struct proc		*p;

	if (sc->acpi_wakeaddr == 0) {
		return (0);
	}

	AcpiSetFirmwareWakingVector(sc->acpi_wakephys);

	ef = read_eflags();
	disable_intr();

	/* Create Identity Mapping */
	if ((p = curproc) == NULL)
		p = &proc0;
	pm = vmspace_pmap(p->p_vmspace);
	if (pm->pm_pteobj == NULL) {
		pm->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, PTDPTDI + 1);
		pteobj_allocated = 1;
	}

	oldphys = pmap_extract(pm, sc->acpi_wakephys);
	if (oldphys) {
		opage = PHYS_TO_VM_PAGE(oldphys);
	}
	page = PHYS_TO_VM_PAGE(sc->acpi_wakephys);
	pmap_enter(pm, sc->acpi_wakephys, page,
		   VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE, 1);

	ret_addr = 0;
	if (acpi_savecpu()) {
		/* Execute Sleep */
		p_gdt = (struct region_descriptor *)(sc->acpi_wakeaddr + physical_gdt);
		p_gdt->rd_limit = r_gdt.rd_limit;
		p_gdt->rd_base = vtophys(r_gdt.rd_base);

		WAKECODE_FIXUP(physical_esp, u_int32_t, vtophys(r_esp));
		WAKECODE_FIXUP(previous_cr0, u_int32_t, r_cr0);
		WAKECODE_FIXUP(previous_cr2, u_int32_t, r_cr2);
		WAKECODE_FIXUP(previous_cr3, u_int32_t, r_cr3);
		WAKECODE_FIXUP(previous_cr4, u_int32_t, r_cr4);

		WAKECODE_FIXUP(previous_tr,  u_int16_t, r_tr);
		WAKECODE_BCOPY(previous_gdt, struct region_descriptor, r_gdt);
		WAKECODE_FIXUP(previous_ldt, u_int16_t, r_ldt);
		WAKECODE_BCOPY(previous_idt, struct region_descriptor, r_idt);

		WAKECODE_FIXUP(where_to_recover, void, acpi_restorecpu);

		WAKECODE_FIXUP(previous_ds,  u_int16_t, r_ds);
		WAKECODE_FIXUP(previous_es,  u_int16_t, r_es);
		WAKECODE_FIXUP(previous_fs,  u_int16_t, r_fs);
		WAKECODE_FIXUP(previous_gs,  u_int16_t, r_gs);
		WAKECODE_FIXUP(previous_ss,  u_int16_t, r_ss);

		if (acpi_get_verbose(sc)) {
			acpi_printcpu();
		}

		wbinvd(); 

		if (state == ACPI_STATE_S4 && sc->acpi_s4bios) {
			status = AcpiEnterSleepStateS4Bios();
		} else {
			status = AcpiEnterSleepState(state);
		}

		if (status != AE_OK) {
			device_printf(sc->acpi_dev,
				"AcpiEnterSleepState failed - %s\n",
				AcpiFormatException(status));
			ret = -1;
			goto out;
		}

		for (;;) ;
	} else {
		/* Execute Wakeup */
#if 0
		initializecpu();
#endif
		icu_reinit();

		if (acpi_get_verbose(sc)) {
			acpi_savecpu();
			acpi_printcpu();
		}
	}

out:
	vm_page_lock_queues();
	pmap_remove(pm, sc->acpi_wakephys, sc->acpi_wakephys + PAGE_SIZE);
	vm_page_unlock_queues();
	if (opage) {
		pmap_enter(pm, sc->acpi_wakephys, page,
			   VM_PROT_READ | VM_PROT_WRITE, 0);
	}

	if (pteobj_allocated) {
		vm_object_deallocate(pm->pm_pteobj);
		pm->pm_pteobj = NULL;
	}

	write_eflags(ef);

	return (ret);
}
Example No. 26
/*
 * Internal version of mmap.
 * Currently used by mmap, exec, and sys5 shared memory.
 * Handle is either a vnode pointer or NULL for MAP_ANON.
 * 
 * No requirements
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object;
	vm_offset_t eaddr;
	vm_size_t   esize;
	vm_size_t   align;
	int (*uksmap)(cdev_t dev, vm_page_t fake);
	struct vnode *vp;
	struct thread *td = curthread;
	struct proc *p;
	int rv = KERN_SUCCESS;
	off_t objsize;
	int docow;
	int error;

	if (size == 0)
		return (0);

	objsize = round_page(size);
	if (objsize < size)
		return (EINVAL);
	size = objsize;

	lwkt_gettoken(&map->token);
	
	/*
	 * XXX messy code, fixme
	 *
	 * NOTE: Overflow checks require discrete statements or GCC4
	 * will optimize it out.
	 */
	if ((p = curproc) != NULL && map == &p->p_vmspace->vm_map) {
		esize = map->size + size;	/* workaround gcc4 opt */
		if (esize < map->size ||
		    esize > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
			lwkt_reltoken(&map->token);
			return(ENOMEM);
		}
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmaping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 *
	 * NOTE: Overflow checks require discrete statements or GCC4
	 * will optimize it out.
	 */
	if (foff & PAGE_MASK) {
		lwkt_reltoken(&map->token);
		return (EINVAL);
	}

	/*
	 * Handle alignment.  For large memory maps it is possible
	 * that the MMU can optimize the page table so align anything
	 * that is a multiple of SEG_SIZE to SEG_SIZE.
	 *
	 * Also align any large mapping (bigger than 16x SEG_SIZE) to a
	 * SEG_SIZE address boundary.
	 */
	if (flags & MAP_SIZEALIGN) {
		align = size;
		if ((align ^ (align - 1)) != (align << 1) - 1) {
			lwkt_reltoken(&map->token);
			return (EINVAL);
		}
	} else if ((flags & MAP_FIXED) == 0 &&
		   ((size & SEG_MASK) == 0 || size > SEG_SIZE * 16)) {
		align = SEG_SIZE;
	} else {
		align = PAGE_SIZE;
	}

	if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr)) {
			lwkt_reltoken(&map->token);
			return (EINVAL);
		}
		eaddr = *addr + size;
		if (eaddr < *addr) {
			lwkt_reltoken(&map->token);
			return (EINVAL);
		}
		fitit = FALSE;
		if ((flags & MAP_TRYFIXED) == 0)
			vm_map_remove(map, *addr, *addr + size);
	}

	uksmap = NULL;

	/*
	 * Lookup/allocate object.
	 */
	if (flags & MAP_ANON) {
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle) {
			/*
			 * Default memory object
			 */
			object = default_pager_alloc(handle, objsize,
						     prot, foff);
			if (object == NULL) {
				lwkt_reltoken(&map->token);
				return(ENOMEM);
			}
			docow = MAP_PREFAULT_PARTIAL;
		} else {
			/*
			 * Implicit single instance of a default memory
			 * object, so we don't need a VM object yet.
			 */
			foff = 0;
			object = NULL;
			docow = 0;
		}
		vp = NULL;
	} else {
		vp = (struct vnode *)handle;

		/*
		 * Non-anonymous mappings of VCHR (aka not /dev/zero)
		 * cannot specify MAP_STACK or MAP_VPAGETABLE.
		 */
		if (vp->v_type == VCHR) {
			if (flags & (MAP_STACK | MAP_VPAGETABLE)) {
				lwkt_reltoken(&map->token);
				return(EINVAL);
			}
		}

		if (vp->v_type == VCHR && vp->v_rdev->si_ops->d_uksmap) {
			/*
			 * Device mappings without a VM object, typically
			 * sharing permanently allocated kernel memory or
			 * process-context-specific (per-process) data.
			 *
			 * Force them to be shared.
			 */
			uksmap = vp->v_rdev->si_ops->d_uksmap;
			object = NULL;
			docow = MAP_PREFAULT_PARTIAL;
			flags &= ~(MAP_PRIVATE|MAP_COPY);
			flags |= MAP_SHARED;
		} else if (vp->v_type == VCHR) {
			/*
			 * Device mappings (device size unknown?).
			 * Force them to be shared.
			 */
			error = dev_dmmap_single(vp->v_rdev, &foff, objsize,
						&object, prot, NULL);

			if (error == ENODEV) {
				handle = (void *)(intptr_t)vp->v_rdev;
				object = dev_pager_alloc(handle, objsize, prot, foff);
				if (object == NULL) {
					lwkt_reltoken(&map->token);
					return(EINVAL);
				}
			} else if (error) {
				lwkt_reltoken(&map->token);
				return(error);
			}

			docow = MAP_PREFAULT_PARTIAL;
			flags &= ~(MAP_PRIVATE|MAP_COPY);
			flags |= MAP_SHARED;
		} else {
			/*
			 * Regular file mapping (typically).  The attribute
			 * check is for the link count test only.  mmapable
			 * vnodes must already have a VM object assigned.
			 */
			struct vattr vat;
			int error;

			error = VOP_GETATTR(vp, &vat);
			if (error) {
				lwkt_reltoken(&map->token);
				return (error);
			}
			docow = MAP_PREFAULT_PARTIAL;
			object = vnode_pager_reference(vp);
			if (object == NULL && vp->v_type == VREG) {
				lwkt_reltoken(&map->token);
				kprintf("Warning: cannot mmap vnode %p, no "
					"object\n", vp);
				return(EINVAL);
			}

			/*
			 * If it is a regular file without any references
			 * we do not need to sync it.
			 */
			if (vp->v_type == VREG && vat.va_nlink == 0) {
				flags |= MAP_NOSYNC;
			}
		}
	}

	/*
	 * Deal with the adjusted flags
	 */
	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;

	if (maxprot & VM_PROT_READ)
		maxprot |= VM_PROT_EXECUTE;
#endif

	/*
	 * This may place the area in its own page directory if (size) is
	 * large enough, otherwise it typically returns its argument.
	 *
	 * (object can be NULL)
	 */
	if (fitit) {
		*addr = pmap_addr_hint(object, *addr, size);
	}

	/*
	 * Stack mappings need special attention.
	 *
	 * Mappings that use virtual page tables will default to storing
	 * the page table at offset 0.
	 */
	if (uksmap) {
		rv = vm_map_find(map, uksmap, vp->v_rdev,
				 foff, addr, size,
				 align,
				 fitit, VM_MAPTYPE_UKSMAP,
				 prot, maxprot, docow);
	} else if (flags & MAP_STACK) {
		rv = vm_map_stack(map, *addr, size, flags,
				  prot, maxprot, docow);
	} else if (flags & MAP_VPAGETABLE) {
		rv = vm_map_find(map, object, NULL,
				 foff, addr, size,
				 align,
				 fitit, VM_MAPTYPE_VPAGETABLE,
				 prot, maxprot, docow);
	} else {
		rv = vm_map_find(map, object, NULL,
				 foff, addr, size,
				 align,
				 fitit, VM_MAPTYPE_NORMAL,
				 prot, maxprot, docow);
	}

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference. Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 *
		 * (NOTE: object can be NULL)
		 */
		vm_object_deallocate(object);
		goto out;
	}

	/*
	 * Shared memory is also shared with children.
	 */
	if (flags & (MAP_SHARED|MAP_INHERIT)) {
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS) {
			vm_map_remove(map, *addr, *addr + size);
			goto out;
		}
	}

	/* If a process has marked all future mappings for wiring, do so */
	if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
		vm_map_unwire(map, *addr, *addr + size, FALSE);

	/*
	 * Set the access time on the vnode
	 */
	if (vp != NULL)
		vn_mark_atime(vp, td);
out:
	lwkt_reltoken(&map->token);
	
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}
Example No. 27
int
vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
    int fault_flags, vm_page_t *m_hold)
{
	vm_prot_t prot;
	long ahead, behind;
	int alloc_req, era, faultcount, nera, reqpage, result;
	boolean_t growstack, is_first_object_locked, wired;
	int map_generation;
	vm_object_t next_object;
	vm_page_t marray[VM_FAULT_READ_MAX];
	int hardfault;
	struct faultstate fs;
	struct vnode *vp;
	int locked, error;

	hardfault = 0;
	growstack = TRUE;
	PCPU_INC(cnt.v_vm_faults);
	fs.vp = NULL;
	faultcount = reqpage = 0;

RetryFault:;

	/*
	 * Find the backing store object and offset into it to begin the
	 * search.
	 */
	fs.map = map;
	result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry,
	    &fs.first_object, &fs.first_pindex, &prot, &wired);
	if (result != KERN_SUCCESS) {
		if (growstack && result == KERN_INVALID_ADDRESS &&
		    map != kernel_map) {
			result = vm_map_growstack(curproc, vaddr);
			if (result != KERN_SUCCESS)
				return (KERN_FAILURE);
			growstack = FALSE;
			goto RetryFault;
		}
		return (result);
	}

	map_generation = fs.map->timestamp;

	if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
		panic("vm_fault: fault on nofault entry, addr: %lx",
		    (u_long)vaddr);
	}

	/*
	 * Make a reference to this object to prevent its disposal while we
	 * are messing with it.  Once we have the reference, the map is free
	 * to be diddled.  Since objects reference their shadows (and copies),
	 * they will stay around as well.
	 *
	 * Bump the paging-in-progress count to prevent size changes (e.g. 
	 * truncation operations) during I/O.  This must be done after
	 * obtaining the vnode lock in order to avoid possible deadlocks.
	 */
	VM_OBJECT_WLOCK(fs.first_object);
	vm_object_reference_locked(fs.first_object);
	vm_object_pip_add(fs.first_object, 1);

	fs.lookup_still_valid = TRUE;

	if (wired)
		fault_type = prot | (fault_type & VM_PROT_COPY);

	fs.first_m = NULL;

	/*
	 * Search for the page at object/offset.
	 */
	fs.object = fs.first_object;
	fs.pindex = fs.first_pindex;
	while (TRUE) {
		/*
		 * If the object is dead, we stop here
		 */
		if (fs.object->flags & OBJ_DEAD) {
			unlock_and_deallocate(&fs);
			return (KERN_PROTECTION_FAILURE);
		}

		/*
		 * See if page is resident
		 */
		fs.m = vm_page_lookup(fs.object, fs.pindex);
		if (fs.m != NULL) {
			/* 
			 * check for page-based copy on write.
			 * We check fs.object == fs.first_object so
			 * as to ensure the legacy COW mechanism is
			 * used when the page in question is part of
			 * a shadow object.  Otherwise, vm_page_cowfault()
			 * removes the page from the backing object, 
			 * which is not what we want.
			 */
			vm_page_lock(fs.m);
			if ((fs.m->cow) && 
			    (fault_type & VM_PROT_WRITE) &&
			    (fs.object == fs.first_object)) {
				vm_page_cowfault(fs.m);
				unlock_and_deallocate(&fs);
				goto RetryFault;
			}

			/*
			 * Wait/Retry if the page is busy.  We have to do this
			 * if the page is busy via either VPO_BUSY or 
			 * vm_page_t->busy because the vm_pager may be using
			 * vm_page_t->busy for pageouts ( and even pageins if
			 * it is the vnode pager ), and we could end up trying
			 * to pagein and pageout the same page simultaneously.
			 *
			 * We can theoretically allow the busy case on a read
			 * fault if the page is marked valid, but since such
			 * pages are typically already pmap'd, putting that
			 * special case in might be more effort then it is 
			 * worth.  We cannot under any circumstances mess
			 * around with a vm_page_t->busy page except, perhaps,
			 * to pmap it.
			 */
			if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it. 
				 */
				vm_page_aflag_set(fs.m, PGA_REFERENCED);
				vm_page_unlock(fs.m);
				if (fs.object != fs.first_object) {
					if (!VM_OBJECT_TRYWLOCK(
					    fs.first_object)) {
						VM_OBJECT_WUNLOCK(fs.object);
						VM_OBJECT_WLOCK(fs.first_object);
						VM_OBJECT_WLOCK(fs.object);
					}
					vm_page_lock(fs.first_m);
					vm_page_free(fs.first_m);
					vm_page_unlock(fs.first_m);
					vm_object_pip_wakeup(fs.first_object);
					VM_OBJECT_WUNLOCK(fs.first_object);
					fs.first_m = NULL;
				}
				unlock_map(&fs);
				if (fs.m == vm_page_lookup(fs.object,
				    fs.pindex)) {
					vm_page_sleep_if_busy(fs.m, TRUE,
					    "vmpfw");
				}
				vm_object_pip_wakeup(fs.object);
				VM_OBJECT_WUNLOCK(fs.object);
				PCPU_INC(cnt.v_intrans);
				vm_object_deallocate(fs.first_object);
				goto RetryFault;
			}
			vm_page_remque(fs.m);
			vm_page_unlock(fs.m);

			/*
			 * Mark page busy for other processes, and the 
			 * pagedaemon.  If it still isn't completely valid
			 * (readable), jump to readrest, else break-out ( we
			 * found the page ).
			 */
			vm_page_busy(fs.m);
			if (fs.m->valid != VM_PAGE_BITS_ALL)
				goto readrest;
			break;
		}

		/*
		 * Page is not resident, If this is the search termination
		 * or the pager might contain the page, allocate a new page.
		 */
		if (TRYPAGER || fs.object == fs.first_object) {
			if (fs.pindex >= fs.object->size) {
				unlock_and_deallocate(&fs);
				return (KERN_PROTECTION_FAILURE);
			}

			/*
			 * Allocate a new page for this object/offset pair.
			 *
			 * Unlocked read of the p_flag is harmless. At
			 * worst, the P_KILLED might be not observed
			 * there, and allocation can fail, causing
			 * restart and new reading of the p_flag.
			 */
			fs.m = NULL;
			if (!vm_page_count_severe() || P_KILLED(curproc)) {
#if VM_NRESERVLEVEL > 0
				if ((fs.object->flags & OBJ_COLORED) == 0) {
					fs.object->flags |= OBJ_COLORED;
					fs.object->pg_color = atop(vaddr) -
					    fs.pindex;
				}
#endif
				alloc_req = P_KILLED(curproc) ?
				    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL;
				if (fs.object->type != OBJT_VNODE &&
				    fs.object->backing_object == NULL)
					alloc_req |= VM_ALLOC_ZERO;
				fs.m = vm_page_alloc(fs.object, fs.pindex,
				    alloc_req);
			}
			if (fs.m == NULL) {
				unlock_and_deallocate(&fs);
				VM_WAITPFAULT;
				goto RetryFault;
			} else if (fs.m->valid == VM_PAGE_BITS_ALL)
				break;
		}

readrest:
		/*
		 * We have found a valid page or we have allocated a new page.
		 * The page thus may not be valid or may not be entirely 
		 * valid.
		 *
		 * Attempt to fault-in the page if there is a chance that the
		 * pager has it, and potentially fault in additional pages
		 * at the same time.
		 */
		if (TRYPAGER) {
			int rv;
			u_char behavior = vm_map_entry_behavior(fs.entry);

			if (behavior == MAP_ENTRY_BEHAV_RANDOM ||
			    P_KILLED(curproc)) {
				behind = 0;
				ahead = 0;
			} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
				behind = 0;
				ahead = atop(fs.entry->end - vaddr) - 1;
				if (ahead > VM_FAULT_READ_AHEAD_MAX)
					ahead = VM_FAULT_READ_AHEAD_MAX;
				if (fs.pindex == fs.entry->next_read)
					vm_fault_cache_behind(&fs,
					    VM_FAULT_READ_MAX);
			} else {
				/*
				 * If this is a sequential page fault, then
				 * arithmetically increase the number of pages
				 * in the read-ahead window.  Otherwise, reset
				 * the read-ahead window to its smallest size.
				 */
				behind = atop(vaddr - fs.entry->start);
				if (behind > VM_FAULT_READ_BEHIND)
					behind = VM_FAULT_READ_BEHIND;
				ahead = atop(fs.entry->end - vaddr) - 1;
				era = fs.entry->read_ahead;
				if (fs.pindex == fs.entry->next_read) {
					nera = era + behind;
					if (nera > VM_FAULT_READ_AHEAD_MAX)
						nera = VM_FAULT_READ_AHEAD_MAX;
					behind = 0;
					if (ahead > nera)
						ahead = nera;
					if (era == VM_FAULT_READ_AHEAD_MAX)
						vm_fault_cache_behind(&fs,
						    VM_FAULT_CACHE_BEHIND);
				} else if (ahead > VM_FAULT_READ_AHEAD_MIN)
					ahead = VM_FAULT_READ_AHEAD_MIN;
				if (era != ahead)
					fs.entry->read_ahead = ahead;
			}

			/*
			 * Call the pager to retrieve the data, if any, after
			 * releasing the lock on the map.  We hold a ref on
			 * fs.object and the pages are VPO_BUSY'd.
			 */
			unlock_map(&fs);

			if (fs.object->type == OBJT_VNODE) {
				vp = fs.object->handle;
				if (vp == fs.vp)
					goto vnode_locked;
				else if (fs.vp != NULL) {
					vput(fs.vp);
					fs.vp = NULL;
				}
				locked = VOP_ISLOCKED(vp);

				if (locked != LK_EXCLUSIVE)
					locked = LK_SHARED;
				/* Do not sleep for vnode lock while fs.m is busy */
				error = vget(vp, locked | LK_CANRECURSE |
				    LK_NOWAIT, curthread);
				if (error != 0) {
					vhold(vp);
					release_page(&fs);
					unlock_and_deallocate(&fs);
					error = vget(vp, locked | LK_RETRY |
					    LK_CANRECURSE, curthread);
					vdrop(vp);
					fs.vp = vp;
					KASSERT(error == 0,
					    ("vm_fault: vget failed"));
					goto RetryFault;
				}
				fs.vp = vp;
			}
vnode_locked:
			KASSERT(fs.vp == NULL || !fs.map->system_map,
			    ("vm_fault: vnode-backed object mapped by system map"));

			/*
			 * Now we find out if any other pages should be
			 * paged in at this time.  This routine checks to
			 * see if the pages surrounding this fault reside
			 * in the same object as the page for this fault.
			 * If they do, then they are faulted into that
			 * object as well.  The returned array "marray"
			 * contains the vm_page_t structs, one of which is
			 * the vm_page_t passed to the routine.  The
			 * reqpage return value is the index into marray
			 * of the vm_page_t passed to the routine.
			 *
			 * fs.m plus the additional pages are VPO_BUSY'd.
			 */
			faultcount = vm_fault_additional_pages(
			    fs.m, behind, ahead, marray, &reqpage);

			rv = faultcount ?
			    vm_pager_get_pages(fs.object, marray, faultcount,
				reqpage) : VM_PAGER_FAIL;

			if (rv == VM_PAGER_OK) {
				/*
				 * Found the page. Leave it busy while we play
				 * with it.
				 */

				/*
				 * Relookup in case pager changed page. Pager
				 * is responsible for disposition of old page
				 * if moved.
				 */
				fs.m = vm_page_lookup(fs.object, fs.pindex);
				if (!fs.m) {
					unlock_and_deallocate(&fs);
					goto RetryFault;
				}

				hardfault++;
				break; /* break to PAGE HAS BEEN FOUND */
			}
			/*
			 * Remove the bogus page (which does not exist at this
			 * object/offset); before doing so, we must get back
			 * our object lock to preserve our invariant.
			 *
			 * Also wake up any other process that may want to bring
			 * in this page.
			 *
			 * If this is the top-level object, we must leave the
			 * busy page to prevent another process from rushing
			 * past us, and inserting the page in that object at
			 * the same time that we are.
			 */
			if (rv == VM_PAGER_ERROR)
				printf("vm_fault: pager read error, pid %d (%s)\n",
				    curproc->p_pid, curproc->p_comm);
			/*
			 * Data outside the range of the pager or an I/O error
			 */
			/*
			 * XXX - the check for kernel_map is a kludge to work
			 * around having the machine panic on a kernel space
			 * fault w/ I/O error.
			 */
			if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) ||
				(rv == VM_PAGER_BAD)) {
				vm_page_lock(fs.m);
				vm_page_free(fs.m);
				vm_page_unlock(fs.m);
				fs.m = NULL;
				unlock_and_deallocate(&fs);
				return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE);
			}
			if (fs.object != fs.first_object) {
				vm_page_lock(fs.m);
				vm_page_free(fs.m);
				vm_page_unlock(fs.m);
				fs.m = NULL;
				/*
				 * XXX - we cannot just fall out at this
				 * point, m has been freed and is invalid!
				 */
			}
		}

		/*
		 * We get here if the object has a default pager (or unwiring)
		 * or the pager doesn't have the page.
		 */
		if (fs.object == fs.first_object)
			fs.first_m = fs.m;

		/*
		 * Move on to the next object.  Lock the next object before
		 * unlocking the current one.
		 */
		fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset);
		next_object = fs.object->backing_object;
		if (next_object == NULL) {
			/*
			 * If there's no object left, fill the page in the top
			 * object with zeros.
			 */
			if (fs.object != fs.first_object) {
				vm_object_pip_wakeup(fs.object);
				VM_OBJECT_WUNLOCK(fs.object);

				fs.object = fs.first_object;
				fs.pindex = fs.first_pindex;
				fs.m = fs.first_m;
				VM_OBJECT_WLOCK(fs.object);
			}
			fs.first_m = NULL;

			/*
			 * Zero the page if necessary and mark it valid.
			 */
			if ((fs.m->flags & PG_ZERO) == 0) {
				pmap_zero_page(fs.m);
			} else {
				PCPU_INC(cnt.v_ozfod);
			}
			PCPU_INC(cnt.v_zfod);
			fs.m->valid = VM_PAGE_BITS_ALL;
			break;	/* break to PAGE HAS BEEN FOUND */
		} else {
			KASSERT(fs.object != next_object,
			    ("object loop %p", next_object));
			VM_OBJECT_WLOCK(next_object);
			vm_object_pip_add(next_object, 1);
			if (fs.object != fs.first_object)
				vm_object_pip_wakeup(fs.object);
			VM_OBJECT_WUNLOCK(fs.object);
			fs.object = next_object;
		}
	}

	KASSERT((fs.m->oflags & VPO_BUSY) != 0,
	    ("vm_fault: not busy after main loop"));

	/*
	 * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock
	 * is held.]
	 */

	/*
	 * If the page is being written, but isn't already owned by the
	 * top-level object, we have to copy it into a new page owned by the
	 * top-level object.
	 */
	if (fs.object != fs.first_object) {
		/*
		 * We only really need to copy if we want to write it.
		 */
		if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
			/*
			 * This allows pages to be virtually copied from a 
			 * backing_object into the first_object, where the 
			 * backing object has no other refs to it, and cannot
			 * gain any more refs.  Instead of a bcopy, we just 
			 * move the page from the backing object to the 
			 * first object.  Note that we must mark the page 
			 * dirty in the first object so that it will go out 
			 * to swap when needed.
			 */
			is_first_object_locked = FALSE;
			if (
				/*
				 * Only one shadow object
				 */
				(fs.object->shadow_count == 1) &&
				/*
				 * No COW refs, except us
				 */
				(fs.object->ref_count == 1) &&
				/*
				 * No one else can look this object up
				 */
				(fs.object->handle == NULL) &&
				/*
				 * No other ways to look the object up
				 */
				((fs.object->type == OBJT_DEFAULT) ||
				 (fs.object->type == OBJT_SWAP)) &&
			    (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) &&
				/*
				 * We don't chase down the shadow chain
				 */
			    fs.object == fs.first_object->backing_object) {
				/*
				 * get rid of the unnecessary page
				 */
				vm_page_lock(fs.first_m);
				vm_page_free(fs.first_m);
				vm_page_unlock(fs.first_m);
				/*
				 * grab the page and put it into the 
				 * process's object.  The page is 
				 * automatically made dirty.
				 */
				vm_page_lock(fs.m);
				vm_page_rename(fs.m, fs.first_object, fs.first_pindex);
				vm_page_unlock(fs.m);
				vm_page_busy(fs.m);
				fs.first_m = fs.m;
				fs.m = NULL;
				PCPU_INC(cnt.v_cow_optim);
			} else {
				/*
				 * Oh, well, let's copy it.
				 */
				pmap_copy_page(fs.m, fs.first_m);
				fs.first_m->valid = VM_PAGE_BITS_ALL;
				if (wired && (fault_flags &
				    VM_FAULT_CHANGE_WIRING) == 0) {
					vm_page_lock(fs.first_m);
					vm_page_wire(fs.first_m);
					vm_page_unlock(fs.first_m);
					
					vm_page_lock(fs.m);
					vm_page_unwire(fs.m, FALSE);
					vm_page_unlock(fs.m);
				}
				/*
				 * We no longer need the old page or object.
				 */
				release_page(&fs);
			}
			/*
			 * fs.object != fs.first_object due to above 
			 * conditional
			 */
			vm_object_pip_wakeup(fs.object);
			VM_OBJECT_WUNLOCK(fs.object);
			/*
			 * Only use the new page below...
			 */
			fs.object = fs.first_object;
			fs.pindex = fs.first_pindex;
			fs.m = fs.first_m;
			if (!is_first_object_locked)
				VM_OBJECT_WLOCK(fs.object);
			PCPU_INC(cnt.v_cow_faults);
			curthread->td_cow++;
		} else {
			prot &= ~VM_PROT_WRITE;
		}
	}

	/*
	 * We must verify that the maps have not changed since our last
	 * lookup.
	 */
	if (!fs.lookup_still_valid) {
		vm_object_t retry_object;
		vm_pindex_t retry_pindex;
		vm_prot_t retry_prot;

		if (!vm_map_trylock_read(fs.map)) {
			release_page(&fs);
			unlock_and_deallocate(&fs);
			goto RetryFault;
		}
		fs.lookup_still_valid = TRUE;
		if (fs.map->timestamp != map_generation) {
			result = vm_map_lookup_locked(&fs.map, vaddr, fault_type,
			    &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired);

			/*
			 * If we don't need the page any longer, put it on the inactive
			 * list (the easiest thing to do here).  If no one needs it,
			 * pageout will grab it eventually.
			 */
			if (result != KERN_SUCCESS) {
				release_page(&fs);
				unlock_and_deallocate(&fs);

				/*
				 * If retry of map lookup would have blocked then
				 * retry fault from start.
				 */
				if (result == KERN_FAILURE)
					goto RetryFault;
				return (result);
			}
			if ((retry_object != fs.first_object) ||
			    (retry_pindex != fs.first_pindex)) {
				release_page(&fs);
				unlock_and_deallocate(&fs);
				goto RetryFault;
			}

			/*
			 * Check whether the protection has changed or the object has
			 * been copied while we left the map unlocked. Changing from
			 * read to write permission is OK - we leave the page
			 * write-protected, and catch the write fault. Changing from
			 * write to read permission means that we can't mark the page
			 * write-enabled after all.
			 */
			prot &= retry_prot;
		}
	}
	/*
	 * If the page was filled by a pager, update the map entry's
	 * last read offset.  Since the pager does not return the
	 * actual set of pages that it read, this update is based on
	 * the requested set.  Typically, the requested and actual
	 * sets are the same.
	 *
	 * XXX The following assignment modifies the map
	 * without holding a write lock on it.
	 */
	if (hardfault)
		fs.entry->next_read = fs.pindex + faultcount - reqpage;

	if ((prot & VM_PROT_WRITE) != 0 ||
	    (fault_flags & VM_FAULT_DIRTY) != 0) {
		vm_object_set_writeable_dirty(fs.object);

		/*
		 * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
		 * if the page is already dirty to prevent data written with
		 * the expectation of being synced from not being synced.
		 * Likewise if this entry does not request NOSYNC then make
		 * sure the page isn't marked NOSYNC.  Applications sharing
		 * data should use the same flags to avoid ping ponging.
		 */
		if (fs.entry->eflags & MAP_ENTRY_NOSYNC) {
			if (fs.m->dirty == 0)
				fs.m->oflags |= VPO_NOSYNC;
		} else {
			fs.m->oflags &= ~VPO_NOSYNC;
		}

		/*
		 * If the fault is a write, we know that this page is being
		 * written NOW so dirty it explicitly to save on 
		 * pmap_is_modified() calls later.
		 *
		 * Also tell the backing pager, if any, that it should remove
		 * any swap backing since the page is now dirty.
		 */
		if (((fault_type & VM_PROT_WRITE) != 0 &&
		    (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) ||
		    (fault_flags & VM_FAULT_DIRTY) != 0) {
			vm_page_dirty(fs.m);
			vm_pager_page_unswapped(fs.m);
		}
	}

	/*
	 * Page had better still be busy
	 */
	KASSERT(fs.m->oflags & VPO_BUSY,
		("vm_fault: page %p not busy!", fs.m));
	/*
	 * Page must be completely valid or it is not fit to
	 * map into user space.  vm_pager_get_pages() ensures this.
	 */
	KASSERT(fs.m->valid == VM_PAGE_BITS_ALL,
	    ("vm_fault: page %p partially invalid", fs.m));
	VM_OBJECT_WUNLOCK(fs.object);

	/*
	 * Put this page into the physical map.  We had to do the unlock above
	 * because pmap_enter() may sleep.  We don't put the page
	 * back on the active queue until later so that the pageout daemon
	 * won't find it (yet).
	 */
	pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired);
	if ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 && wired == 0)
		vm_fault_prefault(fs.map->pmap, vaddr, fs.entry);
	VM_OBJECT_WLOCK(fs.object);
	vm_page_lock(fs.m);

	/*
	 * If the page is not wired down, then put it where the pageout daemon
	 * can find it.
	 */
	if (fault_flags & VM_FAULT_CHANGE_WIRING) {
		if (wired)
			vm_page_wire(fs.m);
		else
			vm_page_unwire(fs.m, 1);
	} else
		vm_page_activate(fs.m);
	if (m_hold != NULL) {
		*m_hold = fs.m;
		vm_page_hold(fs.m);
	}
	vm_page_unlock(fs.m);
	vm_page_wakeup(fs.m);

	/*
	 * Unlock everything, and return
	 */
	unlock_and_deallocate(&fs);
	if (hardfault) {
		PCPU_INC(cnt.v_io_faults);
		curthread->td_ru.ru_majflt++;
	} else 
		curthread->td_ru.ru_minflt++;

	return (KERN_SUCCESS);
}
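
The read-ahead sizing in the fault handler above is plain arithmetic on the map entry's read_ahead value. Below is a minimal user-space sketch of that arithmetic, assuming stand-in constants and a hypothetical readahead_window() helper rather than the kernel's macros; it only illustrates how the window grows on consecutive sequential faults and collapses otherwise.

/*
 * User-space sketch of the read-ahead window arithmetic; the constants
 * and the helper name are stand-ins, not kernel API.
 */
#include <stdio.h>

#define FAULT_READ_BEHIND	8	/* stand-in for VM_FAULT_READ_BEHIND */
#define FAULT_READ_AHEAD_MIN	7	/* stand-in for VM_FAULT_READ_AHEAD_MIN */
#define FAULT_READ_AHEAD_MAX	64	/* stand-in for VM_FAULT_READ_AHEAD_MAX */

/*
 * pages_behind: pages between the entry start and the faulting page.
 * pages_left:   pages between the faulting page and the entry end, minus one.
 * sequential:   non-zero if this fault hit the entry's next_read page.
 * era:          the entry's previous read-ahead value.
 * Returns the new read-ahead count; *behindp is cleared on sequential faults.
 */
static int
readahead_window(int pages_behind, int pages_left, int sequential, int era,
    int *behindp)
{
	int ahead, behind, nera;

	behind = pages_behind > FAULT_READ_BEHIND ?
	    FAULT_READ_BEHIND : pages_behind;
	ahead = pages_left;
	if (sequential) {
		/* Grow the window arithmetically, clamped to the maximum. */
		nera = era + behind;
		if (nera > FAULT_READ_AHEAD_MAX)
			nera = FAULT_READ_AHEAD_MAX;
		behind = 0;
		if (ahead > nera)
			ahead = nera;
	} else if (ahead > FAULT_READ_AHEAD_MIN) {
		/* Non-sequential fault: reset to the smallest window. */
		ahead = FAULT_READ_AHEAD_MIN;
	}
	*behindp = behind;
	return (ahead);
}

int
main(void)
{
	int ahead, behind, era = 0;

	/* Three consecutive sequential faults: the window keeps growing. */
	for (int i = 0; i < 3; i++) {
		ahead = readahead_window(16, 100, 1, era, &behind);
		printf("fault %d: behind=%d ahead=%d\n", i, behind, ahead);
		era = ahead;
	}
	return (0);
}
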
Example No. 28
/*
 * In-kernel implementation of execve().  All arguments are assumed to be
 * userspace pointers from the passed thread.
 */
static int
do_execve(struct thread *td, struct image_args *args, struct mac *mac_p)
{
	struct proc *p = td->td_proc;
	struct nameidata nd;
	struct ucred *oldcred;
	struct uidinfo *euip = NULL;
	register_t *stack_base;
	int error, i;
	struct image_params image_params, *imgp;
	struct vattr attr;
	int (*img_first)(struct image_params *);
	struct pargs *oldargs = NULL, *newargs = NULL;
	struct sigacts *oldsigacts = NULL, *newsigacts = NULL;
#ifdef KTRACE
	struct vnode *tracevp = NULL;
	struct ucred *tracecred = NULL;
#endif
	struct vnode *oldtextvp = NULL, *newtextvp;
	int credential_changing;
	int textset;
#ifdef MAC
	struct label *interpvplabel = NULL;
	int will_transition;
#endif
#ifdef HWPMC_HOOKS
	struct pmckern_procexec pe;
#endif
	static const char fexecv_proc_title[] = "(fexecv)";

	imgp = &image_params;

	/*
	 * Lock the process and set the P_INEXEC flag to indicate that
	 * it should be left alone until we're done here.  This is
	 * necessary to avoid race conditions - e.g. in ptrace() -
	 * that might allow a local user to illicitly obtain elevated
	 * privileges.
	 */
	PROC_LOCK(p);
	KASSERT((p->p_flag & P_INEXEC) == 0,
	    ("%s(): process already has P_INEXEC flag", __func__));
	p->p_flag |= P_INEXEC;
	PROC_UNLOCK(p);

	/*
	 * Initialize part of the common data
	 */
	bzero(imgp, sizeof(*imgp));
	imgp->proc = p;
	imgp->attr = &attr;
	imgp->args = args;
	oldcred = p->p_ucred;

#ifdef MAC
	error = mac_execve_enter(imgp, mac_p);
	if (error)
		goto exec_fail;
#endif

	/*
	 * Translate the file name. namei() returns a vnode pointer
	 *	in ni_vp among other things.
	 *
	 * XXXAUDIT: It would be desirable to also audit the name of the
	 * interpreter if this is an interpreted binary.
	 */
	if (args->fname != NULL) {
		NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
		    | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
	}

	SDT_PROBE1(proc, , , exec, args->fname);

interpret:
	if (args->fname != NULL) {
#ifdef CAPABILITY_MODE
		/*
		 * While capability mode can't reach this point via direct
		 * path arguments to execve(), we also don't allow
		 * interpreters to be used in capability mode (for now).
		 * Catch indirect lookups and return a permissions error.
		 */
		if (IN_CAPABILITY_MODE(td)) {
			error = ECAPMODE;
			goto exec_fail;
		}
#endif
		error = namei(&nd);
		if (error)
			goto exec_fail;

		newtextvp = nd.ni_vp;
		imgp->vp = newtextvp;
	} else {
		AUDIT_ARG_FD(args->fd);
		/*
		 * Descriptors opened only with O_EXEC or O_RDONLY are allowed.
		 */
		error = fgetvp_exec(td, args->fd, &cap_fexecve_rights, &newtextvp);
		if (error)
			goto exec_fail;
		vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY);
		AUDIT_ARG_VNODE1(newtextvp);
		imgp->vp = newtextvp;
	}

	/*
	 * Check file permissions (also 'opens' file)
	 */
	error = exec_check_permissions(imgp);
	if (error)
		goto exec_fail_dealloc;

	imgp->object = imgp->vp->v_object;
	if (imgp->object != NULL)
		vm_object_reference(imgp->object);

	/*
	 * Set VV_TEXT now so no one can write to the executable while we're
	 * activating it.
	 *
	 * Remember if this was set before and unset it in case this is not
	 * actually an executable image.
	 */
	textset = VOP_IS_TEXT(imgp->vp);
	VOP_SET_TEXT(imgp->vp);

	error = exec_map_first_page(imgp);
	if (error)
		goto exec_fail_dealloc;

	imgp->proc->p_osrel = 0;
	imgp->proc->p_fctl0 = 0;

	/*
	 * Implement image setuid/setgid.
	 *
	 * Determine new credentials before attempting image activators
	 * so that it can be used by process_exec handlers to determine
	 * credential/setid changes.
	 *
	 * Don't honor setuid/setgid if the filesystem prohibits it or if
	 * the process is being traced.
	 *
	 * We disable setuid/setgid/etc in capability mode on the basis
	 * that most setugid applications are not written with that
	 * environment in mind, and will therefore almost certainly operate
	 * incorrectly. In principle there's no reason that setugid
	 * applications might not be useful in capability mode, so we may want
	 * to reconsider this conservative design choice in the future.
	 *
	 * XXXMAC: For the time being, use NOSUID to also prohibit
	 * transitions on the file system.
	 */
	credential_changing = 0;
	credential_changing |= (attr.va_mode & S_ISUID) &&
	    oldcred->cr_uid != attr.va_uid;
	credential_changing |= (attr.va_mode & S_ISGID) &&
	    oldcred->cr_gid != attr.va_gid;
#ifdef MAC
	will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
	    interpvplabel, imgp);
	credential_changing |= will_transition;
#endif

	/* Don't inherit PROC_PDEATHSIG_CTL value if setuid/setgid. */
	if (credential_changing)
		imgp->proc->p_pdeathsig = 0;

	if (credential_changing &&
#ifdef CAPABILITY_MODE
	    ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
#endif
	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
	    (p->p_flag & P_TRACED) == 0) {
		imgp->credential_setid = true;
		VOP_UNLOCK(imgp->vp, 0);
		imgp->newcred = crdup(oldcred);
		if (attr.va_mode & S_ISUID) {
			euip = uifind(attr.va_uid);
			change_euid(imgp->newcred, euip);
		}
		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
		if (attr.va_mode & S_ISGID)
			change_egid(imgp->newcred, attr.va_gid);
		/*
		 * Implement correct POSIX saved-id behavior.
		 *
		 * XXXMAC: Note that the current logic will save the
		 * uid and gid if a MAC domain transition occurs, even
		 * though maybe it shouldn't.
		 */
		change_svuid(imgp->newcred, imgp->newcred->cr_uid);
		change_svgid(imgp->newcred, imgp->newcred->cr_gid);
	} else {
		/*
		 * Implement correct POSIX saved-id behavior.
		 *
		 * XXX: It's not clear that the existing behavior is
		 * POSIX-compliant.  A number of sources indicate that the
		 * saved uid/gid should only be updated if the new ruid is
		 * not equal to the old ruid, or the new euid is not equal
		 * to the old euid and the new euid is not equal to the old
		 * ruid.  The FreeBSD code always updates the saved uid/gid.
		 * Also, this code uses the new (replaced) euid and egid as
		 * the source, which may or may not be the right ones to use.
		 */
		if (oldcred->cr_svuid != oldcred->cr_uid ||
		    oldcred->cr_svgid != oldcred->cr_gid) {
			VOP_UNLOCK(imgp->vp, 0);
			imgp->newcred = crdup(oldcred);
			vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
			change_svuid(imgp->newcred, imgp->newcred->cr_uid);
			change_svgid(imgp->newcred, imgp->newcred->cr_gid);
		}
	}
	/* The new credentials are installed into the process later. */

	/*
	 * Do our best to calculate the full path to the image file.
	 */
	if (args->fname != NULL && args->fname[0] == '/')
		imgp->execpath = args->fname;
	else {
		VOP_UNLOCK(imgp->vp, 0);
		if (vn_fullpath(td, imgp->vp, &imgp->execpath,
		    &imgp->freepath) != 0)
			imgp->execpath = args->fname;
		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
	}

	/*
	 *	If the current process has a special image activator it
	 *	wants to try first, call it.   For example, emulating shell
	 *	scripts differently.
	 */
	error = -1;
	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
		error = img_first(imgp);

	/*
	 *	Loop through the list of image activators, calling each one.
	 *	An activator returns -1 if there is no match, 0 on success,
	 *	and an error otherwise.
	 */
	for (i = 0; error == -1 && execsw[i]; ++i) {
		if (execsw[i]->ex_imgact == NULL ||
		    execsw[i]->ex_imgact == img_first) {
			continue;
		}
		error = (*execsw[i]->ex_imgact)(imgp);
	}

	if (error) {
		if (error == -1) {
			if (textset == 0)
				VOP_UNSET_TEXT(imgp->vp);
			error = ENOEXEC;
		}
		goto exec_fail_dealloc;
	}

	/*
	 * Special interpreter operation: clean up and loop back up to try
	 * to activate the interpreter.
	 */
	if (imgp->interpreted) {
		exec_unmap_first_page(imgp);
		/*
		 * VV_TEXT needs to be unset for scripts.  There is a short
		 * period before we determine that something is a script where
		 * VV_TEXT will be set. The vnode lock is held over this
		 * entire period so nothing should illegitimately be blocked.
		 */
		VOP_UNSET_TEXT(imgp->vp);
		/* free name buffer and old vnode */
		if (args->fname != NULL)
			NDFREE(&nd, NDF_ONLY_PNBUF);
#ifdef MAC
		mac_execve_interpreter_enter(newtextvp, &interpvplabel);
#endif
		if (imgp->opened) {
			VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td);
			imgp->opened = 0;
		}
		vput(newtextvp);
		vm_object_deallocate(imgp->object);
		imgp->object = NULL;
		imgp->credential_setid = false;
		if (imgp->newcred != NULL) {
			crfree(imgp->newcred);
			imgp->newcred = NULL;
		}
		imgp->execpath = NULL;
		free(imgp->freepath, M_TEMP);
		imgp->freepath = NULL;
		/* set new name to that of the interpreter */
		NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
		    UIO_SYSSPACE, imgp->interpreter_name, td);
		args->fname = imgp->interpreter_name;
		goto interpret;
	}

	/*
	 * NB: We unlock the vnode here because it is believed that none
	 * of the sv_copyout_strings/sv_fixup operations require the vnode.
	 */
	VOP_UNLOCK(imgp->vp, 0);

	if (disallow_high_osrel &&
	    P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) {
		error = ENOEXEC;
		uprintf("Osrel %d for image %s too high\n", p->p_osrel,
		    imgp->execpath != NULL ? imgp->execpath : "<unresolved>");
		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
		goto exec_fail_dealloc;
	}

	/* ABI enforces the use of Capsicum. Switch into capabilities mode. */
	if (SV_PROC_FLAG(p, SV_CAPSICUM))
		sys_cap_enter(td, NULL);

	/*
	 * Copy out strings (args and env) and initialize stack base.
	 */
	stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);

	/*
	 * Stack setup.
	 */
	error = (*p->p_sysent->sv_fixup)(&stack_base, imgp);
	if (error != 0) {
		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
		goto exec_fail_dealloc;
	}

	if (args->fdp != NULL) {
		/* Install a brand new file descriptor table. */
		fdinstall_remapped(td, args->fdp);
		args->fdp = NULL;
	} else {
		/*
		 * Keep on using the existing file descriptor table. For
		 * security and other reasons, the file descriptor table
		 * cannot be shared after an exec.
		 */
		fdunshare(td);
		/* close files on exec */
		fdcloseexec(td);
	}

	/*
	 * Malloc things before we need locks.
	 */
	i = exec_args_get_begin_envv(imgp->args) - imgp->args->begin_argv;
	/* Cache arguments if they fit inside our allowance */
	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
		newargs = pargs_alloc(i);
		bcopy(imgp->args->begin_argv, newargs->ar_args, i);
	}

	/*
	 * For security and other reasons, signal handlers cannot
	 * be shared after an exec. The new process gets a copy of the old
	 * handlers. In execsigs(), the new process will have its signals
	 * reset.
	 */
	if (sigacts_shared(p->p_sigacts)) {
		oldsigacts = p->p_sigacts;
		newsigacts = sigacts_alloc();
		sigacts_copy(newsigacts, oldsigacts);
	}

	vn_lock(imgp->vp, LK_SHARED | LK_RETRY);

	PROC_LOCK(p);
	if (oldsigacts)
		p->p_sigacts = newsigacts;
	/* Stop profiling */
	stopprofclock(p);

	/* reset caught signals */
	execsigs(p);

	/* name this process - nameiexec(p, ndp) */
	bzero(p->p_comm, sizeof(p->p_comm));
	if (args->fname)
		bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
		    min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
	else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0)
		bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
	bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
#ifdef KTR
	sched_clear_tdname(td);
#endif

	/*
	 * mark as execed, wake up the process that vforked (if any) and tell
	 * it that it now has its own resources back
	 */
	p->p_flag |= P_EXEC;
	if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0)
		p->p_flag2 &= ~P2_NOTRACE;
	if (p->p_flag & P_PPWAIT) {
		p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
		cv_broadcast(&p->p_pwait);
		/* STOPs are no longer ignored, arrange for AST */
		signotify(td);
	}

	/*
	 * Implement image setuid/setgid installation.
	 */
	if (imgp->credential_setid) {
		/*
		 * Turn off syscall tracing for set-id programs, except for
		 * root.  Record any set-id flags first to make sure that
		 * we do not regain any tracing during a possible block.
		 */
		setsugid(p);

#ifdef KTRACE
		if (p->p_tracecred != NULL &&
		    priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED))
			ktrprocexec(p, &tracecred, &tracevp);
#endif
		/*
		 * Close any file descriptors 0..2 that reference procfs,
		 * then make sure file descriptors 0..2 are in use.
		 *
		 * Both fdsetugidsafety() and fdcheckstd() may call functions
		 * taking sleepable locks, so temporarily drop our locks.
		 */
		PROC_UNLOCK(p);
		VOP_UNLOCK(imgp->vp, 0);
		fdsetugidsafety(td);
		error = fdcheckstd(td);
		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
		if (error != 0)
			goto exec_fail_dealloc;
		PROC_LOCK(p);
#ifdef MAC
		if (will_transition) {
			mac_vnode_execve_transition(oldcred, imgp->newcred,
			    imgp->vp, interpvplabel, imgp);
		}
#endif
	} else {
		if (oldcred->cr_uid == oldcred->cr_ruid &&
		    oldcred->cr_gid == oldcred->cr_rgid)
			p->p_flag &= ~P_SUGID;
	}
	/*
	 * Set the new credentials.
	 */
	if (imgp->newcred != NULL) {
		proc_set_cred(p, imgp->newcred);
		crfree(oldcred);
		oldcred = NULL;
	}

	/*
	 * Store the vp for use in procfs.  This vnode was referenced by namei
	 * or fgetvp_exec.
	 */
	oldtextvp = p->p_textvp;
	p->p_textvp = newtextvp;

#ifdef KDTRACE_HOOKS
	/*
	 * Tell the DTrace fasttrap provider about the exec if it
	 * has declared an interest.
	 */
	if (dtrace_fasttrap_exec)
		dtrace_fasttrap_exec(p);
#endif

	/*
	 * Notify others that we exec'd, and clear the P_INEXEC flag
	 * as we're now a bona fide freshly-execed process.
	 */
	KNOTE_LOCKED(p->p_klist, NOTE_EXEC);
	p->p_flag &= ~P_INEXEC;

	/* clear "fork but no exec" flag, as we _are_ execing */
	p->p_acflag &= ~AFORK;

	/*
	 * Free any previous argument cache and replace it with
	 * the new argument cache, if any.
	 */
	oldargs = p->p_args;
	p->p_args = newargs;
	newargs = NULL;

	PROC_UNLOCK(p);

#ifdef	HWPMC_HOOKS
	/*
	 * Check if system-wide sampling is in effect or if the
	 * current process is using PMCs.  If so, do exec() time
	 * processing.  This processing needs to happen AFTER the
	 * P_INEXEC flag is cleared.
	 */
	if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
		VOP_UNLOCK(imgp->vp, 0);
		pe.pm_credentialschanged = credential_changing;
		pe.pm_entryaddr = imgp->entry_addr;

		PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
	}
#endif

	/* Set values passed into the program in registers. */
	(*p->p_sysent->sv_setregs)(td, imgp, (u_long)(uintptr_t)stack_base);

	vfs_mark_atime(imgp->vp, td->td_ucred);

	SDT_PROBE1(proc, , , exec__success, args->fname);

exec_fail_dealloc:
	if (imgp->firstpage != NULL)
		exec_unmap_first_page(imgp);

	if (imgp->vp != NULL) {
		if (args->fname)
			NDFREE(&nd, NDF_ONLY_PNBUF);
		if (imgp->opened)
			VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
		if (error != 0)
			vput(imgp->vp);
		else
			VOP_UNLOCK(imgp->vp, 0);
	}

	if (imgp->object != NULL)
		vm_object_deallocate(imgp->object);

	free(imgp->freepath, M_TEMP);

	if (error == 0) {
		if (p->p_ptevents & PTRACE_EXEC) {
			PROC_LOCK(p);
			if (p->p_ptevents & PTRACE_EXEC)
				td->td_dbgflags |= TDB_EXEC;
			PROC_UNLOCK(p);
		}

		/*
		 * Stop the process here if its stop event mask has
		 * the S_EXEC bit set.
		 */
		STOPEVENT(p, S_EXEC, 0);
	} else {
exec_fail:
		/* we're done here, clear P_INEXEC */
		PROC_LOCK(p);
		p->p_flag &= ~P_INEXEC;
		PROC_UNLOCK(p);

		SDT_PROBE1(proc, , , exec__failure, error);
	}

	if (imgp->newcred != NULL && oldcred != NULL)
		crfree(imgp->newcred);

#ifdef MAC
	mac_execve_exit(imgp);
	mac_execve_interpreter_exit(interpvplabel);
#endif
	exec_free_args(args);

	/*
	 * Handle deferred decrement of ref counts.
	 */
	if (oldtextvp != NULL)
		vrele(oldtextvp);
#ifdef KTRACE
	if (tracevp != NULL)
		vrele(tracevp);
	if (tracecred != NULL)
		crfree(tracecred);
#endif
	pargs_drop(oldargs);
	pargs_drop(newargs);
	if (oldsigacts != NULL)
		sigacts_free(oldsigacts);
	if (euip != NULL)
		uifree(euip);

	if (error && imgp->vmspace_destroyed) {
		/* Sorry, there is no process left to return to; exit gracefully. */
		exit1(td, 0, SIGABRT);
		/* NOT REACHED */
	}

#ifdef KTRACE
	if (error == 0)
		ktrprocctor(p);
#endif

	/*
	 * We don't want cpu_set_syscall_retval() to overwrite any of
	 * the register values put in place by exec_setregs().
	 * Implementations of cpu_set_syscall_retval() will leave
	 * registers unmodified when returning EJUSTRETURN.
	 */
	return (error == 0 ? EJUSTRETURN : error);
}
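
The credential_changing test above boils down to comparing the image's setuid/setgid mode bits and owner against the current effective ids; the kernel additionally consults MNT_NOSUID, P_TRACED, capability mode and MAC before honoring them. The following is a minimal user-space sketch of just that comparison, assuming a hypothetical would_change_creds() helper and the standard <sys/stat.h> bits.

/*
 * User-space sketch of the setid comparison only; it deliberately omits the
 * filesystem, tracing and MAC checks that the kernel performs.
 */
#include <sys/stat.h>

#include <stdio.h>
#include <unistd.h>

static int
would_change_creds(const struct stat *st, uid_t euid, gid_t egid)
{
	int changing = 0;

	/* Setuid image owned by someone other than the current euid. */
	changing |= (st->st_mode & S_ISUID) && st->st_uid != euid;
	/* Setgid image whose group differs from the current egid. */
	changing |= (st->st_mode & S_ISGID) && st->st_gid != egid;
	return (changing);
}

int
main(int argc, char **argv)
{
	struct stat st;

	if (argc < 2 || stat(argv[1], &st) != 0)
		return (1);
	printf("%s %s change credentials on exec\n", argv[1],
	    would_change_creds(&st, geteuid(), getegid()) ?
	    "would" : "would not");
	return (0);
}
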
Example No. 29
static int
link_elf_load_file(linker_class_t cls, const char *filename,
    linker_file_t *result)
{
	struct nameidata nd;
	struct thread *td = curthread;	/* XXX */
	Elf_Ehdr *hdr;
	Elf_Shdr *shdr;
	Elf_Sym *es;
	int nbytes, i, j;
	vm_offset_t mapbase;
	size_t mapsize;
	int error = 0;
	ssize_t resid;
	int flags;
	elf_file_t ef;
	linker_file_t lf;
	int symtabindex;
	int symstrindex;
	int shstrindex;
	int nsym;
	int pb, rl, ra;
	int alignmask;

	shdr = NULL;
	lf = NULL;
	mapsize = 0;
	hdr = NULL;

	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
	flags = FREAD;
	error = vn_open(&nd, &flags, 0, NULL);
	if (error)
		return error;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (nd.ni_vp->v_type != VREG) {
		error = ENOEXEC;
		goto out;
	}
#ifdef MAC
	error = mac_kld_check_load(td->td_ucred, nd.ni_vp);
	if (error) {
		goto out;
	}
#endif

	/* Read the elf header from the file. */
	hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK);
	error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)hdr, sizeof(*hdr), 0,
	    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
	    &resid, td);
	if (error)
		goto out;
	if (resid != 0){
		error = ENOEXEC;
		goto out;
	}

	if (!IS_ELF(*hdr)) {
		error = ENOEXEC;
		goto out;
	}

	if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS
	    || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
		link_elf_error(filename, "Unsupported file layout");
		error = ENOEXEC;
		goto out;
	}
	if (hdr->e_ident[EI_VERSION] != EV_CURRENT
	    || hdr->e_version != EV_CURRENT) {
		link_elf_error(filename, "Unsupported file version");
		error = ENOEXEC;
		goto out;
	}
	if (hdr->e_type != ET_REL) {
		error = ENOSYS;
		goto out;
	}
	if (hdr->e_machine != ELF_TARG_MACH) {
		link_elf_error(filename, "Unsupported machine");
		error = ENOEXEC;
		goto out;
	}

	lf = linker_make_file(filename, &link_elf_class);
	if (!lf) {
		error = ENOMEM;
		goto out;
	}
	ef = (elf_file_t) lf;
	ef->nprogtab = 0;
	ef->e_shdr = 0;
	ef->nreltab = 0;
	ef->nrelatab = 0;

	/* Allocate and read in the section header */
	nbytes = hdr->e_shnum * hdr->e_shentsize;
	if (nbytes == 0 || hdr->e_shoff == 0 ||
	    hdr->e_shentsize != sizeof(Elf_Shdr)) {
		error = ENOEXEC;
		goto out;
	}
	shdr = malloc(nbytes, M_LINKER, M_WAITOK);
	ef->e_shdr = shdr;
	error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff,
	    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td);
	if (error)
		goto out;
	if (resid) {
		error = ENOEXEC;
		goto out;
	}

	/* Scan the section header for information and table sizing. */
	nsym = 0;
	symtabindex = -1;
	symstrindex = -1;
	for (i = 0; i < hdr->e_shnum; i++) {
		if (shdr[i].sh_size == 0)
			continue;
		switch (shdr[i].sh_type) {
		case SHT_PROGBITS:
		case SHT_NOBITS:
			ef->nprogtab++;
			break;
		case SHT_SYMTAB:
			nsym++;
			symtabindex = i;
			symstrindex = shdr[i].sh_link;
			break;
		case SHT_REL:
			ef->nreltab++;
			break;
		case SHT_RELA:
			ef->nrelatab++;
			break;
		case SHT_STRTAB:
			break;
		}
	}
	if (ef->nprogtab == 0) {
		link_elf_error(filename, "file has no contents");
		error = ENOEXEC;
		goto out;
	}
	if (nsym != 1) {
		/* Only allow one symbol table for now */
		link_elf_error(filename, "file has no valid symbol table");
		error = ENOEXEC;
		goto out;
	}
	if (symstrindex < 0 || symstrindex > hdr->e_shnum ||
	    shdr[symstrindex].sh_type != SHT_STRTAB) {
		link_elf_error(filename, "file has invalid symbol strings");
		error = ENOEXEC;
		goto out;
	}

	/* Allocate space for tracking the load chunks */
	if (ef->nprogtab != 0)
		ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab),
		    M_LINKER, M_WAITOK | M_ZERO);
	if (ef->nreltab != 0)
		ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab),
		    M_LINKER, M_WAITOK | M_ZERO);
	if (ef->nrelatab != 0)
		ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab),
		    M_LINKER, M_WAITOK | M_ZERO);

	if (symtabindex == -1)
		panic("lost symbol table index");
	/* Allocate space for and load the symbol table */
	ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym);
	ef->ddbsymtab = malloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK);
	error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->ddbsymtab,
	    shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset,
	    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
	    &resid, td);
	if (error)
		goto out;
	if (resid != 0){
		error = EINVAL;
		goto out;
	}

	if (symstrindex == -1)
		panic("lost symbol string index");
	/* Allocate space for and load the symbol strings */
	ef->ddbstrcnt = shdr[symstrindex].sh_size;
	ef->ddbstrtab = malloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK);
	error = vn_rdwr(UIO_READ, nd.ni_vp, ef->ddbstrtab,
	    shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset,
	    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
	    &resid, td);
	if (error)
		goto out;
	if (resid != 0){
		error = EINVAL;
		goto out;
	}

	/* Do we have a string table for the section names?  */
	shstrindex = -1;
	if (hdr->e_shstrndx != 0 &&
	    shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) {
		shstrindex = hdr->e_shstrndx;
		ef->shstrcnt = shdr[shstrindex].sh_size;
		ef->shstrtab = malloc(shdr[shstrindex].sh_size, M_LINKER,
		    M_WAITOK);
		error = vn_rdwr(UIO_READ, nd.ni_vp, ef->shstrtab,
		    shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset,
		    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
		    &resid, td);
		if (error)
			goto out;
		if (resid != 0){
			error = EINVAL;
			goto out;
		}
	}

	/* Size up code/data(progbits) and bss(nobits). */
	alignmask = 0;
	for (i = 0; i < hdr->e_shnum; i++) {
		if (shdr[i].sh_size == 0)
			continue;
		switch (shdr[i].sh_type) {
		case SHT_PROGBITS:
		case SHT_NOBITS:
			alignmask = shdr[i].sh_addralign - 1;
			mapsize += alignmask;
			mapsize &= ~alignmask;
			mapsize += shdr[i].sh_size;
			break;
		}
	}

	/*
	 * We know how much space we need for the text/data/bss/etc.
	 * This stuff needs to be in a single chunk so that profiling etc.
	 * can get the bounds and gdb can associate offsets with modules.
	 */
	ef->object = vm_object_allocate(OBJT_DEFAULT,
	    round_page(mapsize) >> PAGE_SHIFT);
	if (ef->object == NULL) {
		error = ENOMEM;
		goto out;
	}
	ef->address = (caddr_t) vm_map_min(kernel_map);

	/*
	 * In order to satisfy amd64's architectural requirements on the
	 * location of code and data in the kernel's address space, request a
	 * mapping that is above the kernel.  
	 */
#ifdef __amd64__
	mapbase = KERNBASE;
#else
	mapbase = VM_MIN_KERNEL_ADDRESS;
#endif
	error = vm_map_find(kernel_map, ef->object, 0, &mapbase,
	    round_page(mapsize), 0, VMFS_OPTIMAL_SPACE, VM_PROT_ALL,
	    VM_PROT_ALL, 0);
	if (error) {
		vm_object_deallocate(ef->object);
		ef->object = 0;
		goto out;
	}

	/* Wire the pages */
	error = vm_map_wire(kernel_map, mapbase,
	    mapbase + round_page(mapsize),
	    VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
	if (error != KERN_SUCCESS) {
		error = ENOMEM;
		goto out;
	}

	/* Inform the kld system about the situation */
	lf->address = ef->address = (caddr_t)mapbase;
	lf->size = mapsize;

	/*
	 * Now load code/data(progbits), zero bss(nobits), allocate space for
	 * and load relocs
	 */
	pb = 0;
	rl = 0;
	ra = 0;
	alignmask = 0;
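	/*
	 * pb, rl and ra index the next free slots in progtab, reltab and
	 * relatab, respectively, as the section headers are walked again.
	 */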
	for (i = 0; i < hdr->e_shnum; i++) {
		if (shdr[i].sh_size == 0)
			continue;
		switch (shdr[i].sh_type) {
		case SHT_PROGBITS:
		case SHT_NOBITS:
			alignmask = shdr[i].sh_addralign - 1;
			mapbase += alignmask;
			mapbase &= ~alignmask;
			if (ef->shstrtab && shdr[i].sh_name != 0)
				ef->progtab[pb].name =
				    ef->shstrtab + shdr[i].sh_name;
			else if (shdr[i].sh_type == SHT_PROGBITS)
				ef->progtab[pb].name = "<<PROGBITS>>";
			else
				ef->progtab[pb].name = "<<NOBITS>>";
			if (ef->progtab[pb].name != NULL && 
			    !strcmp(ef->progtab[pb].name, DPCPU_SETNAME))
				ef->progtab[pb].addr =
				    dpcpu_alloc(shdr[i].sh_size);
#ifdef VIMAGE
			else if (ef->progtab[pb].name != NULL &&
			    !strcmp(ef->progtab[pb].name, VNET_SETNAME))
				ef->progtab[pb].addr =
				    vnet_data_alloc(shdr[i].sh_size);
#endif
			else
				ef->progtab[pb].addr =
				    (void *)(uintptr_t)mapbase;
			if (ef->progtab[pb].addr == NULL) {
				error = ENOSPC;
				goto out;
			}
			ef->progtab[pb].size = shdr[i].sh_size;
			ef->progtab[pb].sec = i;
			if (shdr[i].sh_type == SHT_PROGBITS) {
				error = vn_rdwr(UIO_READ, nd.ni_vp,
				    ef->progtab[pb].addr,
				    shdr[i].sh_size, shdr[i].sh_offset,
				    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
				    NOCRED, &resid, td);
				if (error)
					goto out;
				if (resid != 0){
					error = EINVAL;
					goto out;
				}
				/* Initialize the per-cpu or vnet area. */
				if (ef->progtab[pb].addr != (void *)mapbase &&
				    !strcmp(ef->progtab[pb].name, DPCPU_SETNAME))
					dpcpu_copy(ef->progtab[pb].addr,
					    shdr[i].sh_size);
#ifdef VIMAGE
				else if (ef->progtab[pb].addr !=
				    (void *)mapbase &&
				    !strcmp(ef->progtab[pb].name, VNET_SETNAME))
					vnet_data_copy(ef->progtab[pb].addr,
					    shdr[i].sh_size);
#endif
			} else
				bzero(ef->progtab[pb].addr, shdr[i].sh_size);

			/* Update all symbol values with the offset. */
			for (j = 0; j < ef->ddbsymcnt; j++) {
				es = &ef->ddbsymtab[j];
				if (es->st_shndx != i)
					continue;
				es->st_value += (Elf_Addr)ef->progtab[pb].addr;
			}
			mapbase += shdr[i].sh_size;
			pb++;
			break;
		case SHT_REL:
			ef->reltab[rl].rel = malloc(shdr[i].sh_size, M_LINKER,
			    M_WAITOK);
			ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel);
			ef->reltab[rl].sec = shdr[i].sh_info;
			error = vn_rdwr(UIO_READ, nd.ni_vp,
			    (void *)ef->reltab[rl].rel,
			    shdr[i].sh_size, shdr[i].sh_offset,
			    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
			    &resid, td);
			if (error)
				goto out;
			if (resid != 0){
				error = EINVAL;
				goto out;
			}
			rl++;
			break;
		case SHT_RELA:
			ef->relatab[ra].rela = malloc(shdr[i].sh_size, M_LINKER,
			    M_WAITOK);
			ef->relatab[ra].nrela =
			    shdr[i].sh_size / sizeof(Elf_Rela);
			ef->relatab[ra].sec = shdr[i].sh_info;
			error = vn_rdwr(UIO_READ, nd.ni_vp,
			    (void *)ef->relatab[ra].rela,
			    shdr[i].sh_size, shdr[i].sh_offset,
			    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
			    &resid, td);
			if (error)
				goto out;
			if (resid != 0){
				error = EINVAL;
				goto out;
			}
			ra++;
			break;
		}
	}
	if (pb != ef->nprogtab)
		panic("lost progbits");
	if (rl != ef->nreltab)
		panic("lost reltab");
	if (ra != ef->nrelatab)
		panic("lost relatab");
	if (mapbase != (vm_offset_t)ef->address + mapsize)
		panic("mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n",
		    (u_long)mapbase, ef->address, (u_long)mapsize,
		    (u_long)(vm_offset_t)ef->address + mapsize);

	/* Local intra-module relocations */
	link_elf_reloc_local(lf);

	/* Pull in dependencies */
	VOP_UNLOCK(nd.ni_vp, 0);
	error = linker_load_dependencies(lf);
	vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
	if (error)
		goto out;

	/* External relocations */
	error = relocate_file(ef);
	if (error)
		goto out;

	/* Notify MD code that a module is being loaded. */
	error = elf_cpu_load_file(lf);
	if (error)
		goto out;

	*result = lf;

out:
	VOP_UNLOCK(nd.ni_vp, 0);
	vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
	if (error && lf)
		linker_file_unload(lf, LINKER_UNLOAD_FORCE);
	if (hdr)
		free(hdr, M_LINKER);

	return error;
}
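
The sizing pass above rounds the running total up to each PROGBITS/NOBITS section's alignment before adding the section size. A minimal stand-alone sketch of that accumulation, assuming a hypothetical struct sect and total_map_size() helper in place of the ELF types:

/*
 * Sketch of the mapsize accumulation; struct sect stands in for the Elf_Shdr
 * fields actually consulted (sh_size and sh_addralign).
 */
#include <stddef.h>
#include <stdio.h>

struct sect {
	size_t size;		/* sh_size */
	size_t addralign;	/* sh_addralign, a power of two */
};

static size_t
total_map_size(const struct sect *s, int nsect)
{
	size_t alignmask, mapsize = 0;
	int i;

	for (i = 0; i < nsect; i++) {
		if (s[i].size == 0)
			continue;
		/* Round the running total up to this section's alignment. */
		alignmask = s[i].addralign - 1;
		mapsize = (mapsize + alignmask) & ~alignmask;
		mapsize += s[i].size;
	}
	return (mapsize);
}

int
main(void)
{
	struct sect s[] = {
		{ 0x1234, 16 },	/* e.g. .text */
		{ 0x20,   8 },	/* e.g. .data */
		{ 0x100,  64 },	/* e.g. .bss  */
	};

	printf("mapsize = %#zx\n", total_map_size(s, 3));
	return (0);
}
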
Example No. 30
/*
 * Destroy old address space, and allocate a new stack.
 *	The new stack is only sgrowsiz large because it is grown
 *	automatically on a page fault.
 */
int
exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv)
{
	int error;
	struct proc *p = imgp->proc;
	struct vmspace *vmspace = p->p_vmspace;
	vm_object_t obj;
	struct rlimit rlim_stack;
	vm_offset_t sv_minuser, stack_addr;
	vm_map_t map;
	u_long ssiz;

	imgp->vmspace_destroyed = 1;
	imgp->sysent = sv;

	/* May be called with Giant held */
	EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp);

	/*
	 * Blow away the entire process VM if the address space is not
	 * shared; otherwise, create a new VM space so that other threads
	 * are not disrupted.
	 */
	map = &vmspace->vm_map;
	if (map_at_zero)
		sv_minuser = sv->sv_minuser;
	else
		sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
	    vm_map_max(map) == sv->sv_maxuser &&
	    cpu_exec_vmspace_reuse(p, map)) {
		shmexit(vmspace);
		pmap_remove_pages(vmspace_pmap(vmspace));
		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
		/*
		 * An exec terminates mlockall(MCL_FUTURE); the ASLR state
		 * must be re-evaluated.
		 */
		vm_map_lock(map);
		vm_map_modflags(map, 0, MAP_WIREFUTURE | MAP_ASLR |
		    MAP_ASLR_IGNSTART);
		vm_map_unlock(map);
	} else {
		error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
		if (error)
			return (error);
		vmspace = p->p_vmspace;
		map = &vmspace->vm_map;
	}
	map->flags |= imgp->map_flags;

	/* Map a shared page */
	obj = sv->sv_shared_page_obj;
	if (obj != NULL) {
		vm_object_reference(obj);
		error = vm_map_fixed(map, obj, 0,
		    sv->sv_shared_page_base, sv->sv_shared_page_len,
		    VM_PROT_READ | VM_PROT_EXECUTE,
		    VM_PROT_READ | VM_PROT_EXECUTE,
		    MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
		if (error != KERN_SUCCESS) {
			vm_object_deallocate(obj);
			return (vm_mmap_to_errno(error));
		}
	}

	/* Allocate a new stack */
	if (imgp->stack_sz != 0) {
		ssiz = trunc_page(imgp->stack_sz);
		PROC_LOCK(p);
		lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack);
		PROC_UNLOCK(p);
		if (ssiz > rlim_stack.rlim_max)
			ssiz = rlim_stack.rlim_max;
		if (ssiz > rlim_stack.rlim_cur) {
			rlim_stack.rlim_cur = ssiz;
			kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack);
		}
	} else if (sv->sv_maxssiz != NULL) {
		ssiz = *sv->sv_maxssiz;
	} else {
		ssiz = maxssiz;
	}
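	/*
	 * Place the stack just below the top of the user address space;
	 * it grows downward and is extended on demand by page faults.
	 */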
	stack_addr = sv->sv_usrstack - ssiz;
	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
	    obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
	if (error != KERN_SUCCESS)
		return (vm_mmap_to_errno(error));

	/*
	 * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they
	 * are still used to enforce the stack rlimit on the process stack.
	 */
	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
	vmspace->vm_maxsaddr = (char *)stack_addr;

	return (0);
}
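
The stack sizing above truncates the requested size to a page boundary, caps it at the hard limit, and raises the soft limit when the request exceeds it. A minimal user-space sketch of that policy, assuming a hypothetical clamp_stack_size() helper and a stand-in page size; the kernel uses trunc_page() and kern_setrlimit() instead.

/*
 * Sketch of the stack size clamp only; no limit is actually installed.
 */
#include <sys/resource.h>

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_SKETCH	4096UL	/* stand-in for the real page size */

static rlim_t
clamp_stack_size(rlim_t requested, struct rlimit *rl)
{
	rlim_t ssiz;

	/* Truncate the request to a page boundary, as trunc_page() does. */
	ssiz = requested & ~(PAGE_SIZE_SKETCH - 1);
	if (ssiz > rl->rlim_max)
		ssiz = rl->rlim_max;
	if (ssiz > rl->rlim_cur)
		rl->rlim_cur = ssiz;	/* raise the soft limit in place */
	return (ssiz);
}

int
main(void)
{
	struct rlimit rl = { .rlim_cur = 8UL << 20, .rlim_max = 64UL << 20 };
	rlim_t ssiz;

	ssiz = clamp_stack_size((32UL << 20) + 123, &rl);
	printf("stack %ju bytes, soft limit now %ju\n",
	    (uintmax_t)ssiz, (uintmax_t)rl.rlim_cur);
	return (0);
}
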