/*
 * Walk the backing-object chain and, if the bottom-most object is a
 * vnode-type object, lock the underlying vnode.  A locked vnode or
 * NULL is returned.
 */
struct vnode *
vnode_pager_lock(vm_object_t object)
{
	struct vnode *vp = NULL;
	vm_object_t lobject;
	vm_object_t tobject;
	int error;

	if (object == NULL)
		return(NULL);

	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	lobject = object;

	while (lobject->type != OBJT_VNODE) {
		if (lobject->flags & OBJ_DEAD)
			break;
		tobject = lobject->backing_object;
		if (tobject == NULL)
			break;
		vm_object_hold_shared(tobject);
		if (tobject == lobject->backing_object) {
			if (lobject != object) {
				vm_object_lock_swap();
				vm_object_drop(lobject);
			}
			lobject = tobject;
		} else {
			vm_object_drop(tobject);
		}
	}
	while (lobject->type == OBJT_VNODE &&
	       (lobject->flags & OBJ_DEAD) == 0) {
		/*
		 * Extract the vp
		 */
		vp = lobject->handle;
		error = vget(vp, LK_SHARED | LK_RETRY | LK_CANRECURSE);
		if (error == 0) {
			if (lobject->handle == vp)
				break;
			vput(vp);
		} else {
			kprintf("vnode_pager_lock: vp %p error %d "
				"lockstatus %d, retrying\n",
				vp, error,
				lockstatus(&vp->v_lock, curthread));
			tsleep(object->handle, 0, "vnpgrl", hz);
		}
		vp = NULL;
	}
	if (lobject != object)
		vm_object_drop(lobject);
	return (vp);
}
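/*
 * Minimal caller sketch (hypothetical helper, not part of the original
 * source): vnode_pager_lock() asserts that the object's token is held,
 * so the caller obtains it via vm_object_hold(), and a non-NULL return
 * is a vnode locked LK_SHARED with a reference from vget(), released
 * with vput().
 */
static void
example_with_locked_vnode(vm_object_t object)
{
	struct vnode *vp;

	vm_object_hold(object);		/* satisfies the token assertion */
	vp = vnode_pager_lock(object);
	if (vp) {
		/* ... operate on the locked vnode ... */
		vput(vp);		/* drops the shared lock and ref */
	}
	vm_object_drop(object);
}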
Example #2
/*
 * Add a ref to a vnode's existing VM object, return the object or
 * NULL if the vnode did not have one.  This does not create the
 * object (we can't since we don't know what the proper blocksize/boff
 * is to match the VFS's use of the buffer cache).
 */
vm_object_t
vnode_pager_reference(struct vnode *vp)
{
	vm_object_t object;

	/*
	 * Prevent race condition when allocating the object. This
	 * can happen with NFS vnodes since the nfsnode isn't locked.
	 *
	 * Serialize potential vnode/object teardowns and interlocks
	 */
	lwkt_gettoken(&vp->v_token);
	while (vp->v_flag & VOLOCK) {
		vsetflags(vp, VOWANT);
		tsleep(vp, 0, "vnpobj", 0);
	}
	vsetflags(vp, VOLOCK);
	lwkt_reltoken(&vp->v_token);

	/*
	 * Prevent race conditions against deallocation of the VM
	 * object.
	 */
	while ((object = vp->v_object) != NULL) {
		vm_object_hold(object);
		if ((object->flags & OBJ_DEAD) == 0)
			break;
		vm_object_dead_sleep(object, "vadead");
		vm_object_drop(object);
	}

	/*
	 * The object is expected to exist, the caller will handle
	 * NULL returns if it does not.
	 */
	if (object) {
		object->ref_count++;
		vref(vp);
	}

	lwkt_gettoken(&vp->v_token);
	vclrflags(vp, VOLOCK);
	if (vp->v_flag & VOWANT) {
		vclrflags(vp, VOWANT);
		wakeup(vp);
	}
	lwkt_reltoken(&vp->v_token);
	if (object)
		vm_object_drop(object);

	return (object);
}
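/*
 * Usage sketch (hypothetical caller): on success vnode_pager_reference()
 * has bumped both object->ref_count and the vnode's reference via vref(),
 * so the caller undoes the pair with vm_object_deallocate() and vrele()
 * when finished.
 */
static void
example_use_vnode_object(struct vnode *vp)
{
	vm_object_t object;

	object = vnode_pager_reference(vp);
	if (object == NULL)
		return;			/* vnode has no VM object */
	/* ... use the object ... */
	vm_object_deallocate(object);	/* drops the ref_count taken above */
	vrele(vp);			/* drops the vref() taken above */
}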
static int
old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset,
		    int prot, vm_page_t *mres)
{
	vm_paddr_t paddr;
	vm_page_t page;
	vm_offset_t pidx = OFF_TO_IDX(offset);
	cdev_t dev;

	page = *mres;
	dev = object->handle;

	paddr = pmap_phys_address(
		    dev_dmmap(dev, offset, prot, NULL));
	KASSERT(paddr != -1, ("dev_pager_getpage: map function returns error"));
	KKASSERT(object->type == OBJT_DEVICE);

	if (page->flags & PG_FICTITIOUS) {
		/*
		 * If the passed in reqpage page is already a fake page,
		 * update it with the new physical address.
		 */
		page->phys_addr = paddr;
		page->valid = VM_PAGE_BITS_ALL;
	} else {
		/*
		 * Replace the passed in reqpage page with our own fake page
		 * and free up all the original pages.
		 */
		page = dev_pager_getfake(paddr, object->memattr);
		TAILQ_INSERT_TAIL(&object->un_pager.devp.devp_pglist,
				  page, pageq);
		vm_object_hold(object);
		vm_page_free(*mres);
		if (vm_page_insert(page, object, pidx) == FALSE) {
			panic("dev_pager_getpage: page (%p,%016jx) exists",
			      object, (uintmax_t)pidx);
		}
		vm_object_drop(object);
	}

	return (VM_PAGER_OK);
}
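/*
 * Driver-side sketch for the contract consumed above (hypothetical
 * framebuffer, classic 4.x-style d_mmap semantics in which the driver
 * returns a physical page number, or -1 to reject the mapping, which
 * the pager then asserts against).  fb_paddr and fb_size are
 * illustrative placeholders, not from the original source.
 */
static vm_paddr_t fb_paddr;		/* assumed set at attach time */
static vm_offset_t fb_size;

static int
examplefb_mmap(cdev_t dev, vm_offset_t offset, int nprot)
{
	if (offset >= fb_size)
		return (-1);		/* rejected; pager asserts on this */
	return ((fb_paddr + offset) >> PAGE_SHIFT);
}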
Example #4
static int
link_elf_obj_load_file(const char *filename, linker_file_t * result)
{
	struct nlookupdata nd;
	struct thread  *td = curthread;	/* XXX */
	struct proc    *p = td->td_proc;
	char           *pathname;
	struct vnode   *vp;
	Elf_Ehdr       *hdr;
	Elf_Shdr       *shdr;
	Elf_Sym        *es;
	int		nbytes, i, j;
	vm_offset_t	mapbase;
	size_t		mapsize;
	int		error = 0;
	int		resid;
	elf_file_t	ef;
	linker_file_t	lf;
	int		symtabindex;
	int		symstrindex;
	int		shstrindex;
	int		nsym;
	int		pb, rl, ra;
	int		alignmask;

	/* XXX Hack for firmware loading where p == NULL */
	if (p == NULL) {
		p = &proc0;
	}

	KKASSERT(p != NULL);
	if (p->p_ucred == NULL) {
		kprintf("link_elf_obj_load_file: cannot load '%s' from filesystem"
			" this early\n", filename);
		return ENOENT;
	}
	shdr = NULL;
	lf = NULL;
	mapsize = 0;
	hdr = NULL;
	pathname = linker_search_path(filename);
	if (pathname == NULL)
		return ENOENT;

	error = nlookup_init(&nd, pathname, UIO_SYSSPACE, NLC_FOLLOW | NLC_LOCKVP);
	if (error == 0)
		error = vn_open(&nd, NULL, FREAD, 0);
	kfree(pathname, M_LINKER);
	if (error) {
		nlookup_done(&nd);
		return error;
	}
	vp = nd.nl_open_vp;
	nd.nl_open_vp = NULL;
	nlookup_done(&nd);

	/*
	 * Read the elf header from the file.
	 */
	hdr = kmalloc(sizeof(*hdr), M_LINKER, M_WAITOK);
	error = vn_rdwr(UIO_READ, vp, (void *)hdr, sizeof(*hdr), 0,
			UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
	if (error)
		goto out;
	if (resid != 0) {
		error = ENOEXEC;
		goto out;
	}
	if (!IS_ELF(*hdr)) {
		error = ENOEXEC;
		goto out;
	}

	if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS
	    || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
		link_elf_obj_error(filename, "Unsupported file layout");
		error = ENOEXEC;
		goto out;
	}
	if (hdr->e_ident[EI_VERSION] != EV_CURRENT
	    || hdr->e_version != EV_CURRENT) {
		link_elf_obj_error(filename, "Unsupported file version");
		error = ENOEXEC;
		goto out;
	}
	if (hdr->e_type != ET_REL) {
		error = ENOSYS;
		goto out;
	}
	if (hdr->e_machine != ELF_TARG_MACH) {
		link_elf_obj_error(filename, "Unsupported machine");
		error = ENOEXEC;
		goto out;
	}

	ef = kmalloc(sizeof(struct elf_file), M_LINKER, M_WAITOK | M_ZERO);
	lf = linker_make_file(filename, ef, &link_elf_obj_file_ops);
	if (lf == NULL) {
		kfree(ef, M_LINKER);
		error = ENOMEM;
		goto out;
	}
	ef->nprogtab = 0;
	ef->e_shdr = NULL;
	ef->nreltab = 0;
	ef->nrelatab = 0;

	/* Allocate and read in the section header */
	nbytes = hdr->e_shnum * hdr->e_shentsize;
	if (nbytes == 0 || hdr->e_shoff == 0 ||
	    hdr->e_shentsize != sizeof(Elf_Shdr)) {
		error = ENOEXEC;
		goto out;
	}
	shdr = kmalloc(nbytes, M_LINKER, M_WAITOK);
	ef->e_shdr = shdr;
	error = vn_rdwr(UIO_READ, vp, (caddr_t) shdr, nbytes, hdr->e_shoff,
			UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
	if (error)
		goto out;
	if (resid) {
		error = ENOEXEC;
		goto out;
	}
	/* Scan the section header for information and table sizing. */
	nsym = 0;
	symtabindex = -1;
	symstrindex = -1;
	for (i = 0; i < hdr->e_shnum; i++) {
		if (shdr[i].sh_size == 0)
			continue;
		switch (shdr[i].sh_type) {
		case SHT_PROGBITS:
		case SHT_NOBITS:
			ef->nprogtab++;
			break;
		case SHT_SYMTAB:
			nsym++;
			symtabindex = i;
			symstrindex = shdr[i].sh_link;
			break;
		case SHT_REL:
			ef->nreltab++;
			break;
		case SHT_RELA:
			ef->nrelatab++;
			break;
		case SHT_STRTAB:
			break;
		}
	}
	if (ef->nprogtab == 0) {
		link_elf_obj_error(filename, "file has no contents");
		error = ENOEXEC;
		goto out;
	}
	if (nsym != 1) {
		/* Only allow one symbol table for now */
		link_elf_obj_error(filename, "file has no valid symbol table");
		error = ENOEXEC;
		goto out;
	}
	if (symstrindex < 0 || symstrindex >= hdr->e_shnum ||
	    shdr[symstrindex].sh_type != SHT_STRTAB) {
		link_elf_obj_error(filename, "file has invalid symbol strings");
		error = ENOEXEC;
		goto out;
	}
	/* Allocate space for tracking the load chunks */
	if (ef->nprogtab != 0)
		ef->progtab = kmalloc(ef->nprogtab * sizeof(*ef->progtab),
				      M_LINKER, M_WAITOK | M_ZERO);
	if (ef->nreltab != 0)
		ef->reltab = kmalloc(ef->nreltab * sizeof(*ef->reltab),
				     M_LINKER, M_WAITOK | M_ZERO);
	if (ef->nrelatab != 0)
		ef->relatab = kmalloc(ef->nrelatab * sizeof(*ef->relatab),
				      M_LINKER, M_WAITOK | M_ZERO);
	if ((ef->nprogtab != 0 && ef->progtab == NULL) ||
	    (ef->nreltab != 0 && ef->reltab == NULL) ||
	    (ef->nrelatab != 0 && ef->relatab == NULL)) {
		error = ENOMEM;
		goto out;
	}
	if (symtabindex == -1)
		panic("lost symbol table index");
	/* Allocate space for and load the symbol table */
	ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym);
	ef->ddbsymtab = kmalloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK);
	error = vn_rdwr(UIO_READ, vp, (void *)ef->ddbsymtab,
			shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset,
			UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
	if (error)
		goto out;
	if (resid != 0) {
		error = EINVAL;
		goto out;
	}
	if (symstrindex == -1)
		panic("lost symbol string index");
	/* Allocate space for and load the symbol strings */
	ef->ddbstrcnt = shdr[symstrindex].sh_size;
	ef->ddbstrtab = kmalloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK);
	error = vn_rdwr(UIO_READ, vp, ef->ddbstrtab,
			shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset,
			UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
	if (error)
		goto out;
	if (resid != 0) {
		error = EINVAL;
		goto out;
	}
	/* Do we have a string table for the section names?  */
	shstrindex = -1;
	if (hdr->e_shstrndx != 0 &&
	    shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) {
		shstrindex = hdr->e_shstrndx;
		ef->shstrcnt = shdr[shstrindex].sh_size;
		ef->shstrtab = kmalloc(shdr[shstrindex].sh_size, M_LINKER,
				       M_WAITOK);
		error = vn_rdwr(UIO_READ, vp, ef->shstrtab,
				shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset,
				UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
		if (error)
			goto out;
		if (resid != 0) {
			error = EINVAL;
			goto out;
		}
	}
	/* Size up code/data(progbits) and bss(nobits). */
	alignmask = 0;
	for (i = 0; i < hdr->e_shnum; i++) {
		if (shdr[i].sh_size == 0)
			continue;
		switch (shdr[i].sh_type) {
		case SHT_PROGBITS:
		case SHT_NOBITS:
			alignmask = shdr[i].sh_addralign - 1;
			mapsize += alignmask;
			mapsize &= ~alignmask;
			mapsize += shdr[i].sh_size;
			break;
		}
	}
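	/*
	 * Worked example of the align-up arithmetic above: with
	 * mapsize = 0x1234 and sh_addralign = 16 (alignmask = 0xf),
	 * (0x1234 + 0xf) & ~0xf == 0x1240, after which sh_size is added.
	 * The same idiom is written elsewhere as
	 * roundup2(mapsize, sh_addralign).
	 */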

	/*
	 * We know how much space we need for the text/data/bss/etc. This
	 * stuff needs to be in a single chunk so that profiling etc can get
	 * the bounds and gdb can associate offsets with modules
	 */
	ef->object = vm_object_allocate(OBJT_DEFAULT,
					round_page(mapsize) >> PAGE_SHIFT);
	if (ef->object == NULL) {
		error = ENOMEM;
		goto out;
	}
	vm_object_hold(ef->object);
	vm_object_reference_locked(ef->object);
	ef->address = (caddr_t) vm_map_min(&kernel_map);
	ef->bytes = 0;

	/*
	 * In order to satisfy x86_64's architectural requirements on the
	 * location of code and data in the kernel's address space, request a
	 * mapping that is above the kernel.
	 *
	 * vkernel64's text+data is outside the managed VM space entirely.
	 */
#if defined(__x86_64__) && defined(_KERNEL_VIRTUAL)
	error = vkernel_module_memory_alloc(&mapbase, round_page(mapsize));
	vm_object_drop(ef->object);
#else
	mapbase = KERNBASE;
	error = vm_map_find(&kernel_map, ef->object, NULL,
			    0, &mapbase, round_page(mapsize),
			    PAGE_SIZE,
			    TRUE, VM_MAPTYPE_NORMAL,
			    VM_PROT_ALL, VM_PROT_ALL, FALSE);
	vm_object_drop(ef->object);
	if (error) {
		vm_object_deallocate(ef->object);
		ef->object = NULL;
		goto out;
	}
	/* Wire the pages */
	error = vm_map_wire(&kernel_map, mapbase,
			    mapbase + round_page(mapsize), 0);
#endif
	if (error != KERN_SUCCESS) {
		error = ENOMEM;
		goto out;
	}
	/* Inform the kld system about the situation */
	lf->address = ef->address = (caddr_t) mapbase;
	lf->size = round_page(mapsize);
	ef->bytes = mapsize;

	/*
	 * Now load code/data(progbits), zero bss(nobits), allocate space for
	 * and load relocs
	 */
	pb = 0;
	rl = 0;
	ra = 0;
	alignmask = 0;
	for (i = 0; i < hdr->e_shnum; i++) {
		if (shdr[i].sh_size == 0)
			continue;
		switch (shdr[i].sh_type) {
		case SHT_PROGBITS:
		case SHT_NOBITS:
			alignmask = shdr[i].sh_addralign - 1;
			mapbase += alignmask;
			mapbase &= ~alignmask;
			if (ef->shstrtab && shdr[i].sh_name != 0)
				ef->progtab[pb].name =
					ef->shstrtab + shdr[i].sh_name;
			else if (shdr[i].sh_type == SHT_PROGBITS)
				ef->progtab[pb].name = "<<PROGBITS>>";
			else
				ef->progtab[pb].name = "<<NOBITS>>";
#if 0
			if (ef->progtab[pb].name != NULL &&
			    !strcmp(ef->progtab[pb].name, "set_pcpu"))
				ef->progtab[pb].addr =
					dpcpu_alloc(shdr[i].sh_size);
#ifdef VIMAGE
			else if (ef->progtab[pb].name != NULL &&
				 !strcmp(ef->progtab[pb].name, VNET_SETNAME))
				ef->progtab[pb].addr =
					vnet_data_alloc(shdr[i].sh_size);
#endif
			else
#endif
				ef->progtab[pb].addr =
					(void *)(uintptr_t) mapbase;
			if (ef->progtab[pb].addr == NULL) {
				error = ENOSPC;
				goto out;
			}
			ef->progtab[pb].size = shdr[i].sh_size;
			ef->progtab[pb].sec = i;
			if (shdr[i].sh_type == SHT_PROGBITS) {
				error = vn_rdwr(UIO_READ, vp,
						ef->progtab[pb].addr,
						shdr[i].sh_size, shdr[i].sh_offset,
						UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred,
						&resid);
				if (error)
					goto out;
				if (resid != 0) {
					error = EINVAL;
					goto out;
				}
#if 0
				/* Initialize the per-cpu or vnet area. */
				if (ef->progtab[pb].addr != (void *)mapbase &&
				    !strcmp(ef->progtab[pb].name, "set_pcpu"))
					dpcpu_copy(ef->progtab[pb].addr,
						   shdr[i].sh_size);
#ifdef VIMAGE
				else if (ef->progtab[pb].addr !=
					 (void *)mapbase &&
					 !strcmp(ef->progtab[pb].name, VNET_SETNAME))
					vnet_data_copy(ef->progtab[pb].addr,
						       shdr[i].sh_size);
#endif
#endif
			} else
				bzero(ef->progtab[pb].addr, shdr[i].sh_size);

			/* Update all symbol values with the offset. */
			for (j = 0; j < ef->ddbsymcnt; j++) {
				es = &ef->ddbsymtab[j];
				if (es->st_shndx != i)
					continue;
				es->st_value += (Elf_Addr) ef->progtab[pb].addr;
			}
			mapbase += shdr[i].sh_size;
			pb++;
			break;
		case SHT_REL:
			ef->reltab[rl].rel = kmalloc(shdr[i].sh_size, M_LINKER, M_WAITOK);
			ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel);
			ef->reltab[rl].sec = shdr[i].sh_info;
			error = vn_rdwr(UIO_READ, vp,
					(void *)ef->reltab[rl].rel,
					shdr[i].sh_size, shdr[i].sh_offset,
					UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
			if (error)
				goto out;
			if (resid != 0) {
				error = EINVAL;
				goto out;
			}
			rl++;
			break;
		case SHT_RELA:
			ef->relatab[ra].rela = kmalloc(shdr[i].sh_size, M_LINKER, M_WAITOK);
			ef->relatab[ra].nrela = shdr[i].sh_size / sizeof(Elf_Rela);
			ef->relatab[ra].sec = shdr[i].sh_info;
			error = vn_rdwr(UIO_READ, vp,
					(void *)ef->relatab[ra].rela,
					shdr[i].sh_size, shdr[i].sh_offset,
					UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
			if (error)
				goto out;
			if (resid != 0) {
				error = EINVAL;
				goto out;
			}
			ra++;
			break;
		}
	}
	if (pb != ef->nprogtab)
		panic("lost progbits");
	if (rl != ef->nreltab)
		panic("lost reltab");
	if (ra != ef->nrelatab)
		panic("lost relatab");
	if (mapbase != (vm_offset_t) ef->address + mapsize)
		panic("mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)",
		      mapbase, ef->address, mapsize,
		      (vm_offset_t) ef->address + mapsize);

	/* Local intra-module relocations */
	link_elf_obj_reloc_local(lf);

	/* Pull in dependencies */
	error = linker_load_dependencies(lf);
	if (error)
		goto out;

	/* External relocations */
	error = relocate_file(lf);
	if (error)
		goto out;

	*result = lf;

out:
	if (error && lf)
		linker_file_unload(lf /*, LINKER_UNLOAD_FORCE */);
	if (hdr)
		kfree(hdr, M_LINKER);
	vn_unlock(vp);
	vn_close(vp, FREAD, NULL);

	return error;
}
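/*
 * Userland sketch of the section-header classification the loader
 * performs above (64-bit ELF class assumed for illustration): count
 * PROGBITS/NOBITS/REL/RELA sections and locate the symbol table whose
 * sh_link names its string table.
 */
#include <elf.h>
#include <stdio.h>

static void
classify_sections(const Elf64_Ehdr *hdr, const Elf64_Shdr *shdr)
{
	int i, nprog = 0, nrel = 0, nrela = 0, symtab = -1;

	for (i = 0; i < hdr->e_shnum; i++) {
		if (shdr[i].sh_size == 0)
			continue;
		switch (shdr[i].sh_type) {
		case SHT_PROGBITS:
		case SHT_NOBITS:
			nprog++;
			break;
		case SHT_SYMTAB:
			symtab = i;	/* shdr[symtab].sh_link = strings */
			break;
		case SHT_REL:
			nrel++;
			break;
		case SHT_RELA:
			nrela++;
			break;
		}
	}
	printf("progbits/nobits %d rel %d rela %d symtab %d\n",
	       nprog, nrel, nrela, symtab);
}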
vm_object_t
cdev_pager_allocate(void *handle, enum obj_type tp, struct cdev_pager_ops *ops,
	vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred)
{
	cdev_t dev;
	vm_object_t object;
	u_short color;

	/*
	 * Offset should be page aligned.
	 */
	if (foff & PAGE_MASK)
		return (NULL);

	size = round_page64(size);

	if (ops->cdev_pg_ctor(handle, size, prot, foff, cred, &color) != 0)
		return (NULL);

	/*
	 * Look up pager, creating as necessary.
	 */
	mtx_lock(&dev_pager_mtx);
	object = vm_pager_object_lookup(&dev_pager_object_list, handle);
	if (object == NULL) {
		/*
		 * Allocate object and associate it with the pager.
		 */
		object = vm_object_allocate_hold(tp,
						 OFF_TO_IDX(foff + size));
		object->handle = handle;
		object->un_pager.devp.ops = ops;
		object->un_pager.devp.dev = handle;
		TAILQ_INIT(&object->un_pager.devp.devp_pglist);

		/*
		 * handle is only a device for old_dev_pager_ctor.
		 */
		if (ops->cdev_pg_ctor == old_dev_pager_ctor) {
			dev = handle;
			dev->si_object = object;
		}

		TAILQ_INSERT_TAIL(&dev_pager_object_list, object,
		    pager_object_list);

		vm_object_drop(object);
	} else {
		/*
		 * Gain a reference to the object.
		 */
		vm_object_hold(object);
		vm_object_reference_locked(object);
		if (OFF_TO_IDX(foff + size) > object->size)
			object->size = OFF_TO_IDX(foff + size);
		vm_object_drop(object);
	}
	mtx_unlock(&dev_pager_mtx);

	return (object);
}
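/*
 * Driver-side sketch (hypothetical): cdev_pg_ctor matches the call made
 * in cdev_pager_allocate() above and cdev_pg_fault matches the shape of
 * old_dev_pager_fault(); the cdev_pg_dtor member name is an assumption
 * based on the symmetric constructor.  A driver fills the ops table and
 * hands it to cdev_pager_allocate() from its mmap path.
 */
static int
exdev_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
	      vm_ooffset_t foff, struct ucred *cred, u_short *color)
{
	*color = 0;			/* no special page coloring */
	return (0);
}

static void
exdev_pg_dtor(void *handle)
{
	/* release per-mapping resources here */
}

static struct cdev_pager_ops exdev_pager_ops = {
	.cdev_pg_ctor = exdev_pg_ctor,
	.cdev_pg_dtor = exdev_pg_dtor,
	.cdev_pg_fault = old_dev_pager_fault,	/* reuse the handler above */
};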
Example #6
/*
 * mincore system call handler
 *
 * mincore_args(const void *addr, size_t len, char *vec)
 *
 * No requirements
 */
int
sys_mincore(struct mincore_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	if (end < addr)
		return (EINVAL);
	if (VM_MAX_USER_ADDRESS > 0 && end > VM_MAX_USER_ADDRESS)
		return (EINVAL);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	map = &p->p_vmspace->vm_map;
	pmap = vmspace_pmap(p->p_vmspace);

	lwkt_gettoken(&map->token);
	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry))
		entry = entry->next;

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current processes address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for(current = entry;
		(current != &map->header) && (current->start < end);
		current = current->next) {

		/*
		 * ignore submaps (for now) or null objects
		 */
		if (current->maptype != VM_MAPTYPE_NORMAL &&
		    current->maptype != VM_MAPTYPE_VPAGETABLE) {
			continue;
		}
		if (current->object.vm_object == NULL)
			continue;
		
		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 *
			 * If we have to check the VM object, only mess
			 * around with normal maps.  Do not mess around
			 * with virtual page tables (XXX).
			 */
			mincoreinfo = pmap_mincore(pmap, addr);
			if (mincoreinfo == 0 &&
			    current->maptype == VM_MAPTYPE_NORMAL) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;

				/*
				 * calculate the page index into the object
				 */
				offset = current->offset + (addr - current->start);
				pindex = OFF_TO_IDX(offset);

				/*
				 * if the page is resident, then gather 
				 * information about it.  spl protection is
				 * required to maintain the object 
				 * association.  And XXX what if the page is
				 * busy?  What's the deal with that?
				 *
				 * XXX vm_token - legacy for pmap_ts_referenced
				 *     in i386 and vkernel pmap code.
				 */
				lwkt_gettoken(&vm_token);
				vm_object_hold(current->object.vm_object);
				m = vm_page_lookup(current->object.vm_object,
						    pindex);
				if (m && m->valid) {
					mincoreinfo = MINCORE_INCORE;
					if (m->dirty ||
						pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
						pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
				}
				vm_object_drop(current->object.vm_object);
				lwkt_reltoken(&vm_token);
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make sure that
			 * the byte vector is zeroed for those skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done;
			}

			/*
			 * If the map has changed, due to the subyte, the previous
			 * output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done;
		}
	}
	
	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);

	error = 0;
done:
	lwkt_reltoken(&map->token);
	return (error);
}
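/*
 * Userland view of the handler above (standard BSD mincore(2), runnable
 * sketch): map a few anonymous pages, touch one, and inspect the byte
 * vector that sys_mincore() fills via subyte().
 */
#include <sys/mman.h>
#include <stdio.h>

int
main(void)
{
	size_t len = 4 * 4096;
	char vec[4];
	char *p;
	int i;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED)
		return (1);
	p[0] = 1;			/* fault in the first page only */
	if (mincore(p, len, vec) == 0) {
		for (i = 0; i < 4; i++)
			printf("page %d: %sresident\n", i,
			       (vec[i] & MINCORE_INCORE) ? "" : "not ");
	}
	return (0);
}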
Example #7
/*
 * The map entries can *almost* be read with programs like cat.  However,
 * large maps need special programs to read.  It is not easy to implement
 * a program that can sense the required size of the buffer, and then
 * subsequently do a read with the appropriate size.  This operation cannot
 * be atomic.  The best that we can do is to allow the program to do a read
 * with an arbitrarily large buffer, and return as much as we can.  We can
 * return an error code if the buffer is too small (EFBIG), then the program
 * can try a bigger buffer.
 */
int
procfs_domap(struct proc *curp, struct lwp *lp, struct pfsnode *pfs,
	     struct uio *uio)
{
	struct proc *p = lp->lwp_proc;
	int len;
	struct vnode *vp;
	char *fullpath, *freepath;
	int error;
	vm_map_t map = &p->p_vmspace->vm_map;
	pmap_t pmap = vmspace_pmap(p->p_vmspace);
	vm_map_entry_t entry;
	char mebuffer[MEBUFFERSIZE];

	if (uio->uio_rw != UIO_READ)
		return (EOPNOTSUPP);

	if (uio->uio_offset != 0)
		return (0);
	
	error = 0;
	vm_map_lock_read(map);
	for (entry = map->header.next;
		((uio->uio_resid > 0) && (entry != &map->header));
		entry = entry->next) {
		vm_object_t obj, tobj, lobj;
		int ref_count, shadow_count, flags;
		vm_offset_t addr;
		vm_offset_t ostart;
		int resident, privateresident;
		char *type;

		if (entry->maptype != VM_MAPTYPE_NORMAL &&
		    entry->maptype != VM_MAPTYPE_VPAGETABLE) {
			continue;
		}

		obj = entry->object.vm_object;
		if (obj)
			vm_object_hold(obj);

		if (obj && (obj->shadow_count == 1))
			privateresident = obj->resident_page_count;
		else
			privateresident = 0;

		/*
		 * Use map->hint as a poor man's ripout detector.
		 */
		map->hint = entry;
		ostart = entry->start;

		/*
		 * Count resident pages (XXX can be horrible on 64-bit)
		 */
		resident = 0;
		addr = entry->start;
		while (addr < entry->end) {
			if (pmap_extract(pmap, addr))
				resident++;
			addr += PAGE_SIZE;
		}
		if (obj) {
			lobj = obj;
			while ((tobj = lobj->backing_object) != NULL) {
				KKASSERT(tobj != obj);
				vm_object_hold(tobj);
				if (tobj == lobj->backing_object) {
					if (lobj != obj) {
						vm_object_lock_swap();
						vm_object_drop(lobj);
					}
					lobj = tobj;
				} else {
					vm_object_drop(tobj);
				}
			}
		} else {
			lobj = NULL;
		}

		freepath = NULL;
		fullpath = "-";
		if (lobj) {
			switch(lobj->type) {
			default:
			case OBJT_DEFAULT:
				type = "default";
				vp = NULL;
				break;
			case OBJT_VNODE:
				type = "vnode";
				vp = lobj->handle;
				vref(vp);
				break;
			case OBJT_SWAP:
				type = "swap";
				vp = NULL;
				break;
			case OBJT_DEVICE:
				type = "device";
				vp = NULL;
				break;
			}
			
			flags = obj->flags;
			ref_count = obj->ref_count;
			shadow_count = obj->shadow_count;
			if (vp != NULL) {
				vn_fullpath(p, vp, &fullpath, &freepath, 1);
				vrele(vp);
			}
			if (lobj != obj)
				vm_object_drop(lobj);
		} else {
			type = "none";
			flags = 0;
			ref_count = 0;
			shadow_count = 0;
		}

		/*
		 * format:
		 *  start, end, res, priv res, cow, access, type, (fullpath).
		 */
		ksnprintf(mebuffer, sizeof(mebuffer),
#if LONG_BIT == 64
			  "0x%016lx 0x%016lx %d %d %p %s%s%s %d %d "
#else
			  "0x%08lx 0x%08lx %d %d %p %s%s%s %d %d "
#endif
			  "0x%04x %s %s %s %s\n",
			(u_long)entry->start, (u_long)entry->end,
			resident, privateresident, obj,
			(entry->protection & VM_PROT_READ)?"r":"-",
			(entry->protection & VM_PROT_WRITE)?"w":"-",
			(entry->protection & VM_PROT_EXECUTE)?"x":"-",
			ref_count, shadow_count, flags,
			(entry->eflags & MAP_ENTRY_COW)?"COW":"NCOW",
			(entry->eflags & MAP_ENTRY_NEEDS_COPY)?"NC":"NNC",
			type, fullpath);

		if (obj)
			vm_object_drop(obj);

		if (freepath != NULL) {
			kfree(freepath, M_TEMP);
			freepath = NULL;
		}

		len = strlen(mebuffer);
		if (len > uio->uio_resid) {
			error = EFBIG;
			break;
		}

		/*
		 * We cannot safely hold the map locked while accessing
		 * userspace as a VM fault might recurse the locked map.
		 */
		vm_map_unlock_read(map);
		error = uiomove(mebuffer, len, uio);
		vm_map_lock_read(map);
		if (error)
			break;

		/*
		 * We use map->hint as a poor man's ripout detector.  If
		 * it does not match the entry we set it to prior to
		 * unlocking the map the entry MIGHT now be stale.  In
		 * this case we do an expensive lookup to find our place
		 * in the iteration again.
		 */
		if (map->hint != entry) {
			vm_map_entry_t reentry;

			vm_map_lookup_entry(map, ostart, &reentry);
			entry = reentry;
		}
	}
	vm_map_unlock_read(map);

	return error;
}
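/*
 * Sample of the line format emitted above (one line per entry, wrapped
 * here; values illustrative only):
 *
 *   0x0000000000400000 0x0000000000500000 12 0 0xffff8000deadbeef r-x
 *   1 0 0x0000 COW NC vnode /bin/cat
 *
 * i.e. start, end, resident pages, private resident pages, object
 * pointer, r/w/x protection, ref and shadow counts, flags, COW/NCOW,
 * NC/NNC, object type, and the resolved path.
 */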
/*
 * Lets the VM system know about a change in size for a file.
 * We adjust our own internal size and flush any cached pages in
 * the associated object that are affected by the size change.
 *
 * NOTE: This routine may be invoked as a result of a pager put
 * operation (possibly at object termination time), so we must be careful.
 *
 * NOTE: vp->v_filesize is initialized to NOOFFSET (-1), be sure that
 * we do not blow up on the case.  nsize will always be >= 0, however.
 */
void
vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize)
{
	vm_pindex_t nobjsize;
	vm_pindex_t oobjsize;
	vm_object_t object;

	object = vp->v_object;
	if (object == NULL)
		return;
	vm_object_hold(object);
	KKASSERT(vp->v_object == object);

	/*
	 * Hasn't changed size
	 */
	if (nsize == vp->v_filesize) {
		vm_object_drop(object);
		return;
	}

	/*
	 * Has changed size.  Adjust the VM object's size and v_filesize
	 * before we start scanning pages to prevent new pages from being
	 * allocated during the scan.
	 */
	nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
	oobjsize = object->size;
	object->size = nobjsize;

	/*
	 * File has shrunk. Toss any cached pages beyond the new EOF.
	 */
	if (nsize < vp->v_filesize) {
		vp->v_filesize = nsize;
		if (nobjsize < oobjsize) {
			vm_object_page_remove(object, nobjsize, oobjsize,
					      FALSE);
		}
		/*
		 * This gets rid of garbage at the end of a page that is now
		 * only partially backed by the vnode.  Since we are setting
		 * the entire page valid & clean after we are done we have
		 * to be sure that the portion of the page within the file
		 * bounds is already valid.  If it isn't then making it
		 * valid would create a corrupt block.
		 */
		if (nsize & PAGE_MASK) {
			vm_offset_t kva;
			vm_page_t m;

			m = vm_page_lookup_busy_wait(object, OFF_TO_IDX(nsize),
						     TRUE, "vsetsz");

			if (m && m->valid) {
				int base = (int)nsize & PAGE_MASK;
				int size = PAGE_SIZE - base;
				struct lwbuf *lwb;
				struct lwbuf lwb_cache;

				/*
				 * Clear out partial-page garbage in case
				 * the page has been mapped.
				 *
				 * This is byte aligned.
				 */
				lwb = lwbuf_alloc(m, &lwb_cache);
				kva = lwbuf_kva(lwb);
				bzero((caddr_t)kva + base, size);
				lwbuf_free(lwb);

				/*
				 * XXX work around SMP data integrity race
				 * by unmapping the page from user processes.
				 * The garbage we just cleared may be mapped
				 * to a user process running on another cpu
				 * and this code is not running through normal
				 * I/O channels which handle SMP issues for
				 * us, so unmap page to synchronize all cpus.
				 *
				 * XXX should vm_pager_unmap_page() have
				 * dealt with this?
				 */
				vm_page_protect(m, VM_PROT_NONE);

				/*
				 * Clear out partial-page dirty bits.  This
				 * has the side effect of setting the valid
				 * bits, but that is ok.  There are a bunch
				 * of places in the VM system where we expected
				 * m->dirty == VM_PAGE_BITS_ALL.  The file EOF
				 * case is one of them.  If the page is still
				 * partially dirty, make it fully dirty.
				 *
				 * NOTE: We do not clear out the valid
				 * bits.  This would prevent bogus_page
				 * replacement from working properly.
				 *
				 * NOTE: We do not want to clear the dirty
				 * bit for a partial DEV_BSIZE'd truncation!
				 * This is DEV_BSIZE aligned!
				 */
				vm_page_clear_dirty_beg_nonincl(m, base, size);
				if (m->dirty != 0)
					m->dirty = VM_PAGE_BITS_ALL;
				vm_page_wakeup(m);
			} else if (m) {
				vm_page_wakeup(m);
			}
		}
	} else {
		vp->v_filesize = nsize;
	}
	vm_object_drop(object);
}
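/*
 * Typical call-site sketch (hypothetical filesystem): a VFS invokes
 * vnode_pager_setsize() from its truncate/extend path so the VM object
 * and v_filesize track the new EOF before the buffer cache is adjusted.
 */
static int
examplefs_truncate(struct vnode *vp, off_t nsize)
{
	vnode_pager_setsize(vp, nsize);
	/* ... then adjust on-disk metadata and buffer-cache buffers ... */
	return (0);
}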
/*
 * Allocate a VM object for a vnode, typically a regular file vnode.
 *
 * Some additional information is required to generate a properly sized
 * object which covers the entire buffer cache buffer straddling the file
 * EOF.  Userland does not see the extra pages as the VM fault code tests
 * against v_filesize.
 */
vm_object_t
vnode_pager_alloc(void *handle, off_t length, vm_prot_t prot, off_t offset,
		  int blksize, int boff)
{
	vm_object_t object;
	struct vnode *vp;
	off_t loffset;
	vm_pindex_t lsize;

	/*
	 * Pageout to vnode, no can do yet.
	 */
	if (handle == NULL)
		return (NULL);

	/*
	 * XXX hack - This initialization should be put somewhere else.
	 */
	if (vnode_pbuf_freecnt < 0) {
	    vnode_pbuf_freecnt = nswbuf / 2 + 1;
	}

	/*
	 * Serialize potential vnode/object teardowns and interlocks
	 */
	vp = (struct vnode *)handle;
	lwkt_gettoken(&vp->v_token);

	/*
	 * If the object is being terminated, wait for it to
	 * go away.
	 */
	object = vp->v_object;
	if (object) {
		vm_object_hold(object);
		KKASSERT((object->flags & OBJ_DEAD) == 0);
	}

	if (VREFCNT(vp) <= 0)
		panic("vnode_pager_alloc: no vnode reference");

	/*
	 * Round up to the *next* block, then destroy the buffers in question.
	 * Since we are only removing some of the buffers we must rely on the
	 * scan count to determine whether a loop is necessary.
	 *
	 * Destroy any pages beyond the last buffer.
	 */
	if (boff < 0)
		boff = (int)(length % blksize);
	if (boff)
		loffset = length + (blksize - boff);
	else
		loffset = length;
	lsize = OFF_TO_IDX(round_page64(loffset));

	if (object == NULL) {
		/*
		 * And an object of the appropriate size
		 */
		object = vm_object_allocate_hold(OBJT_VNODE, lsize);
		object->handle = handle;
		vp->v_object = object;
		vp->v_filesize = length;
		if (vp->v_mount && (vp->v_mount->mnt_kern_flag & MNTK_NOMSYNC))
			vm_object_set_flag(object, OBJ_NOMSYNC);
		vref(vp);
	} else {
		vm_object_reference_quick(object);	/* also vref's */
		if (object->size != lsize) {
			kprintf("vnode_pager_alloc: Warning, objsize "
				"mismatch %jd/%jd vp=%p obj=%p\n",
				(intmax_t)object->size,
				(intmax_t)lsize,
				vp, object);
		}
		if (vp->v_filesize != length) {
			kprintf("vnode_pager_alloc: Warning, filesize "
				"mismatch %jd/%jd vp=%p obj=%p\n",
				(intmax_t)vp->v_filesize,
				(intmax_t)length,
				vp, object);
		}
	}
	vm_object_drop(object);
	lwkt_reltoken(&vp->v_token);

	return (object);
}
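/*
 * Worked example of the EOF rounding above (illustrative numbers): with
 * length = 1000000 and blksize = 16384, boff = 1000000 % 16384 = 576,
 * so loffset = 1000000 + (16384 - 576) = 1015808, the next block
 * boundary, and lsize = 1015808 / PAGE_SIZE = 248 pages.  The object
 * thus covers the whole buffer-cache block straddling EOF, while
 * userland is still fenced off by v_filesize.
 */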
Example #10
/*
 * vm_contig_pg_kmap:
 *
 * Map previously allocated (vm_contig_pg_alloc) range of pages from
 * vm_page_array[] into the KVA.  Once mapped, the pages are part of
 * the kernel, and are to be freed with kmem_free(&kernel_map, addr, size).
 *
 * No requirements.
 */
vm_offset_t
vm_contig_pg_kmap(int start, u_long size, vm_map_t map, int flags)
{
	vm_offset_t addr, tmp_addr;
	vm_page_t pga = vm_page_array;
	int i, count;

	size = round_page(size);
	if (size == 0)
		panic("vm_contig_pg_kmap: size must not be 0");

	crit_enter();
	lwkt_gettoken(&vm_token);

	/*
	 * We've found a contiguous chunk that meets our requirements.
	 * Allocate KVM, and assign phys pages and return a kernel VM
	 * pointer.
	 */
	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	vm_map_lock(map);
	if (vm_map_findspace(map, vm_map_min(map), size, PAGE_SIZE, 0, &addr) !=
	    KERN_SUCCESS) {
		/*
		 * XXX We almost never run out of kernel virtual
		 * space, so we don't make the allocated memory
		 * above available.
		 */
		vm_map_unlock(map);
		vm_map_entry_release(count);
		lwkt_reltoken(&vm_token);
		crit_exit();
		return (0);
	}

	/*
	 * kernel_object maps 1:1 to kernel_map.
	 */
	vm_object_hold(&kernel_object);
	vm_object_reference(&kernel_object);
	vm_map_insert(map, &count, 
		      &kernel_object, addr,
		      addr, addr + size,
		      VM_MAPTYPE_NORMAL,
		      VM_PROT_ALL, VM_PROT_ALL,
		      0);
	vm_map_unlock(map);
	vm_map_entry_release(count);

	tmp_addr = addr;
	for (i = start; i < (start + size / PAGE_SIZE); i++) {
		vm_page_t m = &pga[i];
		vm_page_insert(m, &kernel_object, OFF_TO_IDX(tmp_addr));
		if ((flags & M_ZERO) && !(m->flags & PG_ZERO))
			pmap_zero_page(VM_PAGE_TO_PHYS(m));
		m->flags = 0;
		tmp_addr += PAGE_SIZE;
 	}
	vm_map_wire(map, addr, addr + size, 0);

	vm_object_drop(&kernel_object);

	lwkt_reltoken(&vm_token);
	crit_exit();
	return (addr);
}
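/*
 * Usage sketch following the header comment above.  The shape of the
 * vm_contig_pg_alloc() counterpart (size, low, high, alignment,
 * boundary, mflags; returning a vm_page_array index or -1) is an
 * assumption for illustration.
 */
static void *
example_contig_alloc(u_long size)
{
	int start;
	vm_offset_t va;

	start = vm_contig_pg_alloc(size, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0, 0);
	if (start < 0)
		return (NULL);
	va = vm_contig_pg_kmap(start, size, &kernel_map, M_ZERO);
	if (va == 0)
		return (NULL);
	/* later: kmem_free(&kernel_map, va, size); */
	return ((void *)va);
}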
Example #11
/*
 * vm_contig_pg_clean:
 *
 * Do a thorough cleanup of the specified 'queue', which can be either
 * PQ_ACTIVE or PQ_INACTIVE by doing a walkthrough.  If the page is not
 * marked dirty, it is shoved into the page cache, provided no one has
 * currently acquired it, otherwise localized action per object type
 * is taken for cleanup:
 *
 * 	In the OBJT_VNODE case, the whole page range is cleaned up
 * 	using the vm_object_page_clean() routine, by specifying a
 * 	start and end of '0'.
 *
 * 	Otherwise if the object is of any other type, the generic
 * 	pageout (daemon) flush routine is invoked.
 */
static void
vm_contig_pg_clean(int queue, int count)
{
    vm_object_t object;
    vm_page_t m, m_tmp;
    struct vm_page marker;
    struct vpgqueues *pq = &vm_page_queues[queue];

    /*
     * Setup a local marker
     */
    bzero(&marker, sizeof(marker));
    marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
    marker.queue = queue;
    marker.wire_count = 1;

    vm_page_queues_spin_lock(queue);
    TAILQ_INSERT_HEAD(&pq->pl, &marker, pageq);
    vm_page_queues_spin_unlock(queue);

    /*
     * Iterate the queue.  Note that the vm_page spinlock must be
     * acquired before the pageq spinlock so it's easiest to simply
     * not hold it in the loop iteration.
     */
    while (count-- > 0 && (m = TAILQ_NEXT(&marker, pageq)) != NULL) {
        vm_page_and_queue_spin_lock(m);
        if (m != TAILQ_NEXT(&marker, pageq)) {
            vm_page_and_queue_spin_unlock(m);
            ++count;
            continue;
        }
        KKASSERT(m->queue == queue);

        TAILQ_REMOVE(&pq->pl, &marker, pageq);
        TAILQ_INSERT_AFTER(&pq->pl, m, &marker, pageq);

        if (m->flags & PG_MARKER) {
            vm_page_and_queue_spin_unlock(m);
            continue;
        }
        if (vm_page_busy_try(m, TRUE)) {
            vm_page_and_queue_spin_unlock(m);
            continue;
        }
        vm_page_and_queue_spin_unlock(m);

        /*
         * We've successfully busied the page
         */
        if (m->queue - m->pc != queue) {
            vm_page_wakeup(m);
            continue;
        }
        if (m->wire_count || m->hold_count) {
            vm_page_wakeup(m);
            continue;
        }
        if ((object = m->object) == NULL) {
            vm_page_wakeup(m);
            continue;
        }
        vm_page_test_dirty(m);
        if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
            vm_object_hold(object);
            KKASSERT(m->object == object);

            if (object->type == OBJT_VNODE) {
                vm_page_wakeup(m);
                vn_lock(object->handle, LK_EXCLUSIVE|LK_RETRY);
                vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
                vn_unlock(((struct vnode *)object->handle));
            } else if (object->type == OBJT_SWAP ||
                       object->type == OBJT_DEFAULT) {
                m_tmp = m;
                vm_pageout_flush(&m_tmp, 1, 0);
            } else {
                vm_page_wakeup(m);
            }
            vm_object_drop(object);
        } else if (m->hold_count == 0) {
            vm_page_cache(m);
        } else {
            vm_page_wakeup(m);
        }
    }

    /*
     * Scrap our local marker
     */
    vm_page_queues_spin_lock(queue);
    TAILQ_REMOVE(&pq->pl, &marker, pageq);
    vm_page_queues_spin_unlock(queue);
}
Example #12
/*
 * A VFS can call this function to try to dispose of a read request
 * directly from the VM system, pretty much bypassing almost all VFS
 * overhead except for atime updates.
 *
 * If 0 is returned some or all of the uio was handled.  The caller must
 * check the uio and handle the remainder.
 *
 * The caller must fail on a non-zero error.
 */
int
vop_helper_read_shortcut(struct vop_read_args *ap)
{
	struct vnode *vp;
	struct uio *uio;
	struct lwbuf *lwb;
	struct lwbuf lwb_cache;
	vm_object_t obj;
	vm_page_t m;
	int offset;
	int n;
	int error;

	vp = ap->a_vp;
	uio = ap->a_uio;

	/*
	 * We can't short-cut if there is no VM object or this is a special
	 * UIO_NOCOPY read (typically from VOP_STRATEGY()).  We also can't
	 * do this if we cannot extract the filesize from the vnode.
	 */
	if (vm_read_shortcut_enable == 0)
		return(0);
	if (vp->v_object == NULL || uio->uio_segflg == UIO_NOCOPY)
		return(0);
	if (vp->v_filesize == NOOFFSET)
		return(0);
	if (uio->uio_resid == 0)
		return(0);

	/*
	 * Iterate the uio on a page-by-page basis
	 *
	 * XXX can we leave the object held shared during the uiomove()?
	 */
	++vm_read_shortcut_count;
	obj = vp->v_object;
	vm_object_hold_shared(obj);

	error = 0;
	while (uio->uio_resid && error == 0) {
		offset = (int)uio->uio_offset & PAGE_MASK;
		n = PAGE_SIZE - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (vp->v_filesize < uio->uio_offset)
			break;
		if (uio->uio_offset + n > vp->v_filesize)
			n = vp->v_filesize - uio->uio_offset;
		if (n == 0)
			break;	/* hit EOF */

		m = vm_page_lookup_busy_try(obj, OFF_TO_IDX(uio->uio_offset),
					    FALSE, &error);
		if (error || m == NULL) {
			++vm_read_shortcut_failed;
			error = 0;
			break;
		}
		if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
			++vm_read_shortcut_failed;
			vm_page_wakeup(m);
			break;
		}
		lwb = lwbuf_alloc(m, &lwb_cache);

		/*
		 * Use a no-fault uiomove() to avoid deadlocking against
		 * our VM object (which could livelock on the same object
		 * due to shared-vs-exclusive), or deadlocking against
		 * our busied page.  Returns EFAULT on any fault which
		 * winds up diving a vnode.
		 */
		error = uiomove_nofault((char *)lwbuf_kva(lwb) + offset,
					n, uio);

		vm_page_flag_set(m, PG_REFERENCED);
		lwbuf_free(lwb);
		vm_page_wakeup(m);
	}
	vm_object_drop(obj);

	/*
	 * Ignore EFAULT since we used uiomove_nofault(), causes caller
	 * to fall-back to normal code for this case.
	 */
	if (error == EFAULT)
		error = 0;

	return (error);
}
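/*
 * Caller sketch (hypothetical VFS read VOP), per the contract in the
 * header comment: a zero return may still leave residual uio that must
 * be satisfied through the normal buffer-cache path.
 */
static int
examplefs_read(struct vop_read_args *ap)
{
	int error;

	error = vop_helper_read_shortcut(ap);
	if (error)
		return (error);		/* caller must fail on non-zero */
	if (ap->a_uio->uio_resid == 0)
		return (0);		/* fully satisfied from the VM object */
	/* ... fall back to bread()/uiomove() for the remainder ... */
	return (0);
}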
Example #13
static int
link_elf_load_file(const char* filename, linker_file_t* result)
{
    struct nlookupdata nd;
    struct thread *td = curthread;	/* XXX */
    struct proc *p = td->td_proc;
    struct vnode *vp;
    Elf_Ehdr *hdr;
    caddr_t firstpage;
    int nbytes, i;
    Elf_Phdr *phdr;
    Elf_Phdr *phlimit;
    Elf_Phdr *segs[2];
    int nsegs;
    Elf_Phdr *phdyn;
    Elf_Phdr *phphdr;
    caddr_t mapbase;
    size_t mapsize;
    Elf_Off base_offset;
    Elf_Addr base_vaddr;
    Elf_Addr base_vlimit;
    int error = 0;
    int resid;
    elf_file_t ef;
    linker_file_t lf;
    char *pathname;
    Elf_Shdr *shdr;
    int symtabindex;
    int symstrindex;
    int symcnt;
    int strcnt;

    /* XXX Hack for firmware loading where p == NULL */
    if (p == NULL) {
	p = &proc0;
    }

    KKASSERT(p != NULL);
    if (p->p_ucred == NULL) {
	kprintf("link_elf_load_file: cannot load '%s' from filesystem"
		" this early\n", filename);
	return ENOENT;
    }
    shdr = NULL;
    lf = NULL;
    pathname = linker_search_path(filename);
    if (pathname == NULL)
	return ENOENT;

    error = nlookup_init(&nd, pathname, UIO_SYSSPACE, NLC_FOLLOW|NLC_LOCKVP);
    if (error == 0)
	error = vn_open(&nd, NULL, FREAD, 0);
    kfree(pathname, M_LINKER);
    if (error) {
	nlookup_done(&nd);
	return error;
    }
    vp = nd.nl_open_vp;
    nd.nl_open_vp = NULL;
    nlookup_done(&nd);

    /*
     * Read the elf header from the file.
     */
    firstpage = kmalloc(PAGE_SIZE, M_LINKER, M_WAITOK);
    hdr = (Elf_Ehdr *)firstpage;
    error = vn_rdwr(UIO_READ, vp, firstpage, PAGE_SIZE, 0,
		    UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
    nbytes = PAGE_SIZE - resid;
    if (error)
	goto out;

    if (!IS_ELF(*hdr)) {
	error = ENOEXEC;
	goto out;
    }

    if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS
      || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
	link_elf_error("Unsupported file layout");
	error = ENOEXEC;
	goto out;
    }
    if (hdr->e_ident[EI_VERSION] != EV_CURRENT
      || hdr->e_version != EV_CURRENT) {
	link_elf_error("Unsupported file version");
	error = ENOEXEC;
	goto out;
    }
    if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) {
	error = ENOSYS;
	goto out;
    }
    if (hdr->e_machine != ELF_TARG_MACH) {
	link_elf_error("Unsupported machine");
	error = ENOEXEC;
	goto out;
    }

    /*
     * We rely on the program header being in the first page.  This is
     * not strictly required by the ABI specification, but it seems to
     * always be true in practice.  And it simplifies things considerably.
     */
    if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) &&
	  (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) &&
	  (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes)))
	link_elf_error("Unreadable program headers");

    /*
     * Scan the program header entries, and save key information.
     *
     * We rely on there being exactly two load segments, text and data,
     * in that order.
     */
    phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff);
    phlimit = phdr + hdr->e_phnum;
    nsegs = 0;
    phdyn = NULL;
    phphdr = NULL;
    while (phdr < phlimit) {
	switch (phdr->p_type) {

	case PT_LOAD:
	    if (nsegs == 2) {
		link_elf_error("Too many sections");
		error = ENOEXEC;
		goto out;
	    }
	    segs[nsegs] = phdr;
	    ++nsegs;
	    break;

	case PT_PHDR:
	    phphdr = phdr;
	    break;

	case PT_DYNAMIC:
	    phdyn = phdr;
	    break;

	case PT_INTERP:
	    error = ENOSYS;
	    goto out;
	}

	++phdr;
    }
    if (phdyn == NULL) {
	link_elf_error("Object is not dynamically-linked");
	error = ENOEXEC;
	goto out;
    }

    /*
     * Allocate the entire address space of the object, to stake out our
     * contiguous region, and to establish the base address for relocation.
     */
    base_offset = trunc_page(segs[0]->p_offset);
    base_vaddr = trunc_page(segs[0]->p_vaddr);
    base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz);
    mapsize = base_vlimit - base_vaddr;

    ef = kmalloc(sizeof(struct elf_file), M_LINKER, M_WAITOK | M_ZERO);
#ifdef SPARSE_MAPPING
    ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT);
    if (ef->object == NULL) {
	kfree(ef, M_LINKER);
	error = ENOMEM;
	goto out;
    }
    vm_object_hold(ef->object);
    vm_object_reference_locked(ef->object);
    ef->address = (caddr_t)vm_map_min(&kernel_map);
    error = vm_map_find(&kernel_map, ef->object, 0,
			(vm_offset_t *)&ef->address,
			mapsize, PAGE_SIZE,
			1, VM_MAPTYPE_NORMAL,
			VM_PROT_ALL, VM_PROT_ALL,
			0);
    vm_object_drop(ef->object);
    if (error) {
	vm_object_deallocate(ef->object);
	kfree(ef, M_LINKER);
	goto out;
    }
#else
    ef->address = kmalloc(mapsize, M_LINKER, M_WAITOK);
#endif
    mapbase = ef->address;

    /*
     * Read the text and data sections and zero the bss.
     */
    for (i = 0; i < 2; i++) {
	caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr;
	error = vn_rdwr(UIO_READ, vp,
			segbase, segs[i]->p_filesz, segs[i]->p_offset,
			UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
	if (error) {
#ifdef SPARSE_MAPPING
	    vm_map_remove(&kernel_map, (vm_offset_t) ef->address,
			  (vm_offset_t) ef->address
			  + (ef->object->size << PAGE_SHIFT));
	    vm_object_deallocate(ef->object);
#else
	    kfree(ef->address, M_LINKER);
#endif
	    kfree(ef, M_LINKER);
	    goto out;
	}
	bzero(segbase + segs[i]->p_filesz,
	      segs[i]->p_memsz - segs[i]->p_filesz);

#ifdef SPARSE_MAPPING
	/*
	 * Wire down the pages
	 */
	vm_map_wire(&kernel_map,
		    (vm_offset_t) segbase,
		    (vm_offset_t) segbase + segs[i]->p_memsz,
		    0);
#endif
    }

    ef->dynamic = (const Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr);

    lf = linker_make_file(filename, ef, &link_elf_file_ops);
    if (lf == NULL) {
#ifdef SPARSE_MAPPING
	vm_map_remove(&kernel_map, (vm_offset_t) ef->address,
		      (vm_offset_t) ef->address
		      + (ef->object->size << PAGE_SHIFT));
	vm_object_deallocate(ef->object);
#else
	kfree(ef->address, M_LINKER);
#endif
	kfree(ef, M_LINKER);
	error = ENOMEM;
	goto out;
    }
    lf->address = ef->address;
    lf->size = mapsize;

    error = parse_dynamic(lf);
    if (error)
	goto out;
    link_elf_reloc_local(lf);
    error = linker_load_dependencies(lf);
    if (error)
	goto out;
    error = relocate_file(lf);
    if (error)
	goto out;

    /* Try and load the symbol table if it's present.  (you can strip it!) */
    nbytes = hdr->e_shnum * hdr->e_shentsize;
    if (nbytes == 0 || hdr->e_shoff == 0)
	goto nosyms;
    shdr = kmalloc(nbytes, M_LINKER, M_WAITOK | M_ZERO);
    error = vn_rdwr(UIO_READ, vp,
		    (caddr_t)shdr, nbytes, hdr->e_shoff,
		    UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
    if (error)
	goto out;
    symtabindex = -1;
    symstrindex = -1;
    for (i = 0; i < hdr->e_shnum; i++) {
	if (shdr[i].sh_type == SHT_SYMTAB) {
	    symtabindex = i;
	    symstrindex = shdr[i].sh_link;
	}
    }
    if (symtabindex < 0 || symstrindex < 0)
	goto nosyms;

    symcnt = shdr[symtabindex].sh_size;
    ef->symbase = kmalloc(symcnt, M_LINKER, M_WAITOK);
    strcnt = shdr[symstrindex].sh_size;
    ef->strbase = kmalloc(strcnt, M_LINKER, M_WAITOK);
    error = vn_rdwr(UIO_READ, vp,
		    ef->symbase, symcnt, shdr[symtabindex].sh_offset,
		    UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
    if (error)
	goto out;
    error = vn_rdwr(UIO_READ, vp,
		    ef->strbase, strcnt, shdr[symstrindex].sh_offset,
		    UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid);
    if (error)
	goto out;

    ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
    ef->ddbsymtab = (const Elf_Sym *)ef->symbase;
    ef->ddbstrcnt = strcnt;
    ef->ddbstrtab = ef->strbase;

nosyms:

    *result = lf;

out:
    if (error && lf)
	linker_file_unload(lf);
    if (shdr)
	kfree(shdr, M_LINKER);
    if (firstpage)
	kfree(firstpage, M_LINKER);
    vn_unlock(vp);
    vn_close(vp, FREAD);

    return error;
}
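/*
 * Userland sketch of the program-header scan above (64-bit ELF class
 * assumed for illustration): the loader accepts exactly two PT_LOAD
 * segments plus a PT_DYNAMIC entry, and rejects PT_INTERP.
 */
#include <elf.h>

static int
scan_phdrs(const Elf64_Ehdr *hdr, const Elf64_Phdr *phdr)
{
	int i, nload = 0, havedyn = 0;

	for (i = 0; i < hdr->e_phnum; i++) {
		switch (phdr[i].p_type) {
		case PT_LOAD:
			if (++nload > 2)
				return (-1);	/* "Too many sections" */
			break;
		case PT_DYNAMIC:
			havedyn = 1;
			break;
		case PT_INTERP:
			return (-1);		/* ENOSYS in the loader */
		}
	}
	return ((nload == 2 && havedyn) ? 0 : -1);
}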