Ejemplo n.º 1
0
/*
 * Vnode-to-component-name operation for a null-layer vnode.
 *
 * Directories are handed straight to vop_stdvptocnp().  For any other
 * vnode type the lower vnode is resolved with vn_vptocnp() (which is given
 * ap->a_buf / ap->a_buflen), and the lower directory it yields is wrapped
 * in a null-layer vnode by null_nodeget(); that vnode is returned through
 * ap->a_vpp.
 *
 * vp's lock is released while the lower layer is consulted and is taken
 * again, in the mode recorded on entry, before every return.
 *
 * Returns 0 on success, ENOENT if the lower lookup or locking the lower
 * directory fails, otherwise the error from null_nodeget().
 */
static int
null_vptocnp(struct vop_vptocnp_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct vnode **dvp = ap->a_vpp;
	struct vnode *lvp, *ldvp;
	struct ucred *cred = ap->a_cred;
	int error, locked;

	if (vp->v_type == VDIR)
		return (vop_stdvptocnp(ap));

	/* Remember how vp is locked so the same mode can be restored. */
	locked = VOP_ISLOCKED(vp);
	lvp = NULLVPTOLOWERVP(vp);
	/* Hold the lower vnode across the window in which vp is unlocked. */
	vhold(lvp);
	VOP_UNLOCK(vp, 0); /* vp is held by vn_vptocnp_locked that called us */
	ldvp = lvp;
	error = vn_vptocnp(&ldvp, cred, ap->a_buf, ap->a_buflen);
	vdrop(lvp);
	if (error != 0) {
		vn_lock(vp, locked | LK_RETRY);
		return (ENOENT);
	}

	/*
	 * Exclusive lock is required by insmntque1 call in
	 * null_nodeget()
	 */
	error = vn_lock(ldvp, LK_EXCLUSIVE);
	if (error != 0) {
		vn_lock(vp, locked | LK_RETRY);
		vdrop(ldvp);
		return (ENOENT);
	}
	/*
	 * Swap the hold on ldvp for a use reference (presumably the hold
	 * was taken by vn_vptocnp() -- confirm against its contract).
	 */
	vref(ldvp);
	vdrop(ldvp);
	error = null_nodeget(vp->v_mount, ldvp, dvp);
	if (error == 0) {
#ifdef DIAGNOSTIC
		NULLVPTOLOWERVP(*dvp);
#endif
		/* Leave *dvp held but unlocked and without the use reference. */
		vhold(*dvp);
		vput(*dvp);
	} else
		vput(ldvp);

	vn_lock(vp, locked | LK_RETRY);
	return (error);
}
Ejemplo n.º 2
0
/*
 * Callback taking a dbuf and the user pointer associated with it (a znode,
 * per the disabled branch below).
 *
 * As compiled (#if 1) this unconditionally panics, printing user_ptr: the
 * eviction callback is expected to have been cleared before the last dbuf
 * reference is dropped, so reaching this function is treated as a bug.
 *
 * The #else branch is disabled code that would instead detach the dbuf
 * from the znode and either free the znode outright (no vnode), recycle an
 * unreferenced vnode and then free the znode, or do nothing further when
 * the vnode is still in use.
 */
/*ARGSUSED*/
static void
znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
{
#if 1	/* XXXPJD: From OpenSolaris. */
	/*
	 * We should never drop all dbuf refs without first clearing
	 * the eviction callback.
	 */
	panic("evicting znode %p\n", user_ptr);
#else	/* XXXPJD */
	znode_t *zp = user_ptr;
	vnode_t *vp;

	mutex_enter(&zp->z_lock);
	zp->z_dbuf = NULL;
	vp = ZTOV(zp);
	if (vp == NULL) {
		/* No vnode attached: the znode can be freed directly. */
		mutex_exit(&zp->z_lock);
		zfs_znode_free(zp);
	} else if (vp->v_count == 0) {
		/*
		 * Vnode exists but has no users: detach it, hold it across
		 * the lock/recycle sequence, then free the znode.
		 */
		zp->z_vnode = NULL;
		vhold(vp);
		mutex_exit(&zp->z_lock);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
		vrecycle(vp, curthread);
		VOP_UNLOCK(vp, 0, curthread);
		vdrop(vp);
		zfs_znode_free(zp);
	} else {
		/* Vnode still in use: only the dbuf pointer was cleared. */
		mutex_exit(&zp->z_lock);
	}
#endif
}
Ejemplo n.º 3
0
/*
 * Evict vnodes from the pseudofs vnode cache.
 *
 * An entry is evicted when 'force' is set, when the entry is marked
 * pvd_dead, or when 'pn' is non-NULL and the entry belongs to that node.
 *
 * The caller must own pfs_vncache_mutex.  vgone() may sleep and also
 * modifies the cache indirectly, so the mutex is released around each
 * vgone() call and nothing may be assumed about the list afterwards: the
 * walk restarts from the head after every eviction and only terminates
 * once a complete pass finds nothing left to evict.  This is inherently
 * inefficient; doing better would require a different cache data
 * structure.
 */
static void
pfs_purge_locked(struct pfs_node *pn, bool force)
{
	struct pfs_vdata *cur;
	struct vnode *victim;

	mtx_assert(&pfs_vncache_mutex, MA_OWNED);
	for (cur = pfs_vncache; cur != NULL; ) {
		if (!force && !cur->pvd_dead &&
		    (pn == NULL || cur->pvd_pn != pn)) {
			/* Entry stays in the cache; move on. */
			cur = cur->pvd_next;
			continue;
		}

		/* Hold the vnode so it survives while the mutex is dropped. */
		victim = cur->pvd_vnode;
		vhold(victim);
		mtx_unlock(&pfs_vncache_mutex);
		VOP_LOCK(victim, LK_EXCLUSIVE);
		vgone(victim);
		VOP_UNLOCK(victim, 0);
		mtx_lock(&pfs_vncache_mutex);
		vdrop(victim);

		/* The list may have changed arbitrarily: start over. */
		cur = pfs_vncache;
	}
}
Ejemplo n.º 4
0
/*
 * Open handler for /dev/tty.
 *
 * Multiple opens of /dev/tty result in only one open of the underlying
 * tty vnode, so the mode passed down is always FREAD|FWRITE and the
 * VCTTYISOPEN vnode flag records that the open has been done.
 *
 * Returns ENXIO if the process has no controlling tty vnode, 0 if the tty
 * is already marked open, otherwise the result of VOP_OPEN().
 */
static	int
cttyopen(struct dev_open_args *ap)
{
	struct proc *p = curproc;
	struct vnode *vp;
	int rc;

	KKASSERT(p);
	for (;;) {
		vp = cttyvp(p);
		if (vp == NULL)
			return (ENXIO);
		if (vp->v_flag & VCTTYISOPEN)
			return (0);

		/*
		 * Hold the vnode so it cannot go away while we block on its
		 * lock, then re-validate once the lock is ours: the
		 * controlling tty may have changed, or somebody else may
		 * have completed the open in the meantime.
		 */
		vhold(vp);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		if (vp == cttyvp(p) && (vp->v_flag & VCTTYISOPEN) == 0)
			break;

		kprintf("Warning: cttyopen: race avoided\n");
		vn_unlock(vp);
		vdrop(vp);
	}

	/*
	 * Mark the tty open before issuing the open and back the mark out
	 * again on failure.
	 * NOTE(review): if VOP_OPEN() can release the vnode lock, another
	 * opener could observe the flag before the open has completed --
	 * confirm.
	 */
	vsetflags(vp, VCTTYISOPEN);
	rc = VOP_OPEN(vp, FREAD|FWRITE, ap->a_cred, NULL);
	if (rc != 0)
		vclrflags(vp, VCTTYISOPEN);
	vn_unlock(vp);
	vdrop(vp);
	return (rc);
}
Ejemplo n.º 5
0
/*
 * Read the target of a pseudofs symbolic link.
 *
 * The node's fill routine renders the link text into an on-stack sbuf,
 * which is then copied out to the caller's uio.  For per-process nodes
 * the target process is looked up and held for the duration of the fill.
 * The vnode is held and unlocked around the fill call and re-locked in
 * its previous mode afterwards.
 *
 * Returns EINVAL if the vnode is not a VLNK, EIO if the node has no fill
 * routine or the target process is missing or exiting, otherwise the
 * error from the fill routine or from uiomove_frombuf().
 */
static int
pfs_readlink(struct vop_readlink_args *va)
{
	struct vnode *vn = va->a_vp;
	struct pfs_vdata *pvd = vn->v_data;
	struct pfs_node *pn = pvd->pvd_pn;
	struct uio *uio = va->a_uio;
	struct thread *td = curthread;
	struct proc *proc = NULL;
	struct sbuf sb;
	char buf[PATH_MAX];
	int error;
	int locked;

	PFS_TRACE(("%s", pn->pn_name));
	pfs_assert_not_owned(pn);

	if (vn->v_type != VLNK)
		PFS_RETURN (EINVAL);
	KASSERT_PN_IS_LINK(pn);

	if (pn->pn_fill == NULL)
		PFS_RETURN (EIO);

	if (pvd->pvd_pid != NO_PID) {
		/* Per-process node: pin the process unless it is exiting. */
		proc = pfind(pvd->pvd_pid);
		if (proc == NULL)
			PFS_RETURN (EIO);
		if (proc->p_flag & P_WEXIT) {
			PROC_UNLOCK(proc);
			PFS_RETURN (EIO);
		}
		_PHOLD(proc);
		PROC_UNLOCK(proc);
	}

	/* Keep the vnode around while it is unlocked for the fill call. */
	vhold(vn);
	locked = VOP_ISLOCKED(vn, td);
	VOP_UNLOCK(vn, 0, td);

	/* sbuf_new() can't fail with a static buffer */
	sbuf_new(&sb, buf, sizeof(buf), 0);

	error = pn_fill(td, proc, pn, &sb, NULL);

	if (proc != NULL)
		PRELE(proc);
	vn_lock(vn, locked | LK_RETRY, td);
	vdrop(vn);

	if (error == 0) {
		sbuf_finish(&sb);
		error = uiomove_frombuf(sbuf_data(&sb), sbuf_len(&sb), uio);
	}
	sbuf_delete(&sb);
	PFS_RETURN (error);
}
Ejemplo n.º 6
0
/**
 * ntfs_usnjrnl_stamp - stamp the transaction log ($UsnJrnl) on an ntfs volume
 * @vol:	ntfs volume on which to stamp the transaction log
 *
 * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return 0
 * on success and errno on error.
 *
 * Stamping rewrites the USN header mapped from $UsnJrnl/$DATA/$Max: the
 * lowest valid usn is set to the current size of the journal data
 * attribute and the journal id is set to the current time.  If the volume
 * is already flagged as stamped nothing is done.
 *
 * The $Max vnode is held and the $Max inode lock is taken shared for the
 * duration of the update; both are released on every return path.
 *
 * This function assumes that the transaction log has already been loaded and
 * consistency checked by a call to ntfs_vfsops.c::ntfs_usnjrnl_load().
 */
errno_t ntfs_usnjrnl_stamp(ntfs_volume *vol)
{
	ntfs_debug("Entering.");
	if (!NVolUsnJrnlStamped(vol)) {
		/*
		 * NOTE(review): j_size is declared sle64 but is handled as a
		 * CPU-order value below (it is passed to cpu_to_sle64()) --
		 * confirm the intended type.
		 */
		sle64 j_size, stamp;
		upl_t upl;
		upl_page_info_array_t pl;
		USN_HEADER *uh;
		ntfs_inode *max_ni;
		errno_t err;

		/* Sample the journal data size under its size lock. */
		mtx_lock_spin(&vol->usnjrnl_j_ni->size_lock);
		j_size = vol->usnjrnl_j_ni->data_size;
		mtx_unlock_spin(&vol->usnjrnl_j_ni->size_lock);
		max_ni = vol->usnjrnl_max_ni;
		/*
		 * Hold the $Max vnode across the update.  vhold() reports no
		 * error, so unlike the vnode_get() call it replaced there is
		 * no failure path to handle here.
		 */
		vhold(max_ni->vn);
		sx_slock(&max_ni->lock);
		err = ntfs_page_map(max_ni, 0, &upl, &pl, (u8**)&uh, TRUE);
		if (err) {
			ntfs_error(vol->mp, "Failed to read from "
					"$UsnJrnl/$DATA/$Max attribute.");
			/* Undo the shared lock and the hold before bailing. */
			sx_sunlock(&max_ni->lock);
			vdrop(max_ni->vn);
			return err;
		}
		stamp = ntfs_current_time();
		ntfs_debug("Stamping transaction log ($UsnJrnl): old "
				"journal_id 0x%llx, old lowest_valid_usn "
				"0x%llx, new journal_id 0x%llx, new "
				"lowest_valid_usn 0x%llx.",
				(unsigned long long)
				sle64_to_cpu(uh->journal_id),
				(unsigned long long)
				sle64_to_cpu(uh->lowest_valid_usn),
				(unsigned long long)sle64_to_cpu(stamp),
				(unsigned long long)j_size);
		uh->lowest_valid_usn = cpu_to_sle64(j_size);
		uh->journal_id = stamp;
		ntfs_page_unmap(max_ni, upl, pl, TRUE);
		sx_sunlock(&max_ni->lock);
		vdrop(max_ni->vn);
		/* Set the flag so we do not have to do it again on remount. */
		NVolSetUsnJrnlStamped(vol);
		// TODO: Should we mark any times on the base inode $UsnJrnl
		// for update here?
	}
	ntfs_debug("Done.");
	return 0;
}
Ejemplo n.º 7
0
/*
 * Open handler for /dev/tty.
 *
 * Multiple opens of /dev/tty result in only one open of the underlying
 * tty vnode, so the mode passed down is always FREAD|FWRITE and the
 * VCTTYISOPEN vnode flag records that the open has been done.
 *
 * Returns ENXIO if the process has no controlling tty vnode, 0 if the tty
 * is already marked open, otherwise the result of VOP_OPEN().
 */
static	int
cttyopen(struct dev_open_args *ap)
{
	struct proc *p = curproc;
	struct vnode *vp;
	int rc;

	KKASSERT(p);
	for (;;) {
		vp = cttyvp(p);
		if (vp == NULL)
			return (ENXIO);
		if (vp->v_flag & VCTTYISOPEN)
			return (0);

		/*
		 * Messy interlock: hold the vnode so it cannot go away
		 * while we block on its lock, then re-check for a race
		 * once the lock is ours.
		 */
		vhold(vp);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		if (vp != cttyvp(p) || (vp->v_flag & VCTTYISOPEN)) {
			kprintf("Warning: cttyopen: race-1 avoided\n");
			vn_unlock(vp);
			vdrop(vp);
			continue;
		}

		/*
		 * WARNING! The device open (devfs_spec_open()) temporarily
		 *	    releases the vnode lock on the tty vnode when
		 *	    issuing the dev_dopen(), so the VCTTYISOPEN
		 *	    flag can race during the VOP_OPEN().
		 */
		rc = VOP_OPEN(vp, FREAD|FWRITE, ap->a_cred, NULL);

		/*
		 * Race against ctty close or change.  This case has been
		 * validated and occurs every so often during synth builds.
		 * Undo our potentially extra open and start over.
		 */
		if (vp != cttyvp(p) || (vp->v_flag & VCTTYISOPEN)) {
			if (rc == 0)
				VOP_CLOSE(vp, FREAD|FWRITE, NULL);
			vn_unlock(vp);
			vdrop(vp);
			continue;
		}

		/* No race: record a successful open and finish. */
		if (rc == 0)
			vsetflags(vp, VCTTYISOPEN);
		vn_unlock(vp);
		vdrop(vp);
		return(rc);
	}
}
static int
udf_write_logvol_dscr_seq(struct udf_strat_args *args)
{
	union dscrptr    *dscr     = args->dscr;
	struct udf_mount *ump      = args->ump;
	struct udf_node  *udf_node = args->udf_node;
	struct long_ad   *icb      = args->icb;
	int               waitfor  = args->waitfor;
	uint32_t logsectornr, sectornr, dummy;
	int error, vpart;

	/*
	 * we have to decide if we write it out sequential or at its fixed 
	 * position by examining the partition its (to be) written on.
	 */
	vpart       = udf_rw16(udf_node->loc.loc.part_num);
	logsectornr = udf_rw32(icb->loc.lb_num);
	sectornr    = 0;
	if (ump->vtop_tp[vpart] != UDF_VTOP_TYPE_VIRT) {
		error = udf_translate_vtop(ump, icb, &sectornr, &dummy);
		if (error)
			goto out;
	}

	/* add reference to the vnode to prevent recycling */
	vhold(udf_node->vnode);

	if (waitfor) {
		DPRINTF(WRITE, ("udf_write_logvol_dscr: sync write\n"));

		error = udf_write_phys_dscr_sync(ump, udf_node, UDF_C_NODE,
			dscr, sectornr, logsectornr);
	} else {
		DPRINTF(WRITE, ("udf_write_logvol_dscr: no wait, async write\n"));

		error = udf_write_phys_dscr_async(ump, udf_node, UDF_C_NODE,
			dscr, sectornr, logsectornr, udf_wr_nodedscr_callback);
		/* will be UNLOCKED in call back */
		return error;
	}

	holdrele(udf_node->vnode);
out:
	udf_node->outstanding_nodedscr--;
	if (udf_node->outstanding_nodedscr == 0) {
		UDF_UNLOCK_NODE(udf_node, 0);
		wakeup(&udf_node->outstanding_nodedscr);
	}

	return error;
}
Ejemplo n.º 9
0
/*
 * Release the write mappings covering [start, end) of a vnode-backed VM
 * object.
 *
 * If the object is no longer of type OBJT_VNODE (the vnode was reclaimed)
 * nothing is done.  If the release does not bring the object's
 * writemappings count to zero, the count is simply reduced under the
 * object lock.  Otherwise the vnode is held, write-started and locked
 * exclusively, and vnode_pager_update_writecount() is called with start
 * and end swapped so that it performs the decrement.
 */
void
vnode_pager_release_writecount(vm_object_t object, vm_offset_t start,
    vm_offset_t end)
{
	struct mount *mp = NULL;
	struct vnode *vp = NULL;
	vm_offset_t span;
	int last_mapping = 0;

	VM_OBJECT_WLOCK(object);

	/*
	 * Recheck the object type under the lock to account for the race
	 * with vnode reclamation, then take the cheap path whenever
	 * writemappings is not going to zero.
	 */
	if (object->type == OBJT_VNODE) {
		span = end - start;
		if (object->un_pager.vnp.writemappings == span) {
			last_mapping = 1;
			vp = object->handle;
			vhold(vp);
		} else {
			object->un_pager.vnp.writemappings -= span;
		}
	}
	VM_OBJECT_WUNLOCK(object);
	if (!last_mapping)
		return;

	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	/*
	 * Decrement the object's writemappings by swapping the start and
	 * end arguments to vnode_pager_update_writecount().  If there was
	 * no race with vnode reclamation, the vnode's v_writecount is
	 * decremented as well.
	 */
	vnode_pager_update_writecount(object, end, start);
	VOP_UNLOCK(vp, 0);
	vdrop(vp);
	if (mp != NULL)
		vn_finished_write(mp);
}
Ejemplo n.º 10
0
/*
 * pseudofs filler: print the full path of the process's text vnode
 * (p->p_textvp) into sb.
 *
 * The text vnode is sampled and held under the process lock, then
 * resolved with vn_fullpath() once the lock has been dropped.  Returns
 * the vn_fullpath() result; nothing is printed on error.
 *
 * NOTE(review): the previous header called this the filler for
 * "proc/pid/self"; the function name suggests proc/pid/file -- confirm.
 */
int
procfs_doprocfile(PFS_FILL_ARGS)
{
	struct vnode *exevp;
	char *path;
	char *pathbuf = NULL;
	int rc;

	PROC_LOCK(p);
	exevp = p->p_textvp;
	vhold(exevp);
	PROC_UNLOCK(p);

	rc = vn_fullpath(td, exevp, &path, &pathbuf);
	vdrop(exevp);
	if (rc == 0)
		sbuf_printf(sb, "%s", path);

	/* Release the buffer handed back by vn_fullpath(), if any. */
	if (pathbuf != NULL)
		free(pathbuf, M_TEMP);
	return (rc);
}
Ejemplo n.º 11
0
/*
 * Make sure the (already locked) vnode vp is locked exclusively.
 *
 * Returns 0 if the lock is, or has been upgraded to, exclusive and the
 * vnode is still usable, or ENOENT if the vnode turned out to be doomed
 * once the upgrade completed.
 */
static int
ufs_lookup_upgrade_lock(struct vnode *vp)
{
	int doomed;

	ASSERT_VOP_LOCKED(vp, __FUNCTION__);
	if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
		/*
		 * Upgrade the vnode lock, since getinoquota() requires an
		 * exclusive lock to modify the inode.  The vnode is held
		 * around the upgrade, and the doomed flag is checked under
		 * the vnode interlock, which vdropl() is entered with.
		 */
		vhold(vp);
		vn_lock(vp, LK_UPGRADE | LK_RETRY);
		VI_LOCK(vp);
		doomed = (vp->v_iflag & VI_DOOMED) != 0;
		vdropl(vp);
		return (doomed ? ENOENT : 0);
	}
	return (0);
}
Ejemplo n.º 12
0
/*
 * Read from a file
 */
static int
pfs_read(struct vop_read_args *va)
{
	struct vnode *vn = va->a_vp;
	struct pfs_vdata *pvd = vn->v_data;
	struct pfs_node *pn = pvd->pvd_pn;
	struct uio *uio = va->a_uio;
	struct proc *proc;
	struct sbuf *sb = NULL;
	int error, locked;
	unsigned int buflen, offset, resid;

	PFS_TRACE(("%s", pn->pn_name));
	pfs_assert_not_owned(pn);

	if (vn->v_type != VREG)
		PFS_RETURN (EINVAL);
	KASSERT_PN_IS_FILE(pn);

	if (!(pn->pn_flags & PFS_RD))
		PFS_RETURN (EBADF);

	if (pn->pn_fill == NULL)
		PFS_RETURN (EIO);

	/*
	 * This is necessary because either process' privileges may
	 * have changed since the open() call.
	 */
	if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc))
		PFS_RETURN (EIO);
	if (proc != NULL) {
		_PHOLD(proc);
		PROC_UNLOCK(proc);
	}

	vhold(vn);
	locked = VOP_ISLOCKED(vn, curthread);
	VOP_UNLOCK(vn, 0, curthread);

	if (pn->pn_flags & PFS_RAWRD) {
		PFS_TRACE(("%lu resid", (unsigned long)uio->uio_resid));
		error = pn_fill(curthread, proc, pn, NULL, uio);
		PFS_TRACE(("%lu resid", (unsigned long)uio->uio_resid));
		goto ret;
	}

	/* beaucoup sanity checks so we don't ask for bogus allocation */
	if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
	    (offset = uio->uio_offset) != uio->uio_offset ||
	    (resid = uio->uio_resid) != uio->uio_resid ||
	    (buflen = offset + resid + 1) < offset || buflen > INT_MAX) {
		if (proc != NULL)
			PRELE(proc);
		error = EINVAL;
		goto ret;
	}
	if (buflen > MAXPHYS + 1) {
		error = EIO;
		goto ret;
	}

	sb = sbuf_new(sb, NULL, buflen, 0);
	if (sb == NULL) {
		error = EIO;
		goto ret;
	}

	error = pn_fill(curthread, proc, pn, sb, uio);

	if (error) {
		sbuf_delete(sb);
		goto ret;
	}

	sbuf_finish(sb);
	error = uiomove_frombuf(sbuf_data(sb), sbuf_len(sb), uio);
	sbuf_delete(sb);
ret:
	vn_lock(vn, locked | LK_RETRY, curthread);
	vdrop(vn);
	if (proc != NULL)
		PRELE(proc);
	PFS_RETURN (error);
}
Ejemplo n.º 13
0
/*
 * Allocates a new vnode for the node node or returns a new reference to
 * an existing one if the node had already a vnode referencing it.  The
 * resulting locked vnode is returned in *vpp.
 *
 * Returns zero on success or an appropriate error code on failure.
 * Note that the EAGAIN return (dnode case) and the interrupted-sleep
 * return below leave *vpp untouched.
 *
 * The caller must ensure that node cannot go away (usually by holding
 * the related directory entry).
 *
 * If dnode is non-NULL this routine avoids deadlocking against it but
 * can return EAGAIN.  Caller must try again.  The dnode lock will cycle
 * in this case, it remains locked on return in all cases.  dnode must
 * be shared-locked.
 */
int
tmpfs_alloc_vp(struct mount *mp,
	       struct tmpfs_node *dnode, struct tmpfs_node *node, int lkflag,
	       struct vnode **vpp)
{
	int error = 0;
	struct vnode *vp;

loop:
	/*
	 * Interlocked extraction from node.  This can race many things.
	 * We have to get a soft reference on the vnode while we hold
	 * the node locked, then acquire it properly and check for races.
	 */
	TMPFS_NODE_LOCK(node);
	if ((vp = node->tn_vnode) != NULL) {
		KKASSERT((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0);
		vhold(vp);
		TMPFS_NODE_UNLOCK(node);

		if (dnode) {
			/*
			 * Special-case handling to avoid deadlocking against
			 * dnode.  This case has been validated and occurs
			 * every so often during synth builds.
			 *
			 * Try a non-blocking vget() first.  If that fails,
			 * release dnode, wait for the vnode with a blocking
			 * vget() (dropping its lock again if obtained),
			 * re-take dnode shared and make the caller retry.
			 */
			if (vget(vp, (lkflag & ~LK_RETRY) |
				     LK_NOWAIT |
				     LK_EXCLUSIVE) != 0) {
				TMPFS_NODE_UNLOCK(dnode);
				if (vget(vp, (lkflag & ~LK_RETRY) |
					     LK_SLEEPFAIL |
					     LK_EXCLUSIVE) == 0) {
					vn_unlock(vp);
				}
				vdrop(vp);
				TMPFS_NODE_LOCK_SH(dnode);

				return EAGAIN;
			}
		} else {
			/*
			 * Normal path
			 */
			if (vget(vp, lkflag | LK_EXCLUSIVE) != 0) {
				vdrop(vp);
				goto loop;
			}
		}
		/*
		 * The node may have been given a different vnode (or none)
		 * while we were acquiring this one; if so, start over.
		 */
		if (node->tn_vnode != vp) {
			vput(vp);
			vdrop(vp);
			goto loop;
		}
		/* vget() succeeded, the soft reference is no longer needed. */
		vdrop(vp);
		goto out;
	}
	/* vp is NULL */

	/*
	 * This should never happen.
	 */
	if (node->tn_vpstate & TMPFS_VNODE_DOOMED) {
		TMPFS_NODE_UNLOCK(node);
		error = ENOENT;
		goto out;
	}

	/*
	 * Interlock against other calls to tmpfs_alloc_vp() trying to
	 * allocate and assign a vp to node.
	 *
	 * NOTE(review): PINTERLOCKED is passed to tsleep() but no
	 * tsleep_interlock() call is visible in this function, and the node
	 * lock is only released after tsleep() returns -- confirm this is
	 * the intended sleep protocol.
	 */
	if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) {
		node->tn_vpstate |= TMPFS_VNODE_WANT;
		error = tsleep(&node->tn_vpstate, PINTERLOCKED | PCATCH,
			       "tmpfs_alloc_vp", 0);
		TMPFS_NODE_UNLOCK(node);
		if (error)
			return error;
		goto loop;
	}
	node->tn_vpstate |= TMPFS_VNODE_ALLOCATING;
	TMPFS_NODE_UNLOCK(node);

	/*
	 * Allocate a new vnode (may block).  The ALLOCATING flag should
	 * prevent a race against someone else assigning node->tn_vnode.
	 */
	error = getnewvnode(VT_TMPFS, mp, &vp, VLKTIMEOUT, LK_CANRECURSE);
	if (error != 0)
		goto unlock;

	KKASSERT(node->tn_vnode == NULL);
	KKASSERT(vp != NULL);
	vp->v_data = node;
	vp->v_type = node->tn_type;

	/* Type-specific initialization. */
	switch (node->tn_type) {
	case VBLK:
		/* FALLTHROUGH */
	case VCHR:
		/* FALLTHROUGH */
	case VSOCK:
		break;
	case VREG:
		/*
		 * VMIO is mandatory.  Tmpfs also supports KVABIO
		 * for its tmpfs_strategy().
		 */
		vsetflags(vp, VKVABIO);
		vinitvmio(vp, node->tn_size, TMPFS_BLKSIZE, -1);
		break;
	case VLNK:
		break;
	case VFIFO:
		vp->v_ops = &mp->mnt_vn_fifo_ops;
		break;
	case VDIR:
		break;

	default:
		panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type);
	}


unlock:
	/*
	 * Publish the result and end the ALLOCATING interlock, waking any
	 * thread that flagged WANT.  This is also reached when
	 * getnewvnode() failed; vp was NULL before that call, so
	 * presumably tn_vnode is left NULL in that case (depends on what
	 * getnewvnode() stores on error -- confirm).
	 */
	TMPFS_NODE_LOCK(node);

	KKASSERT(node->tn_vpstate & TMPFS_VNODE_ALLOCATING);
	node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING;
	node->tn_vnode = vp;

	if (node->tn_vpstate & TMPFS_VNODE_WANT) {
		node->tn_vpstate &= ~TMPFS_VNODE_WANT;
		TMPFS_NODE_UNLOCK(node);
		wakeup(&node->tn_vpstate);
	} else {
		TMPFS_NODE_UNLOCK(node);
	}

out:
	*vpp = vp;
	/* Success must imply a non-NULL, locked vnode, and vice versa. */
	KKASSERT(IFF(error == 0, *vpp != NULL && vn_islocked(*vpp)));

	return error;
}
Ejemplo n.º 14
0
/*
 * Do a generic nlookup.  Note that the passed nd is not nlookup_done()'d
 * on return, even if an error occurs.  If no error occurs or NLC_CREATE
 * is flagged and ENOENT is returned, then the returned nl_nch is always
 * referenced and locked exclusively.
 *
 * WARNING: For any general error other than ENOENT w/NLC_CREATE, the
 *	    resulting nl_nch may or may not be locked and if locked
 *	    might be locked either shared or exclusive.
 *
 * Intermediate directory elements, including the current directory, require
 * execute (search) permission.  nlookup does not examine the access
 * permissions on the returned element.
 *
 * If NLC_CREATE is set the last directory must allow node creation,
 * and an error code of 0 will be returned for a non-existent
 * target (not ENOENT).
 *
 * If NLC_RENAME_DST is set the last directory must allow node deletion,
 * plus the sticky check is made, and an error code of 0 will be returned
 * for a non-existent target (not ENOENT).
 *
 * If NLC_DELETE is set the last directory must allow node deletion,
 * plus the sticky check is made.
 *
 * If NLC_REFDVP is set nd->nl_dvp will be set to the directory vnode
 * of the returned entry.  The vnode will be referenced, but not locked,
 * and will be released by nlookup_done() along with everything else.
 *
 * If a resolve returns ESTALE the whole lookup is restarted from the
 * beginning, but only once.
 *
 * NOTE: As an optimization we attempt to obtain a shared namecache lock
 *	 on any intermediate elements.  On success, the returned element
 *	 is ALWAYS locked exclusively.
 */
int
nlookup(struct nlookupdata *nd)
{
    globaldata_t gd = mycpu;
    struct nlcomponent nlc;
    struct nchandle nch;
    struct nchandle par;
    struct nchandle nctmp;
    struct mount *mp;
    struct vnode *hvp;		/* hold to prevent recyclement */
    int wasdotordotdot;
    char *ptr;
    char *nptr;
    int error;
    int len;
    int dflags;
    /* Cleared when an unresolved entry is met; picks the stat bumped below. */
    int hit = 1;
    /* Entry flags minus NLC_NCDIR, restored if the lookup is retried. */
    int saveflag = nd->nl_flags & ~NLC_NCDIR;
    /* doretry: an ESTALE was seen.  inretry: already on the second pass. */
    boolean_t doretry = FALSE;
    boolean_t inretry = FALSE;

nlookup_start:
#ifdef KTRACE
    if (KTRPOINT(nd->nl_td, KTR_NAMEI))
	ktrnamei(nd->nl_td->td_lwp, nd->nl_path);
#endif
    bzero(&nlc, sizeof(nlc));

    /*
     * Setup for the loop.  The current working namecache element is
     * always at least referenced.  We lock it as required, but always
     * return a locked, resolved namecache entry.
     */
    nd->nl_loopcnt = 0;
    if (nd->nl_dvp) {
	vrele(nd->nl_dvp);
	nd->nl_dvp = NULL;
    }
    ptr = nd->nl_path;

    /*
     * Loop on the path components.  At the top of the loop nd->nl_nch
     * is ref'd and unlocked and represents our current position.
     */
    for (;;) {
	/*
	 * Make sure nl_nch is locked so we can access the vnode, resolution
	 * state, etc.
	 */
	if ((nd->nl_flags & NLC_NCPISLOCKED) == 0) {
		nd->nl_flags |= NLC_NCPISLOCKED;
		cache_lock_maybe_shared(&nd->nl_nch, wantsexcllock(nd, ptr));
	}

	/*
	 * Check if the root directory should replace the current
	 * directory.  This is done at the start of a translation
	 * or after a symbolic link has been found.  In other cases
	 * ptr will never be pointing at a '/'.
	 */
	if (*ptr == '/') {
	    do {
		++ptr;
	    } while (*ptr == '/');
	    cache_unlock(&nd->nl_nch);
	    cache_get_maybe_shared(&nd->nl_rootnch, &nch,
				   wantsexcllock(nd, ptr));
	    if (nd->nl_flags & NLC_NCDIR) {
		    cache_drop_ncdir(&nd->nl_nch);
		    nd->nl_flags &= ~NLC_NCDIR;
	    } else {
		    cache_drop(&nd->nl_nch);
	    }
	    nd->nl_nch = nch;		/* remains locked */

	    /*
	     * Fast-track termination.  There is no parent directory of
	     * the root in the same mount from the point of view of
	     * the caller so return EACCES if NLC_REFDVP is specified,
	     * and EEXIST if NLC_CREATE is also specified.
	     * e.g. 'rmdir /' or 'mkdir /' are not allowed.
	     */
	    if (*ptr == 0) {
		if (nd->nl_flags & NLC_REFDVP)
			error = (nd->nl_flags & NLC_CREATE) ? EEXIST : EACCES;
		else
			error = 0;
		break;
	    }
	    continue;
	}

	/*
	 * Pre-calculate next path component so we can check whether the
	 * current component directory is the last directory in the path
	 * or not.
	 */
	for (nptr = ptr; *nptr && *nptr != '/'; ++nptr)
		;

	/*
	 * Check directory search permissions (nd->nl_nch is locked & refd).
	 * This will load dflags to obtain directory-special permissions to
	 * be checked along with the last component.
	 *
	 * We only need to pass-in &dflags for the second-to-last component.
	 * Optimize by passing-in NULL for any prior components, which may
	 * allow the code to bypass the naccess() call.
	 */
	dflags = 0;
	if (*nptr == '/')
	    error = naccess(&nd->nl_nch, NLC_EXEC, nd->nl_cred, NULL);
	else
	    error = naccess(&nd->nl_nch, NLC_EXEC, nd->nl_cred, &dflags);
	if (error) {
	    if (keeperror(nd, error))
		    break;
	    error = 0;
	}

	/*
	 * Extract the next (or last) path component.  Path components are
	 * limited to 255 characters.
	 */
	nlc.nlc_nameptr = ptr;
	nlc.nlc_namelen = nptr - ptr;
	ptr = nptr;
	if (nlc.nlc_namelen >= 256) {
	    error = ENAMETOOLONG;
	    break;
	}

	/*
	 * Lookup the path component in the cache, creating an unresolved
	 * entry if necessary.  We have to handle "." and ".." as special
	 * cases.
	 *
	 * When handling ".." we have to detect a traversal back through a
	 * mount point.   If we are at the root, ".." just returns the root.
	 *
	 * When handling "." or ".." we also have to recalculate dflags
	 * since our dflags will be for some sub-directory instead of the
	 * parent dir.
	 *
	 * This subsection returns a locked, refd 'nch' unless it errors out,
	 * and an unlocked but still ref'd nd->nl_nch.
	 *
	 * The namecache topology is not allowed to be disconnected, so
	 * encountering a NULL parent will generate EINVAL.  This typically
	 * occurs when a directory is removed out from under a process.
	 *
	 * WARNING! The unlocking of nd->nl_nch is sensitive code.
	 */
	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);

	if (nlc.nlc_namelen == 1 && nlc.nlc_nameptr[0] == '.') {
	    cache_unlock(&nd->nl_nch);
	    nd->nl_flags &= ~NLC_NCPISLOCKED;
	    cache_get_maybe_shared(&nd->nl_nch, &nch, wantsexcllock(nd, ptr));
	    wasdotordotdot = 1;
	} else if (nlc.nlc_namelen == 2 && 
		   nlc.nlc_nameptr[0] == '.' && nlc.nlc_nameptr[1] == '.') {
	    if (nd->nl_nch.mount == nd->nl_rootnch.mount &&
		nd->nl_nch.ncp == nd->nl_rootnch.ncp
	    ) {
		/*
		 * ".." at the root returns the root
		 */
		cache_unlock(&nd->nl_nch);
		nd->nl_flags &= ~NLC_NCPISLOCKED;
		cache_get_maybe_shared(&nd->nl_nch, &nch,
				       wantsexcllock(nd, ptr));
	    } else {
		/*
		 * Locate the parent ncp.  If we are at the root of a
		 * filesystem mount we have to skip to the mounted-on
		 * point in the underlying filesystem.
		 *
		 * Expect the parent to always be good since the
		 * mountpoint doesn't go away.  XXX hack.  cache_get()
		 * requires the ncp to already have a ref as a safety.
		 *
		 * However, a process which has been broken out of a chroot
		 * will wind up with a NULL parent if it tries to '..' above
		 * the real root, deal with the case.  Note that this does
		 * not protect us from a jail breakout, it just stops a panic
		 * if the jail-broken process tries to '..' past the real
		 * root.
		 */
		nctmp = nd->nl_nch;
		while (nctmp.ncp == nctmp.mount->mnt_ncmountpt.ncp) {
			nctmp = nctmp.mount->mnt_ncmounton;
			if (nctmp.ncp == NULL)
				break;
		}
		if (nctmp.ncp == NULL) {
			if (curthread->td_proc) {
				kprintf("vfs_nlookup: '..' traverse broke "
					"jail: pid %d (%s)\n",
					curthread->td_proc->p_pid,
					curthread->td_comm);
			}
			nctmp = nd->nl_rootnch;
		} else {
			nctmp.ncp = nctmp.ncp->nc_parent;
		}
		cache_hold(&nctmp);
		cache_unlock(&nd->nl_nch);
		nd->nl_flags &= ~NLC_NCPISLOCKED;
		cache_get_maybe_shared(&nctmp, &nch, wantsexcllock(nd, ptr));
		cache_drop(&nctmp);		/* NOTE: zero's nctmp */
	    }
	    wasdotordotdot = 2;
	} else {
	    /*
	     * Must unlock nl_nch when traversing down the path.  However,
	     * the child ncp has not yet been found/created and the parent's
	     * child list might be empty.  Thus releasing the lock can
	     * allow a race whereby the parent ncp's vnode is recycled.
	     * This case can occur especially when maxvnodes is set very low.
	     *
	     * We need the parent's ncp to remain resolved for all normal
	     * filesystem activities, so we vhold() the vp during the lookup
	     * to prevent recyclement due to vnlru / maxvnodes.
	     *
	     * If we race an unlink or rename the ncp might be marked
	     * DESTROYED after resolution, requiring a retry.
	     */
	    if ((hvp = nd->nl_nch.ncp->nc_vp) != NULL)
		vhold(hvp);
	    cache_unlock(&nd->nl_nch);
	    nd->nl_flags &= ~NLC_NCPISLOCKED;
	    error = cache_nlookup_maybe_shared(&nd->nl_nch, &nlc,
					       wantsexcllock(nd, ptr), &nch);
	    if (error == EWOULDBLOCK) {
		    /*
		     * The maybe-shared lookup could not complete; fall back
		     * to cache_nlookup() and resolve here, looping for as
		     * long as the resolve returns EAGAIN or the entry is
		     * flagged DESTROYED.
		     */
		    nch = cache_nlookup(&nd->nl_nch, &nlc);
		    if (nch.ncp->nc_flag & NCF_UNRESOLVED)
			hit = 0;
		    for (;;) {
			error = cache_resolve(&nch, nd->nl_cred);
			if (error != EAGAIN &&
			    (nch.ncp->nc_flag & NCF_DESTROYED) == 0) {
				if (error == ESTALE) {
				    if (!inretry)
					error = ENOENT;
				    doretry = TRUE;
				}
				break;
			}
			kprintf("[diagnostic] nlookup: relookup %*.*s\n",
				nch.ncp->nc_nlen, nch.ncp->nc_nlen,
				nch.ncp->nc_name);
			cache_put(&nch);
			nch = cache_nlookup(&nd->nl_nch, &nlc);
		    }
	    }
	    if (hvp)
		vdrop(hvp);
	    wasdotordotdot = 0;
	}

	/*
	 * If the last component was "." or ".." our dflags no longer
	 * represents the parent directory and we have to explicitly
	 * look it up.
	 *
	 * Expect the parent to be good since nch is locked.
	 */
	if (wasdotordotdot && error == 0) {
	    dflags = 0;
	    if ((par.ncp = nch.ncp->nc_parent) != NULL) {
		par.mount = nch.mount;
		cache_hold(&par);
		cache_lock_maybe_shared(&par, wantsexcllock(nd, ptr));
		error = naccess(&par, 0, nd->nl_cred, &dflags);
		cache_put(&par);
		if (error) {
		    if (!keeperror(nd, error))
			    error = 0;
		}
	    }
	}

	/*
	 * [end of subsection]
	 *
	 * nch is locked and referenced.
	 * nd->nl_nch is unlocked and referenced.
	 *
	 * nl_nch must be unlocked or we could chain lock to the root
	 * if a resolve gets stuck (e.g. in NFS).
	 */
	KKASSERT((nd->nl_flags & NLC_NCPISLOCKED) == 0);

	/*
	 * Resolve the namespace if necessary.  The ncp returned by
	 * cache_nlookup() is referenced and locked.
	 *
	 * XXX neither '.' nor '..' should return EAGAIN since they were
	 * previously resolved and thus cannot be newly created ncp's.
	 */
	if (nch.ncp->nc_flag & NCF_UNRESOLVED) {
	    hit = 0;
	    error = cache_resolve(&nch, nd->nl_cred);
	    if (error == ESTALE) {
		if (!inretry)
		    error = ENOENT;
		doretry = TRUE;
	    }
	    KKASSERT(error != EAGAIN);
	} else {
	    error = nch.ncp->nc_error;
	}

	/*
	 * Early completion.  ENOENT is not an error if this is the last
	 * component and NLC_CREATE or NLC_RENAME (rename target) was
	 * requested.  Note that ncp->nc_error is left as ENOENT in that
	 * case, which we check later on.
	 *
	 * Also handle invalid '.' or '..' components terminating a path
	 * for a create/rename/delete.  The standard requires this and pax
	 * pretty stupidly depends on it.
	 */
	if (islastelement(ptr)) {
	    if (error == ENOENT &&
		(nd->nl_flags & (NLC_CREATE | NLC_RENAME_DST))
	    ) {
		if (nd->nl_flags & NLC_NFS_RDONLY) {
			error = EROFS;
		} else {
			error = naccess(&nch, nd->nl_flags | dflags,
					nd->nl_cred, NULL);
		}
	    }
	    if (error == 0 && wasdotordotdot &&
		(nd->nl_flags & (NLC_CREATE | NLC_DELETE |
				 NLC_RENAME_SRC | NLC_RENAME_DST))) {
		/*
		 * POSIX junk
		 */
		if (nd->nl_flags & NLC_CREATE)
			error = EEXIST;
		else if (nd->nl_flags & NLC_DELETE)
			error = (wasdotordotdot == 1) ? EINVAL : ENOTEMPTY;
		else
			error = EINVAL;
	    }
	}

	/*
	 * Early completion on error.
	 */
	if (error) {
	    cache_put(&nch);
	    break;
	}

	/*
	 * If the element is a symlink and it is either not the last
	 * element or it is the last element and we are allowed to
	 * follow symlinks, resolve the symlink.
	 */
	if ((nch.ncp->nc_flag & NCF_ISSYMLINK) &&
	    (*ptr || (nd->nl_flags & NLC_FOLLOW))
	) {
	    if (nd->nl_loopcnt++ >= MAXSYMLINKS) {
		error = ELOOP;
		cache_put(&nch);
		break;
	    }
	    error = nreadsymlink(nd, &nch, &nlc);
	    cache_put(&nch);
	    if (error)
		break;

	    /*
	     * Concatenate trailing path elements onto the returned symlink.
	     * Note that if the path component (ptr) is not exhausted, it
	     * will begin with a '/', so we do not have to add another one.
	     *
	     * The symlink may not be empty.
	     */
	    len = strlen(ptr);
	    if (nlc.nlc_namelen == 0 || nlc.nlc_namelen + len >= MAXPATHLEN) {
		error = nlc.nlc_namelen ? ENAMETOOLONG : ENOENT;
		objcache_put(namei_oc, nlc.nlc_nameptr);
		break;
	    }
	    bcopy(ptr, nlc.nlc_nameptr + nlc.nlc_namelen, len + 1);
	    /* The symlink buffer becomes the new path; free any old one. */
	    if (nd->nl_flags & NLC_HASBUF)
		objcache_put(namei_oc, nd->nl_path);
	    nd->nl_path = nlc.nlc_nameptr;
	    nd->nl_flags |= NLC_HASBUF;
	    ptr = nd->nl_path;

	    /*
	     * Go back up to the top to resolve any initial '/'s in the
	     * symlink.
	     */
	    continue;
	}
	
	/*
	 * If the element is a directory and we are crossing a mount point,
	 * Locate the mount.
	 */
	while ((nch.ncp->nc_flag & NCF_ISMOUNTPT) && 
	    (nd->nl_flags & NLC_NOCROSSMOUNT) == 0 &&
	    (mp = cache_findmount(&nch)) != NULL
	) {
	    struct vnode *tdp;
	    int vfs_do_busy = 0;

	    /*
	     * VFS must be busied before the namecache entry is locked,
	     * but we don't want to waste time calling vfs_busy() if the
	     * mount point is already resolved.
	     */
again:
	    cache_put(&nch);
	    if (vfs_do_busy) {
		while (vfs_busy(mp, 0)) {
		    if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
			kprintf("nlookup: warning umount race avoided\n");
			cache_dropmount(mp);
			error = EBUSY;
			vfs_do_busy = 0;
			goto double_break;
		    }
		}
	    }
	    cache_get_maybe_shared(&mp->mnt_ncmountpt, &nch,
				   wantsexcllock(nd, ptr));

	    if (nch.ncp->nc_flag & NCF_UNRESOLVED) {
		/* Unresolved mount root: redo the step with the VFS busied. */
		if (vfs_do_busy == 0) {
		    vfs_do_busy = 1;
		    goto again;
		}
		error = VFS_ROOT(mp, &tdp);
		vfs_unbusy(mp);
		vfs_do_busy = 0;
		if (keeperror(nd, error)) {
		    cache_dropmount(mp);
		    break;
		}
		if (error == 0) {
		    cache_setvp(&nch, tdp);
		    vput(tdp);
		}
	    }
	    if (vfs_do_busy)
		vfs_unbusy(mp);
	    cache_dropmount(mp);
	}

	if (keeperror(nd, error)) {
	    cache_put(&nch);
double_break:
	    break;
	}
	    
	/*
	 * Skip any slashes to get to the next element.  If there
	 * are any slashes at all the current element must be a
	 * directory or, in the create case, intended to become a directory.
	 * If it isn't we break without incrementing ptr and fall through
	 * to the failure case below.
	 */
	while (*ptr == '/') {
	    if ((nch.ncp->nc_flag & NCF_ISDIR) == 0 && 
		!(nd->nl_flags & NLC_WILLBEDIR)
	    ) {
		break;
	    }
	    ++ptr;
	}

	/*
	 * Continuation case: additional elements and the current
	 * element is a directory.
	 */
	if (*ptr && (nch.ncp->nc_flag & NCF_ISDIR)) {
	    if (nd->nl_flags & NLC_NCDIR) {
		    cache_drop_ncdir(&nd->nl_nch);
		    nd->nl_flags &= ~NLC_NCDIR;
	    } else {
		    cache_drop(&nd->nl_nch);
	    }
	    cache_unlock(&nch);
	    KKASSERT((nd->nl_flags & NLC_NCPISLOCKED) == 0);
	    nd->nl_nch = nch;
	    continue;
	}

	/*
	 * Failure case: additional elements and the current element
	 * is not a directory
	 */
	if (*ptr) {
	    cache_put(&nch);
	    error = ENOTDIR;
	    break;
	}

	/*
	 * Successful lookup of last element.
	 *
	 * Check permissions if the target exists.  If the target does not
	 * exist directory permissions were already tested in the early
	 * completion code above.
	 *
	 * nd->nl_flags will be adjusted on return with NLC_APPENDONLY
	 * if the file is marked append-only, and NLC_STICKY if the directory
	 * containing the file is sticky.
	 */
	if (nch.ncp->nc_vp && (nd->nl_flags & NLC_ALLCHKS)) {
	    error = naccess(&nch, nd->nl_flags | dflags,
			    nd->nl_cred, NULL);
	    if (keeperror(nd, error)) {
		cache_put(&nch);
		break;
	    }
	}

	/*
	 * Termination: no more elements.
	 *
	 * If NLC_REFDVP is set acquire a referenced parent dvp.
	 */
	if (nd->nl_flags & NLC_REFDVP) {
		cache_lock(&nd->nl_nch);
		error = cache_vref(&nd->nl_nch, nd->nl_cred, &nd->nl_dvp);
		cache_unlock(&nd->nl_nch);
		if (keeperror(nd, error)) {
			kprintf("NLC_REFDVP: Cannot ref dvp of %p\n", nch.ncp);
			cache_put(&nch);
			break;
		}
	}
	if (nd->nl_flags & NLC_NCDIR) {
		cache_drop_ncdir(&nd->nl_nch);
		nd->nl_flags &= ~NLC_NCDIR;
	} else {
		cache_drop(&nd->nl_nch);
	}
	/* Hand the locked, referenced result back through nd. */
	nd->nl_nch = nch;
	nd->nl_flags |= NLC_NCPISLOCKED;
	error = 0;
	break;
    }

    /* Per-cpu statistics: a miss if anything had to be resolved. */
    if (hit)
	++gd->gd_nchstats->ncs_longhits;
    else
	++gd->gd_nchstats->ncs_longmiss;

    if (nd->nl_flags & NLC_NCPISLOCKED)
	KKASSERT(cache_lockstatus(&nd->nl_nch) > 0);

    /*
     * Retry the whole thing if doretry flag is set, but only once.
     * autofs(5) may mount another filesystem under its root directory
     * while resolving a path.
     *
     * Only the current NLC_NCDIR bit is kept; every other flag is reset
     * to the value it had on entry (saveflag).
     */
    if (doretry && !inretry) {
	inretry = TRUE;
	nd->nl_flags &= NLC_NCDIR;
	nd->nl_flags |= saveflag;
	goto nlookup_start;
    }

    /*
     * NOTE: If NLC_CREATE was set the ncp may represent a negative hit
     * (ncp->nc_error will be ENOENT), but we will still return an error
     * code of 0.
     */
    return(error);
}
Ejemplo n.º 15
0
/*
 * vp is the current namei directory
 * ndp is the name to locate in that directory...
 */
static int
fdesc_lookup(struct vop_lookup_args *ap)
{
	struct vnode **vpp = ap->a_vpp;
	struct vnode *dvp = ap->a_dvp;
	struct componentname *cnp = ap->a_cnp;
	char *pname = cnp->cn_nameptr;
	struct thread *td = cnp->cn_thread;
	struct file *fp;
	struct fdesc_get_ino_args arg;
	cap_rights_t rights;
	int nlen = cnp->cn_namelen;
	u_int fd, fd1;
	int error;
	struct vnode *fvp;

	if ((cnp->cn_flags & ISLASTCN) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
		error = EROFS;
		goto bad;
	}

	if (cnp->cn_namelen == 1 && *pname == '.') {
		*vpp = dvp;
		VREF(dvp);
		return (0);
	}

	if (VTOFDESC(dvp)->fd_type != Froot) {
		error = ENOTDIR;
		goto bad;
	}

	fd = 0;
	/* the only time a leading 0 is acceptable is if it's "0" */
	if (*pname == '0' && nlen != 1) {
		error = ENOENT;
		goto bad;
	}
	while (nlen--) {
		if (*pname < '0' || *pname > '9') {
			error = ENOENT;
			goto bad;
		}
		fd1 = 10 * fd + *pname++ - '0';
		if (fd1 < fd) {
			error = ENOENT;
			goto bad;
		}
		fd = fd1;
	}

	/*
	 * No rights to check since 'fp' isn't actually used.
	 */
	if ((error = fget(td, fd, cap_rights_init(&rights), &fp)) != 0)
		goto bad;

	/* Check if we're looking up ourselves. */
	if (VTOFDESC(dvp)->fd_ix == FD_DESC + fd) {
		/*
		 * In case we're holding the last reference to the file, the dvp
		 * will be re-acquired.
		 */
		vhold(dvp);
		VOP_UNLOCK(dvp, 0);
		fdrop(fp, td);

		/* Re-aquire the lock afterwards. */
		vn_lock(dvp, LK_RETRY | LK_EXCLUSIVE);
		vdrop(dvp);
		fvp = dvp;
		if ((dvp->v_iflag & VI_DOOMED) != 0)
			error = ENOENT;
	} else {
		/*
		 * Unlock our root node (dvp) when doing this, since we might
		 * deadlock since the vnode might be locked by another thread
		 * and the root vnode lock will be obtained afterwards (in case
		 * we're looking up the fd of the root vnode), which will be the
		 * opposite lock order. Vhold the root vnode first so we don't
		 * lose it.
		 */
		arg.ftype = Fdesc;
		arg.fd_fd = fd;
		arg.ix = FD_DESC + fd;
		arg.fp = fp;
		arg.td = td;
		error = vn_vget_ino_gen(dvp, fdesc_get_ino_alloc, &arg,
		    LK_EXCLUSIVE, &fvp);
	}
	
	if (error)
		goto bad;
	*vpp = fvp;
	return (0);

bad:
	*vpp = NULL;
	return (error);
}
Ejemplo n.º 16
0
/*
 * Scan only the vnodes that sit on the mount's syncer work lists, calling
 * slowfunc() on each one.  This is the syncer-list counterpart of
 * vmntvnodescan(): filesystems that flag dirty inodes with VISDIRTY get
 * those vnodes onto the syncer lists as well, so they can use this scan
 * instead and avoid walking every vnode of the mount.
 *
 * vmsc_flags selects how each vnode is acquired around the callback:
 *	VMSC_GETVP	vget() with LK_EXCLUSIVE (skipped if vget() fails)
 *	VMSC_GETVX	vx_get() / vx_put()
 *	neither		vhold() / vdrop()
 * VMSC_NOWAIT adds LK_NOWAIT to the vget() and suppresses the bump of
 * syncer_forced for the duration of the scan.
 *
 * The return value of slowfunc() is ignored; the function returns 0.
 */
int
vsyncscan(
    struct mount *mp,
    int vmsc_flags,
    int (*slowfunc)(struct mount *mp, struct vnode *vp, void *data),
    void *data
) {
	struct syncer_ctx *sctx;
	struct synclist *bucket;
	struct vnode *vp;
	int first_slot;
	int slot;
	int lock_extra;

	lock_extra = (vmsc_flags & VMSC_NOWAIT) ? LK_NOWAIT : 0;

	/*
	 * The mount must run its own syncer thread (MNTK_THR_SYNC); its
	 * context token is held for the whole scan.
	 */
	KKASSERT(mp->mnt_kern_flag & MNTK_THR_SYNC);
	sctx = mp->mnt_syncer_ctx;
	lwkt_gettoken(&sctx->sc_token);

	/*
	 * Start at the syncer's current slot.  Unless the caller asked for
	 * NOWAIT, mark the scan as forced for its duration.
	 */
	first_slot = sctx->syncer_delayno & sctx->syncer_mask;
	if ((vmsc_flags & VMSC_NOWAIT) == 0)
		++sctx->syncer_forced;

	slot = first_slot;
	do {
		bucket = &sctx->syncer_workitem_pending[slot];

		for (;;) {
			vp = LIST_FIRST(bucket);
			if (vp == NULL)
				break;
			KKASSERT(vp->v_mount == mp);

			if (vmsc_flags & VMSC_GETVP) {
				if (vget(vp, LK_EXCLUSIVE | lock_extra) == 0) {
					slowfunc(mp, vp, data);
					vput(vp);
				}
			} else if (vmsc_flags & VMSC_GETVX) {
				vx_get(vp);
				slowfunc(mp, vp, data);
				vx_put(vp);
			} else {
				vhold(vp);
				slowfunc(mp, vp, data);
				vdrop(vp);
			}

			/*
			 * If the vnode is still at the head of this list,
			 * requeue it so the inner loop can make progress.
			 */
			if (LIST_FIRST(bucket) == vp)
				vn_syncer_add(vp, -(slot + syncdelay));
		}
		slot = (slot + 1) & sctx->syncer_mask;
	} while (slot != first_slot);

	if ((vmsc_flags & VMSC_NOWAIT) == 0)
		--sctx->syncer_forced;
	lwkt_reltoken(&sctx->sc_token);
	return(0);
}
Ejemplo n.º 17
0
/*
 * Get the vnode associated with the given inode, allocating the vnode if
 * necessary.  The vnode will be returned exclusively locked.
 *
 * The caller must lock the inode (shared or exclusive).
 *
 * Great care must be taken to avoid deadlocks and vnode acquisition/reclaim
 * races.
 *
 * Returns a non-NULL vnode with *errorp == 0, or NULL with *errorp set to
 * the error reported by getnewvnode().
 */
struct vnode *
hammer2_igetv(hammer2_inode_t *ip, int *errorp)
{
	hammer2_inode_data_t *ipdata;
	hammer2_pfsmount_t *pmp;
	struct vnode *vp;
	ccms_state_t ostate;

	pmp = ip->pmp;
	KKASSERT(pmp != NULL);
	*errorp = 0;
	ipdata = &ip->chain->data->ipdata;

	for (;;) {
		/*
		 * Attempt to reuse an existing vnode assignment.  It is
		 * possible to race a reclaim so the vget() may fail.  The
		 * inode must be unlocked during the vget() to avoid a
		 * deadlock against a reclaim.
		 */
		vp = ip->vp;
		if (vp) {
			/*
			 * Inode must be unlocked during the vget() to avoid
			 * possible deadlocks, but leave the ip ref intact.
			 *
			 * vnode is held to prevent destruction during the
			 * vget().  The vget() can still fail if we lost
			 * a reclaim race on the vnode.
			 */
			vhold(vp);
			ostate = hammer2_inode_lock_temp_release(ip);
			if (vget(vp, LK_EXCLUSIVE)) {
				vdrop(vp);
				hammer2_inode_lock_temp_restore(ip, ostate);
				continue;
			}
			hammer2_inode_lock_temp_restore(ip, ostate);
			vdrop(vp);
			/* vp still locked and ref from vget */
			if (ip->vp != vp) {
				kprintf("hammer2: igetv race %p/%p\n",
					ip->vp, vp);
				vput(vp);
				continue;
			}
			*errorp = 0;
			break;
		}

		/*
		 * No vnode exists, allocate a new vnode.  Beware of
		 * allocation races.  This function will return an
		 * exclusively locked and referenced vnode.
		 */
		*errorp = getnewvnode(VT_HAMMER2, pmp->mp, &vp, 0, 0);
		if (*errorp) {
			kprintf("hammer2: igetv getnewvnode failed %d\n",
				*errorp);
			vp = NULL;
			break;
		}

		/*
		 * Lock the inode and check for an allocation race.  If
		 * another thread attached a vnode first, discard ours
		 * (marked VBAD) and retry from the top.
		 */
		ostate = hammer2_inode_lock_upgrade(ip);
		if (ip->vp != NULL) {
			vp->v_type = VBAD;
			vx_put(vp);
			hammer2_inode_lock_downgrade(ip, ostate);
			continue;
		}

		/*
		 * Initialize the new vnode according to the on-media
		 * object type.
		 */
		switch (ipdata->type) {
		case HAMMER2_OBJTYPE_DIRECTORY:
			vp->v_type = VDIR;
			break;
		case HAMMER2_OBJTYPE_REGFILE:
			vp->v_type = VREG;
			vinitvmio(vp, ipdata->size,
				  HAMMER2_LBUFSIZE,
				  (int)ipdata->size & HAMMER2_LBUFMASK);
			break;
		case HAMMER2_OBJTYPE_SOFTLINK:
			/*
			 * XXX for now we are using the generic file_read
			 * and file_write code so we need a buffer cache
			 * association.
			 */
			vp->v_type = VLNK;
			vinitvmio(vp, ipdata->size,
				  HAMMER2_LBUFSIZE,
				  (int)ipdata->size & HAMMER2_LBUFMASK);
			break;
		case HAMMER2_OBJTYPE_CDEV:
			vp->v_type = VCHR;
			/* fall through */
		case HAMMER2_OBJTYPE_BDEV:
			vp->v_ops = &pmp->mp->mnt_vn_spec_ops;
			if (ipdata->type != HAMMER2_OBJTYPE_CDEV)
				vp->v_type = VBLK;
			addaliasu(vp, ipdata->rmajor, ipdata->rminor);
			break;
		case HAMMER2_OBJTYPE_FIFO:
			vp->v_type = VFIFO;
			vp->v_ops = &pmp->mp->mnt_vn_fifo_ops;
			break;
		default:
			panic("hammer2: unhandled objtype %d", ipdata->type);
			break;
		}

		if (ip == pmp->iroot)
			vsetflags(vp, VROOT);

		vp->v_data = ip;
		ip->vp = vp;
		hammer2_inode_ref(ip);		/* vp association */
		hammer2_inode_lock_downgrade(ip, ostate);
		break;
	}

	/*
	 * Return non-NULL vp and *errorp == 0, or NULL vp and *errorp != 0.
	 *
	 * vp is NULL when getnewvnode() failed, so the debug trace must not
	 * dereference it in that case (the failure was already logged).
	 */
	if ((hammer2_debug & 0x0002) && vp != NULL) {
		kprintf("igetv vp %p refs 0x%08x aux 0x%08x\n",
			vp, vp->v_refcnt, vp->v_auxrefs);
	}
	return (vp);
}
Ejemplo n.º 18
0
/*
 * Convert a vnode to its component name
 */
static int
pfs_vptocnp(struct vop_vptocnp_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct vnode **dvp = ap->a_vpp;
	struct pfs_vdata *pvd = vp->v_data;
	struct pfs_node *pd = pvd->pvd_pn;
	struct pfs_node *pn;
	struct mount *mp;
	char *buf = ap->a_buf;
	int *buflen = ap->a_buflen;
	char pidbuf[PFS_NAMELEN];
	pid_t pid = pvd->pvd_pid;
	int len, i, error, locked;

	i = *buflen;
	error = 0;

	pfs_lock(pd);

	if (vp->v_type == VDIR && pd->pn_type == pfstype_root) {
		*dvp = vp;
		vhold(*dvp);
		pfs_unlock(pd);
		PFS_RETURN (0);
	} else if (vp->v_type == VDIR && pd->pn_type == pfstype_procdir) {
		len = snprintf(pidbuf, sizeof(pidbuf), "%d", pid);
		i -= len;
		if (i < 0) {
			error = ENOMEM;
			goto failed;
		}
		bcopy(pidbuf, buf + i, len);
	} else {
		len = strlen(pd->pn_name);
		i -= len;
		if (i < 0) {
			error = ENOMEM;
			goto failed;
		}
		bcopy(pd->pn_name, buf + i, len);
	}

	pn = pd->pn_parent;
	pfs_unlock(pd);

	mp = vp->v_mount;
	error = vfs_busy(mp, 0);
	if (error)
		return (error);

	/*
	 * vp is held by caller.
	 */
	locked = VOP_ISLOCKED(vp);
	VOP_UNLOCK(vp, 0);

	error = pfs_vncache_alloc(mp, dvp, pn, pid);
	if (error) {
		vn_lock(vp, locked | LK_RETRY);
		vfs_unbusy(mp);
		PFS_RETURN(error);
	}

	*buflen = i;
	vhold(*dvp);
	vput(*dvp);
	vn_lock(vp, locked | LK_RETRY);
	vfs_unbusy(mp);

	PFS_RETURN (0);
failed:
	pfs_unlock(pd);
	PFS_RETURN(error);
}
Ejemplo n.º 19
0
static int
tmpfs_lookup(struct vop_cachedlookup_args *v)
{
	struct vnode *dvp = v->a_dvp;
	struct vnode **vpp = v->a_vpp;
	struct componentname *cnp = v->a_cnp;

	int error;
	struct tmpfs_dirent *de;
	struct tmpfs_node *dnode;

	dnode = VP_TO_TMPFS_DIR(dvp);
	*vpp = NULLVP;

	/* Check accessibility of requested node as a first step. */
	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread);
	if (error != 0)
		goto out;

	/* We cannot be requesting the parent directory of the root node. */
	MPASS(IMPLIES(dnode->tn_type == VDIR &&
	    dnode->tn_dir.tn_parent == dnode,
	    !(cnp->cn_flags & ISDOTDOT)));

	TMPFS_ASSERT_LOCKED(dnode);
	if (dnode->tn_dir.tn_parent == NULL) {
		error = ENOENT;
		goto out;
	}
	if (cnp->cn_flags & ISDOTDOT) {
		int ltype = 0;

		ltype = VOP_ISLOCKED(dvp);
		vhold(dvp);
		VOP_UNLOCK(dvp, 0);
		/* Allocate a new vnode on the matching entry. */
		error = tmpfs_alloc_vp(dvp->v_mount, dnode->tn_dir.tn_parent,
		    cnp->cn_lkflags, vpp);

		vn_lock(dvp, ltype | LK_RETRY);
		vdrop(dvp);
	} else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
		VREF(dvp);
		*vpp = dvp;
		error = 0;
	} else {
		de = tmpfs_dir_lookup(dnode, NULL, cnp);
		if (de != NULL && de->td_node == NULL)
			cnp->cn_flags |= ISWHITEOUT;
		if (de == NULL || de->td_node == NULL) {
			/* The entry was not found in the directory.
			 * This is OK if we are creating or renaming an
			 * entry and are working on the last component of
			 * the path name. */
			if ((cnp->cn_flags & ISLASTCN) &&
			    (cnp->cn_nameiop == CREATE || \
			    cnp->cn_nameiop == RENAME ||
			    (cnp->cn_nameiop == DELETE &&
			    cnp->cn_flags & DOWHITEOUT &&
			    cnp->cn_flags & ISWHITEOUT))) {
				error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred,
				    cnp->cn_thread);
				if (error != 0)
					goto out;

				/* Keep the component name in the buffer for
				 * future uses. */
				cnp->cn_flags |= SAVENAME;

				error = EJUSTRETURN;
			} else
				error = ENOENT;
		} else {
			struct tmpfs_node *tnode;

			/* The entry was found, so get its associated
			 * tmpfs_node. */
			tnode = de->td_node;

			/* If we are not at the last path component and
			 * found a non-directory or non-link entry (which
			 * may itself be pointing to a directory), raise
			 * an error. */
			if ((tnode->tn_type != VDIR &&
			    tnode->tn_type != VLNK) &&
			    !(cnp->cn_flags & ISLASTCN)) {
				error = ENOTDIR;
				goto out;
			}

			/* If we are deleting or renaming the entry, keep
			 * track of its tmpfs_dirent so that it can be
			 * easily deleted later. */
			if ((cnp->cn_flags & ISLASTCN) &&
			    (cnp->cn_nameiop == DELETE ||
			    cnp->cn_nameiop == RENAME)) {
				error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred,
				    cnp->cn_thread);
				if (error != 0)
					goto out;

				/* Allocate a new vnode on the matching entry. */
				error = tmpfs_alloc_vp(dvp->v_mount, tnode,
						cnp->cn_lkflags, vpp);
				if (error != 0)
					goto out;

				if ((dnode->tn_mode & S_ISTXT) &&
				  VOP_ACCESS(dvp, VADMIN, cnp->cn_cred, cnp->cn_thread) &&
				  VOP_ACCESS(*vpp, VADMIN, cnp->cn_cred, cnp->cn_thread)) {
					error = EPERM;
					vput(*vpp);
					*vpp = NULL;
					goto out;
				}
				cnp->cn_flags |= SAVENAME;
			} else {
				error = tmpfs_alloc_vp(dvp->v_mount, tnode,
						cnp->cn_lkflags, vpp);
			}
		}
	}

	/* Store the result of this lookup in the cache.  Avoid this if the
	 * request was for creation, as it does not improve timings on
	 * emprical tests. */
	if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE)
		cache_enter(dvp, *vpp, cnp);

out:
	/* If there were no errors, *vpp cannot be null and it must be
	 * locked. */
	MPASS(IFF(error == 0, *vpp != NULLVP && VOP_ISLOCKED(*vpp)));

	return error;
}
Ejemplo n.º 20
0
/*
 * Handle a page fault at virtual address vaddr in the given map.
 *
 * The map entry covering vaddr is looked up, the chain of backing objects
 * is searched for the page (allocating a page and asking the pager where
 * needed), a copy-on-write copy into the top-level object is made when a
 * write touches a page owned by a backing object, and the resulting page
 * is entered into the map's pmap.
 *
 *	map		map in which the fault occurred
 *	vaddr		faulting virtual address
 *	fault_type	access type that faulted (VM_PROT_* bits)
 *	fault_flags	VM_FAULT_CHANGE_WIRING and VM_FAULT_DIRTY are
 *			examined here
 *	m_hold		if not NULL, receives the resolved page, on which
 *			vm_page_hold() has been called
 *
 * Returns KERN_SUCCESS on success.  Otherwise returns the failure code of
 * the map lookup, KERN_FAILURE (stack growth failed or pager I/O error), or
 * KERN_PROTECTION_FAILURE (dead object, index beyond the object's size, or
 * the pager reported VM_PAGER_BAD).
 */
int
vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
    int fault_flags, vm_page_t *m_hold)
{
	vm_prot_t prot;
	long ahead, behind;
	int alloc_req, era, faultcount, nera, reqpage, result;
	boolean_t growstack, is_first_object_locked, wired;
	int map_generation;
	vm_object_t next_object;
	vm_page_t marray[VM_FAULT_READ_MAX];
	int hardfault;
	struct faultstate fs;
	struct vnode *vp;
	int locked, error;

	hardfault = 0;
	growstack = TRUE;
	PCPU_INC(cnt.v_vm_faults);
	fs.vp = NULL;
	faultcount = reqpage = 0;

RetryFault:;

	/*
	 * Find the backing store object and offset into it to begin the
	 * search.
	 */
	fs.map = map;
	result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry,
	    &fs.first_object, &fs.first_pindex, &prot, &wired);
	if (result != KERN_SUCCESS) {
		/*
		 * An invalid address outside the kernel map gets one
		 * stack-growth attempt; growstack is cleared so the retry
		 * cannot loop here a second time.
		 */
		if (growstack && result == KERN_INVALID_ADDRESS &&
		    map != kernel_map) {
			result = vm_map_growstack(curproc, vaddr);
			if (result != KERN_SUCCESS)
				return (KERN_FAILURE);
			growstack = FALSE;
			goto RetryFault;
		}
		return (result);
	}

	/*
	 * Remember the map's timestamp so a change made while the map is
	 * unlocked can be detected before the page is entered.
	 */
	map_generation = fs.map->timestamp;

	if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
		panic("vm_fault: fault on nofault entry, addr: %lx",
		    (u_long)vaddr);
	}

	/*
	 * Make a reference to this object to prevent its disposal while we
	 * are messing with it.  Once we have the reference, the map is free
	 * to be diddled.  Since objects reference their shadows (and copies),
	 * they will stay around as well.
	 *
	 * Bump the paging-in-progress count to prevent size changes (e.g.
	 * truncation operations) during I/O.  This must be done after
	 * obtaining the vnode lock in order to avoid possible deadlocks.
	 */
	VM_OBJECT_WLOCK(fs.first_object);
	vm_object_reference_locked(fs.first_object);
	vm_object_pip_add(fs.first_object, 1);

	fs.lookup_still_valid = TRUE;

	/*
	 * For a wired entry, fault with the full protection of the entry,
	 * keeping only the caller's VM_PROT_COPY bit.
	 */
	if (wired)
		fault_type = prot | (fault_type & VM_PROT_COPY);

	fs.first_m = NULL;

	/*
	 * Search for the page at object/offset.
	 */
	fs.object = fs.first_object;
	fs.pindex = fs.first_pindex;
	while (TRUE) {
		/*
		 * If the object is dead, we stop here
		 */
		if (fs.object->flags & OBJ_DEAD) {
			unlock_and_deallocate(&fs);
			return (KERN_PROTECTION_FAILURE);
		}

		/*
		 * See if page is resident
		 */
		fs.m = vm_page_lookup(fs.object, fs.pindex);
		if (fs.m != NULL) {
			/*
			 * check for page-based copy on write.
			 * We check fs.object == fs.first_object so
			 * as to ensure the legacy COW mechanism is
			 * used when the page in question is part of
			 * a shadow object.  Otherwise, vm_page_cowfault()
			 * removes the page from the backing object,
			 * which is not what we want.
			 */
			vm_page_lock(fs.m);
			if ((fs.m->cow) && 
			    (fault_type & VM_PROT_WRITE) &&
			    (fs.object == fs.first_object)) {
				vm_page_cowfault(fs.m);
				unlock_and_deallocate(&fs);
				goto RetryFault;
			}

			/*
			 * Wait/Retry if the page is busy.  We have to do this
			 * if the page is busy via either VPO_BUSY or
			 * vm_page_t->busy because the vm_pager may be using
			 * vm_page_t->busy for pageouts ( and even pageins if
			 * it is the vnode pager ), and we could end up trying
			 * to pagein and pageout the same page simultaneously.
			 *
			 * We can theoretically allow the busy case on a read
			 * fault if the page is marked valid, but since such
			 * pages are typically already pmap'd, putting that
			 * special case in might be more effort than it is
			 * worth.  We cannot under any circumstances mess
			 * around with a vm_page_t->busy page except, perhaps,
			 * to pmap it.
			 */
			if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_aflag_set(fs.m, PGA_REFERENCED);
				vm_page_unlock(fs.m);
				if (fs.object != fs.first_object) {
					/*
					 * Free the placeholder page in the
					 * top-level object.  If its lock
					 * cannot be taken without blocking,
					 * drop fs.object's lock and take
					 * both locks, first_object first.
					 */
					if (!VM_OBJECT_TRYWLOCK(
					    fs.first_object)) {
						VM_OBJECT_WUNLOCK(fs.object);
						VM_OBJECT_WLOCK(fs.first_object);
						VM_OBJECT_WLOCK(fs.object);
					}
					vm_page_lock(fs.first_m);
					vm_page_free(fs.first_m);
					vm_page_unlock(fs.first_m);
					vm_object_pip_wakeup(fs.first_object);
					VM_OBJECT_WUNLOCK(fs.first_object);
					fs.first_m = NULL;
				}
				unlock_map(&fs);
				/*
				 * Only sleep if the same page is still at
				 * this object/index.
				 */
				if (fs.m == vm_page_lookup(fs.object,
				    fs.pindex)) {
					vm_page_sleep_if_busy(fs.m, TRUE,
					    "vmpfw");
				}
				vm_object_pip_wakeup(fs.object);
				VM_OBJECT_WUNLOCK(fs.object);
				PCPU_INC(cnt.v_intrans);
				vm_object_deallocate(fs.first_object);
				goto RetryFault;
			}
			vm_page_remque(fs.m);
			vm_page_unlock(fs.m);

			/*
			 * Mark page busy for other processes, and the
			 * pagedaemon.  If it still isn't completely valid
			 * (readable), jump to readrest, else break-out ( we
			 * found the page ).
			 */
			vm_page_busy(fs.m);
			if (fs.m->valid != VM_PAGE_BITS_ALL)
				goto readrest;
			break;
		}

		/*
		 * Page is not resident, If this is the search termination
		 * or the pager might contain the page, allocate a new page.
		 */
		if (TRYPAGER || fs.object == fs.first_object) {
			if (fs.pindex >= fs.object->size) {
				unlock_and_deallocate(&fs);
				return (KERN_PROTECTION_FAILURE);
			}

			/*
			 * Allocate a new page for this object/offset pair.
			 *
			 * Unlocked read of the p_flag is harmless. At
			 * worst, the P_KILLED might be not observed
			 * there, and allocation can fail, causing
			 * restart and new reading of the p_flag.
			 */
			fs.m = NULL;
			if (!vm_page_count_severe() || P_KILLED(curproc)) {
#if VM_NRESERVLEVEL > 0
				if ((fs.object->flags & OBJ_COLORED) == 0) {
					fs.object->flags |= OBJ_COLORED;
					fs.object->pg_color = atop(vaddr) -
					    fs.pindex;
				}
#endif
				alloc_req = P_KILLED(curproc) ?
				    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL;
				if (fs.object->type != OBJT_VNODE &&
				    fs.object->backing_object == NULL)
					alloc_req |= VM_ALLOC_ZERO;
				fs.m = vm_page_alloc(fs.object, fs.pindex,
				    alloc_req);
			}
			if (fs.m == NULL) {
				/* No page available: wait, then start over. */
				unlock_and_deallocate(&fs);
				VM_WAITPFAULT;
				goto RetryFault;
			} else if (fs.m->valid == VM_PAGE_BITS_ALL)
				break;
		}

readrest:
		/*
		 * We have found a valid page or we have allocated a new page.
		 * The page thus may not be valid or may not be entirely
		 * valid.
		 *
		 * Attempt to fault-in the page if there is a chance that the
		 * pager has it, and potentially fault in additional pages
		 * at the same time.
		 */
		if (TRYPAGER) {
			int rv;
			u_char behavior = vm_map_entry_behavior(fs.entry);

			if (behavior == MAP_ENTRY_BEHAV_RANDOM ||
			    P_KILLED(curproc)) {
				behind = 0;
				ahead = 0;
			} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
				behind = 0;
				ahead = atop(fs.entry->end - vaddr) - 1;
				if (ahead > VM_FAULT_READ_AHEAD_MAX)
					ahead = VM_FAULT_READ_AHEAD_MAX;
				if (fs.pindex == fs.entry->next_read)
					vm_fault_cache_behind(&fs,
					    VM_FAULT_READ_MAX);
			} else {
				/*
				 * If this is a sequential page fault, then
				 * arithmetically increase the number of pages
				 * in the read-ahead window.  Otherwise, reset
				 * the read-ahead window to its smallest size.
				 */
				behind = atop(vaddr - fs.entry->start);
				if (behind > VM_FAULT_READ_BEHIND)
					behind = VM_FAULT_READ_BEHIND;
				ahead = atop(fs.entry->end - vaddr) - 1;
				era = fs.entry->read_ahead;
				if (fs.pindex == fs.entry->next_read) {
					nera = era + behind;
					if (nera > VM_FAULT_READ_AHEAD_MAX)
						nera = VM_FAULT_READ_AHEAD_MAX;
					behind = 0;
					if (ahead > nera)
						ahead = nera;
					if (era == VM_FAULT_READ_AHEAD_MAX)
						vm_fault_cache_behind(&fs,
						    VM_FAULT_CACHE_BEHIND);
				} else if (ahead > VM_FAULT_READ_AHEAD_MIN)
					ahead = VM_FAULT_READ_AHEAD_MIN;
				if (era != ahead)
					fs.entry->read_ahead = ahead;
			}

			/*
			 * Call the pager to retrieve the data, if any, after
			 * releasing the lock on the map.  We hold a ref on
			 * fs.object and the pages are VPO_BUSY'd.
			 */
			unlock_map(&fs);

			if (fs.object->type == OBJT_VNODE) {
				vp = fs.object->handle;
				if (vp == fs.vp)
					goto vnode_locked;
				else if (fs.vp != NULL) {
					vput(fs.vp);
					fs.vp = NULL;
				}
				locked = VOP_ISLOCKED(vp);

				/*
				 * Request a shared lock unless the vnode is
				 * already reported as LK_EXCLUSIVE.
				 */
				if (locked != LK_EXCLUSIVE)
					locked = LK_SHARED;
				/* Do not sleep for vnode lock while fs.m is busy */
				error = vget(vp, locked | LK_CANRECURSE |
				    LK_NOWAIT, curthread);
				if (error != 0) {
					/*
					 * The non-blocking attempt failed.
					 * Hold the vnode, release the page
					 * and all fault state, then block
					 * for the lock and restart the fault
					 * with the vnode already locked.
					 */
					vhold(vp);
					release_page(&fs);
					unlock_and_deallocate(&fs);
					error = vget(vp, locked | LK_RETRY |
					    LK_CANRECURSE, curthread);
					vdrop(vp);
					fs.vp = vp;
					KASSERT(error == 0,
					    ("vm_fault: vget failed"));
					goto RetryFault;
				}
				fs.vp = vp;
			}
vnode_locked:
			KASSERT(fs.vp == NULL || !fs.map->system_map,
			    ("vm_fault: vnode-backed object mapped by system map"));

			/*
			 * now we find out if any other pages should be paged
			 * in at this time this routine checks to see if the
			 * pages surrounding this fault reside in the same
			 * object as the page for this fault.  If they do,
			 * then they are faulted in also into the object.  The
			 * array "marray" returned contains an array of
			 * vm_page_t structs where one of them is the
			 * vm_page_t passed to the routine.  The reqpage
			 * return value is the index into the marray for the
			 * vm_page_t passed to the routine.
			 *
			 * fs.m plus the additional pages are VPO_BUSY'd.
			 */
			faultcount = vm_fault_additional_pages(
			    fs.m, behind, ahead, marray, &reqpage);

			rv = faultcount ?
			    vm_pager_get_pages(fs.object, marray, faultcount,
				reqpage) : VM_PAGER_FAIL;

			if (rv == VM_PAGER_OK) {
				/*
				 * Found the page. Leave it busy while we play
				 * with it.
				 */

				/*
				 * Relookup in case pager changed page. Pager
				 * is responsible for disposition of old page
				 * if moved.
				 */
				fs.m = vm_page_lookup(fs.object, fs.pindex);
				if (!fs.m) {
					unlock_and_deallocate(&fs);
					goto RetryFault;
				}

				hardfault++;
				break; /* break to PAGE HAS BEEN FOUND */
			}
			/*
			 * Remove the bogus page (which does not exist at this
			 * object/offset); before doing so, we must get back
			 * our object lock to preserve our invariant.
			 *
			 * Also wake up any other process that may want to bring
			 * in this page.
			 *
			 * If this is the top-level object, we must leave the
			 * busy page to prevent another process from rushing
			 * past us, and inserting the page in that object at
			 * the same time that we are.
			 */
			if (rv == VM_PAGER_ERROR)
				printf("vm_fault: pager read error, pid %d (%s)\n",
				    curproc->p_pid, curproc->p_comm);
			/*
			 * Data outside the range of the pager or an I/O error
			 */
			/*
			 * XXX - the check for kernel_map is a kludge to work
			 * around having the machine panic on a kernel space
			 * fault w/ I/O error.
			 */
			if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) ||
				(rv == VM_PAGER_BAD)) {
				vm_page_lock(fs.m);
				vm_page_free(fs.m);
				vm_page_unlock(fs.m);
				fs.m = NULL;
				unlock_and_deallocate(&fs);
				return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE);
			}
			if (fs.object != fs.first_object) {
				vm_page_lock(fs.m);
				vm_page_free(fs.m);
				vm_page_unlock(fs.m);
				fs.m = NULL;
				/*
				 * XXX - we cannot just fall out at this
				 * point, m has been freed and is invalid!
				 */
			}
		}

		/*
		 * We get here if the object has default pager (or unwiring)
		 * or the pager doesn't have the page.
		 */
		if (fs.object == fs.first_object)
			fs.first_m = fs.m;

		/*
		 * Move on to the next object.  Lock the next object before
		 * unlocking the current one.
		 */
		fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset);
		next_object = fs.object->backing_object;
		if (next_object == NULL) {
			/*
			 * If there's no object left, fill the page in the top
			 * object with zeros.
			 */
			if (fs.object != fs.first_object) {
				vm_object_pip_wakeup(fs.object);
				VM_OBJECT_WUNLOCK(fs.object);

				fs.object = fs.first_object;
				fs.pindex = fs.first_pindex;
				fs.m = fs.first_m;
				VM_OBJECT_WLOCK(fs.object);
			}
			fs.first_m = NULL;

			/*
			 * Zero the page if necessary and mark it valid.
			 */
			if ((fs.m->flags & PG_ZERO) == 0) {
				pmap_zero_page(fs.m);
			} else {
				PCPU_INC(cnt.v_ozfod);
			}
			PCPU_INC(cnt.v_zfod);
			fs.m->valid = VM_PAGE_BITS_ALL;
			break;	/* break to PAGE HAS BEEN FOUND */
		} else {
			KASSERT(fs.object != next_object,
			    ("object loop %p", next_object));
			VM_OBJECT_WLOCK(next_object);
			vm_object_pip_add(next_object, 1);
			if (fs.object != fs.first_object)
				vm_object_pip_wakeup(fs.object);
			VM_OBJECT_WUNLOCK(fs.object);
			fs.object = next_object;
		}
	}

	KASSERT((fs.m->oflags & VPO_BUSY) != 0,
	    ("vm_fault: not busy after main loop"));

	/*
	 * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock
	 * is held.]
	 */

	/*
	 * If the page is being written, but isn't already owned by the
	 * top-level object, we have to copy it into a new page owned by the
	 * top-level object.
	 */
	if (fs.object != fs.first_object) {
		/*
		 * We only really need to copy if we want to write it.
		 */
		if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
			/*
			 * This allows pages to be virtually copied from a
			 * backing_object into the first_object, where the
			 * backing object has no other refs to it, and cannot
			 * gain any more refs.  Instead of a bcopy, we just
			 * move the page from the backing object to the
			 * first object.  Note that we must mark the page
			 * dirty in the first object so that it will go out
			 * to swap when needed.
			 */
			is_first_object_locked = FALSE;
			if (
				/*
				 * Only one shadow object
				 */
				(fs.object->shadow_count == 1) &&
				/*
				 * No COW refs, except us
				 */
				(fs.object->ref_count == 1) &&
				/*
				 * No one else can look this object up
				 */
				(fs.object->handle == NULL) &&
				/*
				 * No other ways to look the object up
				 */
				((fs.object->type == OBJT_DEFAULT) ||
				 (fs.object->type == OBJT_SWAP)) &&
			    (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) &&
				/*
				 * We don't chase down the shadow chain
				 */
			    fs.object == fs.first_object->backing_object) {
				/*
				 * get rid of the unnecessary page
				 */
				vm_page_lock(fs.first_m);
				vm_page_free(fs.first_m);
				vm_page_unlock(fs.first_m);
				/*
				 * grab the page and put it into the
				 * process's object.  The page is
				 * automatically made dirty.
				 */
				vm_page_lock(fs.m);
				vm_page_rename(fs.m, fs.first_object, fs.first_pindex);
				vm_page_unlock(fs.m);
				vm_page_busy(fs.m);
				fs.first_m = fs.m;
				fs.m = NULL;
				PCPU_INC(cnt.v_cow_optim);
			} else {
				/*
				 * Oh, well, lets copy it.
				 */
				pmap_copy_page(fs.m, fs.first_m);
				fs.first_m->valid = VM_PAGE_BITS_ALL;
				if (wired && (fault_flags &
				    VM_FAULT_CHANGE_WIRING) == 0) {
					vm_page_lock(fs.first_m);
					vm_page_wire(fs.first_m);
					vm_page_unlock(fs.first_m);
					
					vm_page_lock(fs.m);
					vm_page_unwire(fs.m, FALSE);
					vm_page_unlock(fs.m);
				}
				/*
				 * We no longer need the old page or object.
				 */
				release_page(&fs);
			}
			/*
			 * fs.object != fs.first_object due to above
			 * conditional
			 */
			vm_object_pip_wakeup(fs.object);
			VM_OBJECT_WUNLOCK(fs.object);
			/*
			 * Only use the new page below...
			 */
			fs.object = fs.first_object;
			fs.pindex = fs.first_pindex;
			fs.m = fs.first_m;
			/*
			 * The first object's lock may already have been
			 * taken by the try-lock in the condition above.
			 */
			if (!is_first_object_locked)
				VM_OBJECT_WLOCK(fs.object);
			PCPU_INC(cnt.v_cow_faults);
			curthread->td_cow++;
		} else {
			/*
			 * Read access to a backing-object page: map it
			 * without write permission.
			 */
			prot &= ~VM_PROT_WRITE;
		}
	}

	/*
	 * We must verify that the maps have not changed since our last
	 * lookup.
	 */
	if (!fs.lookup_still_valid) {
		vm_object_t retry_object;
		vm_pindex_t retry_pindex;
		vm_prot_t retry_prot;

		if (!vm_map_trylock_read(fs.map)) {
			release_page(&fs);
			unlock_and_deallocate(&fs);
			goto RetryFault;
		}
		fs.lookup_still_valid = TRUE;
		if (fs.map->timestamp != map_generation) {
			result = vm_map_lookup_locked(&fs.map, vaddr, fault_type,
			    &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired);

			/*
			 * If we don't need the page any longer, put it on the inactive
			 * list (the easiest thing to do here).  If no one needs it,
			 * pageout will grab it eventually.
			 */
			if (result != KERN_SUCCESS) {
				release_page(&fs);
				unlock_and_deallocate(&fs);

				/*
				 * If retry of map lookup would have blocked then
				 * retry fault from start.
				 */
				if (result == KERN_FAILURE)
					goto RetryFault;
				return (result);
			}
			if ((retry_object != fs.first_object) ||
			    (retry_pindex != fs.first_pindex)) {
				release_page(&fs);
				unlock_and_deallocate(&fs);
				goto RetryFault;
			}

			/*
			 * Check whether the protection has changed or the object has
			 * been copied while we left the map unlocked. Changing from
			 * read to write permission is OK - we leave the page
			 * write-protected, and catch the write fault. Changing from
			 * write to read permission means that we can't mark the page
			 * write-enabled after all.
			 */
			prot &= retry_prot;
		}
	}
	/*
	 * If the page was filled by a pager, update the map entry's
	 * last read offset.  Since the pager does not return the
	 * actual set of pages that it read, this update is based on
	 * the requested set.  Typically, the requested and actual
	 * sets are the same.
	 *
	 * XXX The following assignment modifies the map
	 * without holding a write lock on it.
	 */
	if (hardfault)
		fs.entry->next_read = fs.pindex + faultcount - reqpage;

	if ((prot & VM_PROT_WRITE) != 0 ||
	    (fault_flags & VM_FAULT_DIRTY) != 0) {
		vm_object_set_writeable_dirty(fs.object);

		/*
		 * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
		 * if the page is already dirty to prevent data written with
		 * the expectation of being synced from not being synced.
		 * Likewise if this entry does not request NOSYNC then make
		 * sure the page isn't marked NOSYNC.  Applications sharing
		 * data should use the same flags to avoid ping ponging.
		 */
		if (fs.entry->eflags & MAP_ENTRY_NOSYNC) {
			if (fs.m->dirty == 0)
				fs.m->oflags |= VPO_NOSYNC;
		} else {
			fs.m->oflags &= ~VPO_NOSYNC;
		}

		/*
		 * If the fault is a write, we know that this page is being
		 * written NOW so dirty it explicitly to save on
		 * pmap_is_modified() calls later.
		 *
		 * Also tell the backing pager, if any, that it should remove
		 * any swap backing since the page is now dirty.
		 */
		if (((fault_type & VM_PROT_WRITE) != 0 &&
		    (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) ||
		    (fault_flags & VM_FAULT_DIRTY) != 0) {
			vm_page_dirty(fs.m);
			vm_pager_page_unswapped(fs.m);
		}
	}

	/*
	 * Page had better still be busy
	 */
	KASSERT(fs.m->oflags & VPO_BUSY,
		("vm_fault: page %p not busy!", fs.m));
	/*
	 * Page must be completely valid or it is not fit to
	 * map into user space.  vm_pager_get_pages() ensures this.
	 */
	KASSERT(fs.m->valid == VM_PAGE_BITS_ALL,
	    ("vm_fault: page %p partially invalid", fs.m));
	VM_OBJECT_WUNLOCK(fs.object);

	/*
	 * Put this page into the physical map.  We had to do the unlock above
	 * because pmap_enter() may sleep.  We don't put the page
	 * back on the active queue until later so that the pageout daemon
	 * won't find it (yet).
	 */
	pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired);
	if ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 && wired == 0)
		vm_fault_prefault(fs.map->pmap, vaddr, fs.entry);
	VM_OBJECT_WLOCK(fs.object);
	vm_page_lock(fs.m);

	/*
	 * If the page is not wired down, then put it where the pageout daemon
	 * can find it.
	 */
	if (fault_flags & VM_FAULT_CHANGE_WIRING) {
		if (wired)
			vm_page_wire(fs.m);
		else
			vm_page_unwire(fs.m, 1);
	} else
		vm_page_activate(fs.m);
	/* Hand the page back to the caller, held, when requested. */
	if (m_hold != NULL) {
		*m_hold = fs.m;
		vm_page_hold(fs.m);
	}
	vm_page_unlock(fs.m);
	vm_page_wakeup(fs.m);

	/*
	 * Unlock everything, and return
	 */
	unlock_and_deallocate(&fs);
	/*
	 * A fault satisfied by the pager counts as a major fault, anything
	 * else as a minor fault.
	 */
	if (hardfault) {
		PCPU_INC(cnt.v_io_faults);
		curthread->td_ru.ru_majflt++;
	} else 
		curthread->td_ru.ru_minflt++;

	return (KERN_SUCCESS);
}
Ejemplo n.º 21
0
/*
 * We have to carry on the locking protocol on the null layer vnodes
 * as we progress through the tree. We also have to enforce read-only
 * if this layer is mounted read-only.
 *
 * Look up the component described by ap->a_cnp in the directory
 * ap->a_dvp by calling VOP_LOOKUP() on the lower directory vnode.
 *
 * When the lower lookup produces a vnode, *ap->a_vpp is set either to
 * dvp itself (the lower lookup returned the lower directory) or to the
 * vnode obtained from null_nodeget() for the lower result.
 *
 * Return values:
 *   EROFS   - the mount has MNT_RDONLY set and the last component is
 *             looked up for DELETE or RENAME, or the lower lookup
 *             returned EJUSTRETURN for CREATE or RENAME of the last
 *             component.
 *   ENOENT  - dvp was found doomed after an otherwise successful
 *             lower lookup.
 *   other   - whatever VOP_LOOKUP() on the lower vnode or
 *             null_nodeget() returned.
 */
static int
null_lookup(struct vop_lookup_args *ap)
{
	struct componentname *cnp = ap->a_cnp;
	struct vnode *dvp = ap->a_dvp;
	/* Snapshot of cn_flags taken before the lower lookup is called. */
	int flags = cnp->cn_flags;
	struct vnode *vp, *ldvp, *lvp;
	struct mount *mp;
	int error;

	mp = dvp->v_mount;
	/*
	 * Refuse DELETE and RENAME of the last component on a read-only
	 * mount up front, without calling into the lower layer.
	 */
	if ((flags & ISLASTCN) != 0 && (mp->mnt_flag & MNT_RDONLY) != 0 &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);
	/*
	 * Although it is possible to call null_bypass(), we'll do
	 * a direct call to reduce overhead
	 */
	ldvp = NULLVPTOLOWERVP(dvp);
	vp = lvp = NULL;
	/*
	 * If the lower directory is flagged VV_ROOT, then dvp must be
	 * flagged VV_ROOT too and this must not be a ".." lookup.
	 */
	KASSERT((ldvp->v_vflag & VV_ROOT) == 0 ||
	    ((dvp->v_vflag & VV_ROOT) != 0 && (flags & ISDOTDOT) == 0),
	    ("ldvp %p fl %#x dvp %p fl %#x flags %#x", ldvp, ldvp->v_vflag,
	     dvp, dvp->v_vflag, flags));

	/*
	 * Hold ldvp.  The reference on it, owned by dvp, is lost in
	 * case of dvp reclamation, and we need ldvp to move our lock
	 * from ldvp to dvp.
	 */
	vhold(ldvp);

	error = VOP_LOOKUP(ldvp, &lvp, cnp);

	/*
	 * VOP_LOOKUP() on lower vnode may unlock ldvp, which allows
	 * dvp to be reclaimed due to shared v_vnlock.  Check for the
	 * doomed state and return error.
	 */
	if ((error == 0 || error == EJUSTRETURN) &&
	    (dvp->v_iflag & VI_DOOMED) != 0) {
		/*
		 * Setting ENOENT here also makes both checks below
		 * false, so lvp is not touched again after the vput().
		 */
		error = ENOENT;
		if (lvp != NULL)
			vput(lvp);

		/*
		 * If vgone() reclaimed dvp before curthread
		 * relocked ldvp, the locks of dvp and ldvp are no
		 * longer shared.  In this case, relock of ldvp in
		 * lower fs VOP_LOOKUP() does not restore the locking
		 * state of dvp.  Compensate for this by unlocking
		 * ldvp and locking dvp, which is also correct if the
		 * locks are still shared.
		 */
		VOP_UNLOCK(ldvp, 0);
		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
	}
	/*
	 * Release the hold taken before the lower lookup.  From here on
	 * ldvp is only compared by address, never dereferenced.
	 */
	vdrop(ldvp);

	/*
	 * On a read-only mount, turn a lower-layer EJUSTRETURN for CREATE
	 * or RENAME of the last component into EROFS.
	 */
	if (error == EJUSTRETURN && (flags & ISLASTCN) != 0 &&
	    (mp->mnt_flag & MNT_RDONLY) != 0 &&
	    (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME))
		error = EROFS;

	if ((error == 0 || error == EJUSTRETURN) && lvp != NULL) {
		if (ldvp == lvp) {
			/*
			 * The lower lookup returned the lower directory
			 * itself: hand back dvp with a new reference and
			 * release the reference held on lvp.
			 */
			*ap->a_vpp = dvp;
			VREF(dvp);
			vrele(lvp);
		} else {
			/*
			 * Obtain the null layer vnode for lvp; *ap->a_vpp
			 * is written only on success.
			 * NOTE(review): nothing in this function releases
			 * lvp when null_nodeget() fails -- confirm that
			 * null_nodeget() disposes of it on error.
			 */
			error = null_nodeget(mp, lvp, &vp);
			if (error == 0)
				*ap->a_vpp = vp;
		}
	}
	return (error);
}