Example #1
0
static int
tmpfs_write (struct vop_write_args *ap)
{
	struct buf *bp;
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct tmpfs_node *node;
	boolean_t extended;
	off_t oldsize;
	int error;
	off_t base_offset;
	size_t offset;
	size_t len;
	struct rlimit limit;
	int trivial = 0;
	int kflags = 0;
	int seqcount;

	error = 0;
	if (uio->uio_resid == 0) {
		return error;
	}

	node = VP_TO_TMPFS_NODE(vp);

	if (vp->v_type != VREG)
		return (EINVAL);
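	/*
	 * The caller's sequential-access heuristic is carried in the
	 * upper 16 bits of a_ioflag.
	 */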
	seqcount = ap->a_ioflag >> 16;

	TMPFS_NODE_LOCK(node);

	oldsize = node->tn_size;
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = node->tn_size;

	/*
	 * Check for illegal write offsets.
	 */
	if (uio->uio_offset + uio->uio_resid >
	  VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) {
		error = EFBIG;
		goto done;
	}

	/*
	 * NOTE: Ignore if UIO does not come from a user thread (e.g. VN).
	 */
	if (vp->v_type == VREG && td != NULL && td->td_lwp != NULL) {
		error = kern_getrlimit(RLIMIT_FSIZE, &limit);
		if (error)
			goto done;
		if (uio->uio_offset + uio->uio_resid > limit.rlim_cur) {
			ksignal(td->td_proc, SIGXFSZ);
			error = EFBIG;
			goto done;
		}
	}

	/*
	 * Extend the file's size if necessary
	 */
	extended = ((uio->uio_offset + uio->uio_resid) > node->tn_size);

	while (uio->uio_resid > 0) {
		/*
		 * Don't completely blow out running buffer I/O
		 * when being hit from the pageout daemon.
		 */
		if (uio->uio_segflg == UIO_NOCOPY &&
		    (ap->a_ioflag & IO_RECURSE) == 0) {
			bwillwrite(TMPFS_BLKSIZE);
		}

		/*
		 * Use buffer cache I/O (via tmpfs_strategy)
		 */
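		/*
		 * Determine the portion of the block covered by this
		 * iteration: 'offset' is the byte offset within the
		 * TMPFS_BLKSIZE block and 'len' is clipped to the
		 * remaining uio resid.
		 */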
		offset = (size_t)uio->uio_offset & TMPFS_BLKMASK64;
		base_offset = (off_t)uio->uio_offset - offset;
		len = TMPFS_BLKSIZE - offset;
		if (len > uio->uio_resid)
			len = uio->uio_resid;

		/*
		 * If the write extends past the current EOF, grow the file
		 * first.  The resize is "trivial" when the write begins at
		 * or before the old EOF, i.e. it leaves no gap that would
		 * have to be zero-filled separately.
		 */
		if ((uio->uio_offset + len) > node->tn_size) {
			trivial = (uio->uio_offset <= node->tn_size);
			error = tmpfs_reg_resize(vp, uio->uio_offset + len,
						 trivial);
			if (error)
				break;
		}

		/*
		 * Read to fill in any gaps.  Theoretically we could
		 * optimize this if the write covers the entire buffer
		 * and is not a UIO_NOCOPY write, however this can lead
		 * to a security violation exposing random kernel memory
		 * (whatever junk was in the backing VM pages before).
		 *
		 * So just use bread() to do the right thing.
		 */
		error = bread(vp, base_offset, TMPFS_BLKSIZE, &bp);
		if (error) {
			kprintf("tmpfs_write bread error %d\n", error);
			break;
		}
		error = uiomovebp(bp, (char *)bp->b_data + offset, len, uio);
		if (error) {
			kprintf("tmpfs_write uiomove error %d\n", error);
			brelse(bp);
			break;
		}

		if (uio->uio_offset > node->tn_size) {
			node->tn_size = uio->uio_offset;
			kflags |= NOTE_EXTEND;
		}
		kflags |= NOTE_WRITE;

		/*
		 * Always try to flush the page in the UIO_NOCOPY case.  This
		 * can come from the pageout daemon or during vnode eviction.
		 * It is not necessarily going to be marked IO_ASYNC/IO_SYNC.
		 *
		 * For the normal case we buwrite(), which dirties the
		 * underlying VM pages instead of the buffer and then
		 * releases the buffer as a clean buffer.  This allows tmpfs to use
		 * essentially all available memory to cache file data.
		 * If we used bdwrite() the buffer cache would wind up
		 * flushing the data to swap too quickly.
		 *
		 * But because tmpfs can seriously load the VM system we
		 * fall-back to using bdwrite() when free memory starts
		 * to get low.  This shifts the load away from the VM system
		 * and makes tmpfs act more like a normal filesystem with
		 * regards to disk activity.
		 *
		 * tmpfs pretty much fiddles directly with the VM
		 * system, don't let it exhaust it or we won't play
		 * nice with other processes.  Only do this if the
		 * VOP is coming from a normal read/write.  The VM system
		 * handles the case for UIO_NOCOPY.
		 */
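		/*
		 * Mark the buffer eligible for write clustering with its
		 * neighbors when it is eventually flushed.
		 */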
		bp->b_flags |= B_CLUSTEROK;
		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Flush from the pageout daemon, deal with
			 * potentially very heavy tmpfs write activity
			 * causing long stalls in the pageout daemon
			 * before pages get to free/cache.
			 *
			 * (a) Under severe pressure setting B_DIRECT will
			 *     cause a buffer release to try to free the
			 *     underlying pages.
			 *
			 * (b) Under modest memory pressure the B_RELBUF
			 *     alone is sufficient to get the pages moved
			 *     to the cache.  We could also force this by
			 *     setting B_NOTMETA but that might have other
			 *     unintended side-effects (e.g. setting
			 *     PG_NOTMETA on the VM page).
			 *
			 * Hopefully this will unblock the VM system more
			 * quickly under extreme tmpfs write load.
			 */
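			/*
			 * A severe free-page shortage corresponds to case
			 * (a) above; the B_AGE | B_RELBUF below covers
			 * case (b).
			 */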
			if (vm_page_count_min(vm_page_free_hysteresis))
				bp->b_flags |= B_DIRECT;
			bp->b_flags |= B_AGE | B_RELBUF;
			bp->b_act_count = 0;	/* buffer->deactivate pgs */
			cluster_awrite(bp);
		} else if (vm_page_count_target()) {
			/*
			 * Normal (userland) write but we are low on memory,
			 * so run the buffer through the buffer cache.
			 */
			bp->b_act_count = 0;	/* buffer->deactivate pgs */
			bdwrite(bp);
		} else {
			/*
			 * Otherwise run the buffer directly through to the
			 * backing VM store.
			 */
			buwrite(bp);
			/*vm_wait_nominal();*/
		}

		if (bp->b_error) {
			kprintf("tmpfs_write bwrite error %d\n", bp->b_error);
			break;
		}
	}

	/*
	 * On error, roll the file size back to its original value so a
	 * partially completed extension is not left behind.
	 */
	if (error) {
		if (extended) {
			(void)tmpfs_reg_resize(vp, oldsize, trivial);
			kflags &= ~NOTE_EXTEND;
		}
		goto done;
	}

	/*
	 * Currently we don't set the mtime on files modified via mmap()
	 * because we can't tell the difference between those modifications
	 * and an attempt by the pageout daemon to flush tmpfs pages to
	 * swap.
	 *
	 * This is because in order to defer flushes as long as possible
	 * buwrite() works by marking the underlying VM pages dirty in
	 * order to be able to dispose of the buffer cache buffer without
	 * flushing it.
	 */
	if (uio->uio_segflg != UIO_NOCOPY)
		node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED;
	if (extended)
		node->tn_status |= TMPFS_NODE_CHANGED;

	/*
	 * Writing with an unprivileged credential clears any setuid/setgid
	 * bits, per the usual filesystem semantics.
	 */
	if (node->tn_mode & (S_ISUID | S_ISGID)) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0))
			node->tn_mode &= ~(S_ISUID | S_ISGID);
	}
done:
	TMPFS_NODE_UNLOCK(node);
	if (kflags)
		tmpfs_knote(vp, kflags);

	return(error);
}
Example #2
0
static int
tmpfs_write (struct vop_write_args *ap)
{
	struct buf *bp;
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct tmpfs_node *node;
	boolean_t extended;
	off_t oldsize;
	int error;
	off_t base_offset;
	size_t offset;
	size_t len;
	struct rlimit limit;
	int trivial = 0;
	int kflags = 0;

	error = 0;
	if (uio->uio_resid == 0) {
		return error;
	}

	node = VP_TO_TMPFS_NODE(vp);

	if (vp->v_type != VREG)
		return (EINVAL);

	/*
	 * Serialize this write against other operations on the mount via
	 * the per-mount token.
	 */
	lwkt_gettoken(&vp->v_mount->mnt_token);

	oldsize = node->tn_size;
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = node->tn_size;

	/*
	 * Check for illegal write offsets.
	 */
	if (uio->uio_offset + uio->uio_resid >
	  VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) {
		lwkt_reltoken(&vp->v_mount->mnt_token);
		return (EFBIG);
	}

	/*
	 * Enforce the process file-size resource limit when the write is
	 * issued on behalf of a user thread.
	 */
	if (vp->v_type == VREG && td != NULL) {
		error = kern_getrlimit(RLIMIT_FSIZE, &limit);
		if (error != 0) {
			lwkt_reltoken(&vp->v_mount->mnt_token);
			return error;
		}
		if (uio->uio_offset + uio->uio_resid > limit.rlim_cur) {
			ksignal(td->td_proc, SIGXFSZ);
			lwkt_reltoken(&vp->v_mount->mnt_token);
			return (EFBIG);
		}
	}

	/*
	 * Extend the file's size if necessary
	 */
	extended = ((uio->uio_offset + uio->uio_resid) > node->tn_size);

	while (uio->uio_resid > 0) {
		/*
		 * Use buffer cache I/O (via tmpfs_strategy)
		 */
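		/*
		 * 'offset' is the byte offset within the BSIZE block and
		 * 'len' is clipped to the remaining uio resid.
		 */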
		offset = (size_t)uio->uio_offset & BMASK;
		base_offset = (off_t)uio->uio_offset - offset;
		len = BSIZE - offset;
		if (len > uio->uio_resid)
			len = uio->uio_resid;

		if ((uio->uio_offset + len) > node->tn_size) {
			trivial = (uio->uio_offset <= node->tn_size);
			error = tmpfs_reg_resize(vp, uio->uio_offset + len,
						 trivial);
			if (error)
				break;
		}

		/*
		 * Read to fill in any gaps.  Theoretically we could
		 * optimize this if the write covers the entire buffer
		 * and is not a UIO_NOCOPY write, however this can lead
		 * to a security violation exposing random kernel memory
		 * (whatever junk was in the backing VM pages before).
		 *
		 * So just use bread() to do the right thing.
		 */
		error = bread(vp, base_offset, BSIZE, &bp);
		if (error) {
			kprintf("tmpfs_write bread error %d\n", error);
			break;
		}
		error = uiomovebp(bp, (char *)bp->b_data + offset, len, uio);
		if (error) {
			kprintf("tmpfs_write uiomove error %d\n", error);
			brelse(bp);
			break;
		}

		if (uio->uio_offset > node->tn_size) {
			node->tn_size = uio->uio_offset;
			kflags |= NOTE_EXTEND;
		}
		kflags |= NOTE_WRITE;

		/*
		 * Always try to flush the page if the request is coming
		 * from the pageout daemon (IO_ASYNC), else buwrite() the
		 * buffer.
		 *
		 * buwrite() dirties the underlying VM pages instead of
		 * dirtying the buffer, releasing the buffer as a clean
		 * buffer.  This allows tmpfs to use essentially all
		 * available memory to cache file data.  If we used bdwrite()
		 * the buffer cache would wind up flushing the data to
		 * swap too quickly.
		 */
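		/*
		 * B_AGE hints that the buffer should be recycled early
		 * once it is released.
		 */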
		bp->b_flags |= B_AGE;
		if (ap->a_ioflag & IO_ASYNC) {
			bawrite(bp);
		} else {
			buwrite(bp);
		}

		if (bp->b_error) {
			kprintf("tmpfs_write bwrite error %d\n", bp->b_error);
			break;
		}
	}

	if (error) {
		if (extended) {
			(void)tmpfs_reg_resize(vp, oldsize, trivial);
			kflags &= ~NOTE_EXTEND;
		}
		goto done;
	}

	/*
	 * Currently we don't set the mtime on files modified via mmap()
	 * because we can't tell the difference between those modifications
	 * and an attempt by the pageout daemon to flush tmpfs pages to
	 * swap.
	 *
	 * This is because in order to defer flushes as long as possible
	 * buwrite() works by marking the underlying VM pages dirty in
	 * order to be able to dispose of the buffer cache buffer without
	 * flushing it.
	 */
	TMPFS_NODE_LOCK(node);
	if (uio->uio_segflg != UIO_NOCOPY)
		node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED;
	if (extended)
		node->tn_status |= TMPFS_NODE_CHANGED;

	if (node->tn_mode & (S_ISUID | S_ISGID)) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0))
			node->tn_mode &= ~(S_ISUID | S_ISGID);
	}
	TMPFS_NODE_UNLOCK(node);
done:
	tmpfs_knote(vp, kflags);

	lwkt_reltoken(&vp->v_mount->mnt_token);
	return(error);
}
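
Both examples carve the uio into block-aligned segments before handing each piece to the buffer cache. The standalone sketch below (hypothetical names; userland C; 16384 chosen arbitrarily as a stand-in for TMPFS_BLKSIZE/BSIZE) illustrates just that arithmetic: each iteration computes the offset within the current block, the block's base offset, and a length clipped to the remaining residual.

#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>

#define BLKSIZE	16384			/* arbitrary stand-in for TMPFS_BLKSIZE/BSIZE */
#define BLKMASK	((off_t)BLKSIZE - 1)	/* stand-in for TMPFS_BLKMASK64/BMASK */

/*
 * Print the block-aligned segments a write of 'resid' bytes starting at
 * 'woffset' would be split into, mirroring the segmentation loop in
 * tmpfs_write().
 */
static void
split_write(off_t woffset, size_t resid)
{
	while (resid > 0) {
		size_t offset = (size_t)(woffset & BLKMASK);	/* offset within block */
		off_t base_offset = woffset - (off_t)offset;	/* block base offset */
		size_t len = BLKSIZE - offset;			/* rest of this block */

		if (len > resid)
			len = resid;

		printf("base %jd  in-block offset %zu  len %zu\n",
		       (intmax_t)base_offset, offset, len);

		woffset += (off_t)len;
		resid -= len;
	}
}

int
main(void)
{
	/* A 40000-byte write starting 100 bytes into the second block. */
	split_write((off_t)BLKSIZE + 100, 40000);
	return (0);
}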