Пример #1
0
static void
filemon_output(struct filemon *filemon, char *msg, size_t len)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (filemon->fp == NULL)
		return;

	aiov.iov_base = msg;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = curthread;
	auio.uio_offset = (off_t) -1;

	if (filemon->fp->f_type == DTYPE_VNODE)
		bwillwrite();

	error = fo_write(filemon->fp, &auio, curthread->td_ucred, 0, curthread);
	if (error != 0)
		filemon->error = error;
}
Пример #2
0
/*
 * Write buffer to filp inside kernel
 */
int
kern_file_write (cfs_file_t *fp, void *buf, size_t nbyte, loff_t *pos)
{
	struct uio auio;
	struct iovec aiov;
	struct proc *p = current_proc();
	long cnt, error = 0;
        int flags = 0;
        CFS_DECL_CONE_DATA;

	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
        if (pos != NULL) {
	        auio.uio_offset = *pos;
                /* 
                 * Liang: If don't set FOF_OFFSET, vn_write()
                 * will use fp->f_offset as the the real offset.
                 * Same in vn_read()
                 */
                flags |= FOF_OFFSET;
        } else
                auio.uio_offset = (off_t)-1;
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_procp = p;

	cnt = nbyte;
        CFS_CONE_IN;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();	/* empty stuff now */
	if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
		if (auio.uio_resid != cnt && (error == ERESTART ||\
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* The socket layer handles SIGPIPE */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
			psignal(p, SIGPIPE);
	}
        CFS_CONE_EX;
	if (error != 0)
		cnt = -error;
	else
		cnt -= auio.uio_resid;
        if (pos != NULL)
                *pos += cnt;
	return cnt;
}
Пример #3
0
/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we 
 * check bwillwrite() before calling vn_rdwr().  We also call lwkt_user_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 *
 * MPSAFE
 */
int
vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
		 off_t offset, enum uio_seg segflg, int ioflg,
		 struct ucred *cred, int *aresid)
{
	int error = 0;

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (vp->v_type == VREG) {
			switch(rw) {
			case UIO_READ:
				bwillread(chunk);
				break;
			case UIO_WRITE:
				bwillwrite(chunk);
				break;
			}
		}
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
				ioflg, cred, aresid);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base += chunk;
		lwkt_user_yield();
	} while (len);
	if (aresid)
		*aresid += len;
	return (error);
}
Пример #4
0
static int
write_bytes(struct diffarg *da)
{
	struct uio auio;
	struct iovec aiov;

	aiov.iov_base = (caddr_t)&da->da_ddr;
	aiov.iov_len = sizeof (da->da_ddr);
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = aiov.iov_len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_offset = (off_t)-1;
	auio.uio_td = da->da_td;
#ifdef _KERNEL
	if (da->da_fp->f_type == DTYPE_VNODE)
		bwillwrite();
	return (fo_write(da->da_fp, &auio, da->da_td->td_ucred, 0, da->da_td));
#else
	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
	return (EOPNOTSUPP);
#endif
}
Пример #5
0
static int
tmpfs_write (struct vop_write_args *ap)
{
	struct buf *bp;
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct tmpfs_node *node;
	boolean_t extended;
	off_t oldsize;
	int error;
	off_t base_offset;
	size_t offset;
	size_t len;
	struct rlimit limit;
	int trivial = 0;
	int kflags = 0;
	int seqcount;

	error = 0;
	if (uio->uio_resid == 0) {
		return error;
	}

	node = VP_TO_TMPFS_NODE(vp);

	if (vp->v_type != VREG)
		return (EINVAL);
	seqcount = ap->a_ioflag >> 16;

	TMPFS_NODE_LOCK(node);

	oldsize = node->tn_size;
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = node->tn_size;

	/*
	 * Check for illegal write offsets.
	 */
	if (uio->uio_offset + uio->uio_resid >
	  VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) {
		error = EFBIG;
		goto done;
	}

	/*
	 * NOTE: Ignore if UIO does not come from a user thread (e.g. VN).
	 */
	if (vp->v_type == VREG && td != NULL && td->td_lwp != NULL) {
		error = kern_getrlimit(RLIMIT_FSIZE, &limit);
		if (error)
			goto done;
		if (uio->uio_offset + uio->uio_resid > limit.rlim_cur) {
			ksignal(td->td_proc, SIGXFSZ);
			error = EFBIG;
			goto done;
		}
	}

	/*
	 * Extend the file's size if necessary
	 */
	extended = ((uio->uio_offset + uio->uio_resid) > node->tn_size);

	while (uio->uio_resid > 0) {
		/*
		 * Don't completely blow out running buffer I/O
		 * when being hit from the pageout daemon.
		 */
		if (uio->uio_segflg == UIO_NOCOPY &&
		    (ap->a_ioflag & IO_RECURSE) == 0) {
			bwillwrite(TMPFS_BLKSIZE);
		}

		/*
		 * Use buffer cache I/O (via tmpfs_strategy)
		 */
		offset = (size_t)uio->uio_offset & TMPFS_BLKMASK64;
		base_offset = (off_t)uio->uio_offset - offset;
		len = TMPFS_BLKSIZE - offset;
		if (len > uio->uio_resid)
			len = uio->uio_resid;

		if ((uio->uio_offset + len) > node->tn_size) {
			trivial = (uio->uio_offset <= node->tn_size);
			error = tmpfs_reg_resize(vp, uio->uio_offset + len,
						 trivial);
			if (error)
				break;
		}

		/*
		 * Read to fill in any gaps.  Theoretically we could
		 * optimize this if the write covers the entire buffer
		 * and is not a UIO_NOCOPY write, however this can lead
		 * to a security violation exposing random kernel memory
		 * (whatever junk was in the backing VM pages before).
		 *
		 * So just use bread() to do the right thing.
		 */
		error = bread(vp, base_offset, TMPFS_BLKSIZE, &bp);
		error = uiomovebp(bp, (char *)bp->b_data + offset, len, uio);
		if (error) {
			kprintf("tmpfs_write uiomove error %d\n", error);
			brelse(bp);
			break;
		}

		if (uio->uio_offset > node->tn_size) {
			node->tn_size = uio->uio_offset;
			kflags |= NOTE_EXTEND;
		}
		kflags |= NOTE_WRITE;

		/*
		 * Always try to flush the page in the UIO_NOCOPY case.  This
		 * can come from the pageout daemon or during vnode eviction.
		 * It is not necessarily going to be marked IO_ASYNC/IO_SYNC.
		 *
		 * For the normal case we buwrite(), dirtying the underlying
		 * VM pages instead of dirtying the buffer and releasing the
		 * buffer as a clean buffer.  This allows tmpfs to use
		 * essentially all available memory to cache file data.
		 * If we used bdwrite() the buffer cache would wind up
		 * flushing the data to swap too quickly.
		 *
		 * But because tmpfs can seriously load the VM system we
		 * fall-back to using bdwrite() when free memory starts
		 * to get low.  This shifts the load away from the VM system
		 * and makes tmpfs act more like a normal filesystem with
		 * regards to disk activity.
		 *
		 * tmpfs pretty much fiddles directly with the VM
		 * system, don't let it exhaust it or we won't play
		 * nice with other processes.  Only do this if the
		 * VOP is coming from a normal read/write.  The VM system
		 * handles the case for UIO_NOCOPY.
		 */
		bp->b_flags |= B_CLUSTEROK;
		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Flush from the pageout daemon, deal with
			 * potentially very heavy tmpfs write activity
			 * causing long stalls in the pageout daemon
			 * before pages get to free/cache.
			 *
			 * (a) Under severe pressure setting B_DIRECT will
			 *     cause a buffer release to try to free the
			 *     underlying pages.
			 *
			 * (b) Under modest memory pressure the B_RELBUF
			 *     alone is sufficient to get the pages moved
			 *     to the cache.  We could also force this by
			 *     setting B_NOTMETA but that might have other
			 *     unintended side-effects (e.g. setting
			 *     PG_NOTMETA on the VM page).
			 *
			 * Hopefully this will unblock the VM system more
			 * quickly under extreme tmpfs write load.
			 */
			if (vm_page_count_min(vm_page_free_hysteresis))
				bp->b_flags |= B_DIRECT;
			bp->b_flags |= B_AGE | B_RELBUF;
			bp->b_act_count = 0;	/* buffer->deactivate pgs */
			cluster_awrite(bp);
		} else if (vm_page_count_target()) {
			/*
			 * Normal (userland) write but we are low on memory,
			 * run the buffer the buffer cache.
			 */
			bp->b_act_count = 0;	/* buffer->deactivate pgs */
			bdwrite(bp);
		} else {
			/*
			 * Otherwise run the buffer directly through to the
			 * backing VM store.
			 */
			buwrite(bp);
			/*vm_wait_nominal();*/
		}

		if (bp->b_error) {
			kprintf("tmpfs_write bwrite error %d\n", bp->b_error);
			break;
		}
	}

	if (error) {
		if (extended) {
			(void)tmpfs_reg_resize(vp, oldsize, trivial);
			kflags &= ~NOTE_EXTEND;
		}
		goto done;
	}

	/*
	 * Currently we don't set the mtime on files modified via mmap()
	 * because we can't tell the difference between those modifications
	 * and an attempt by the pageout daemon to flush tmpfs pages to
	 * swap.
	 *
	 * This is because in order to defer flushes as long as possible
	 * buwrite() works by marking the underlying VM pages dirty in
	 * order to be able to dispose of the buffer cache buffer without
	 * flushing it.
	 */
	if (uio->uio_segflg != UIO_NOCOPY)
		node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED;
	if (extended)
		node->tn_status |= TMPFS_NODE_CHANGED;

	if (node->tn_mode & (S_ISUID | S_ISGID)) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0))
			node->tn_mode &= ~(S_ISUID | S_ISGID);
	}
done:
	TMPFS_NODE_UNLOCK(node);
	if (kflags)
		tmpfs_knote(vp, kflags);

	return(error);
}
Пример #6
0
int
hammer_ioc_reblock(hammer_transaction_t trans, hammer_inode_t ip,
		   struct hammer_ioc_reblock *reblock)
{
	struct hammer_cursor cursor;
	hammer_btree_elm_t elm;
	int checkspace_count;
	int error;
	int seq;
	int slop;

	/*
	 * A fill level <= 20% is considered an emergency.  free_level is
	 * inverted from fill_level.
	 */
	if (reblock->free_level >= HAMMER_LARGEBLOCK_SIZE * 8 / 10)
		slop = HAMMER_CHKSPC_EMERGENCY;
	else
		slop = HAMMER_CHKSPC_REBLOCK;

	if ((reblock->key_beg.localization | reblock->key_end.localization) &
	    HAMMER_LOCALIZE_PSEUDOFS_MASK) {
		return(EINVAL);
	}
	if (reblock->key_beg.obj_id >= reblock->key_end.obj_id)
		return(EINVAL);
	if (reblock->free_level < 0)
		return(EINVAL);

	reblock->key_cur = reblock->key_beg;
	reblock->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	reblock->key_cur.localization += ip->obj_localization;

	checkspace_count = 0;
	seq = trans->hmp->flusher.done;
retry:
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
	if (error) {
		hammer_done_cursor(&cursor);
		goto failed;
	}
	cursor.key_beg.localization = reblock->key_cur.localization;
	cursor.key_beg.obj_id = reblock->key_cur.obj_id;
	cursor.key_beg.key = HAMMER_MIN_KEY;
	cursor.key_beg.create_tid = 1;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_MIN_RECTYPE;
	cursor.key_beg.obj_type = 0;

	cursor.key_end.localization = (reblock->key_end.localization &
					HAMMER_LOCALIZE_MASK) +
				      ip->obj_localization;
	cursor.key_end.obj_id = reblock->key_end.obj_id;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.key_end.create_tid = HAMMER_MAX_TID - 1;
	cursor.key_end.delete_tid = 0;
	cursor.key_end.rec_type = HAMMER_MAX_RECTYPE;
	cursor.key_end.obj_type = 0;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
	cursor.flags |= HAMMER_CURSOR_BACKEND;
	cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;

	/*
	 * This flag allows the btree scan code to return internal nodes,
	 * so we can reblock them in addition to the leafs.  Only specify it
	 * if we intend to reblock B-Tree nodes.
	 */
	if (reblock->head.flags & HAMMER_IOC_DO_BTREE)
		cursor.flags |= HAMMER_CURSOR_REBLOCKING;

	error = hammer_btree_first(&cursor);
	while (error == 0) {
		/*
		 * Internal or Leaf node
		 */
		KKASSERT(cursor.index < cursor.node->ondisk->count);
		elm = &cursor.node->ondisk->elms[cursor.index];
		reblock->key_cur.obj_id = elm->base.obj_id;
		reblock->key_cur.localization = elm->base.localization;

		/*
		 * Yield to more important tasks
		 */
		if ((error = hammer_signal_check(trans->hmp)) != 0)
			break;

		/*
		 * If there is insufficient free space it may be due to
		 * reserved bigblocks, which flushing might fix.
		 *
		 * We must force a retest in case the unlocked cursor is
		 * moved to the end of the leaf, or moved to an internal
		 * node.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor() function.
		 */
		if (hammer_checkspace(trans->hmp, slop)) {
			if (++checkspace_count == 10) {
				error = ENOSPC;
				break;
			}
			hammer_unlock_cursor(&cursor);
			cursor.flags |= HAMMER_CURSOR_RETEST;
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async(trans->hmp, NULL);
			goto skip;
		}

		/*
		 * Acquiring the sync_lock prevents the operation from
		 * crossing a synchronization boundary.
		 *
		 * NOTE: cursor.node may have changed on return.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor() function.
		 */
		hammer_sync_lock_sh(trans);
		error = hammer_reblock_helper(reblock, &cursor, elm);
		hammer_sync_unlock(trans);

		while (hammer_flusher_meta_halflimit(trans->hmp) ||
		       hammer_flusher_undo_exhausted(trans, 2)) {
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async_one(trans->hmp);
		}

		/*
		 * Setup for iteration, our cursor flags may be modified by
		 * other threads while we are unlocked.
		 */
		cursor.flags |= HAMMER_CURSOR_ATEDISK;

		/*
		 * We allocate data buffers, which atm we don't track
		 * dirty levels for because we allow the kernel to write
		 * them.  But if we allocate too many we can still deadlock
		 * the buffer cache.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor() function.
		 *	    (The cursor's node and element may change!)
		 */
		if (bd_heatup()) {
			hammer_unlock_cursor(&cursor);
			bwillwrite(HAMMER_XBUFSIZE);
			hammer_lock_cursor(&cursor);
		}
		/* XXX vm_wait_nominal(); */
skip:
		if (error == 0) {
			error = hammer_btree_iterate(&cursor);
		}
	}
	if (error == ENOENT)
		error = 0;
	hammer_done_cursor(&cursor);
	if (error == EWOULDBLOCK) {
		hammer_flusher_sync(trans->hmp);
		goto retry;
	}
	if (error == EDEADLK)
		goto retry;
	if (error == EINTR) {
		reblock->head.flags |= HAMMER_IOC_HEAD_INTR;
		error = 0;
	}
failed:
	reblock->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	return(error);
}
Пример #7
0
static int
devfs_fo_write(struct file *fp, struct uio *uio,
		  struct ucred *cred, int flags)
{
	struct devfs_node *node;
	struct vnode *vp;
	int ioflag;
	int error;
	cdev_t dev;

	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not p %p", uio->uio_td, curthread));

	vp = (struct vnode *)fp->f_data;
	if (vp == NULL || vp->v_type == VBAD)
		return EBADF;

	node = DEVFS_NODE(vp);

	if (vp->v_type == VREG)
		bwillwrite(uio->uio_resid);

	vp = (struct vnode *)fp->f_data;

	if ((dev = vp->v_rdev) == NULL)
		return EBADF;

	reference_dev(dev);

	if ((flags & O_FOFFSET) == 0)
		uio->uio_offset = fp->f_offset;

	ioflag = IO_UNIT;
	if (vp->v_type == VREG &&
	   ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
		ioflag |= IO_APPEND;
	}

	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (flags & O_FBUFFERED) {
		/* ioflag &= ~IO_DIRECT; */
	} else if (flags & O_FUNBUFFERED) {
		ioflag |= IO_DIRECT;
	} else if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	if (flags & O_FASYNCWRITE) {
		/* ioflag &= ~IO_SYNC; */
	} else if (flags & O_FSYNCWRITE) {
		ioflag |= IO_SYNC;
	} else if (fp->f_flag & O_FSYNC) {
		ioflag |= IO_SYNC;
	}

	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
		ioflag |= IO_SYNC;
	ioflag |= sequential_heuristic(uio, fp);

	error = dev_dwrite(dev, uio, ioflag, fp);

	release_dev(dev);
	if (node) {
		nanotime(&node->atime);
		nanotime(&node->mtime);
	}

	if ((flags & O_FOFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;

	return (error);
}