static void
filemon_output(struct filemon *filemon, char *msg, size_t len)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (filemon->fp == NULL)
		return;

	aiov.iov_base = msg;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = curthread;
	auio.uio_offset = (off_t) -1;

	if (filemon->fp->f_type == DTYPE_VNODE)
		bwillwrite();

	error = fo_write(filemon->fp, &auio, curthread->td_ucred, 0,
	    curthread);
	if (error != 0)
		filemon->error = error;
}
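/*
 * Illustration only: the aiov/auio setup above is the in-kernel analog of
 * a single-element writev().  A minimal userland sketch: one iovec
 * describing the buffer, one call pushing it out.
 */
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int
main(void)
{
	char msg[] = "hello from writev\n";
	struct iovec iov;

	iov.iov_base = msg;		/* like aiov.iov_base = msg */
	iov.iov_len = strlen(msg);	/* like aiov.iov_len = len */
	if (writev(STDOUT_FILENO, &iov, 1) < 0)	/* uio_iovcnt == 1 */
		perror("writev");
	return 0;
}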
/*
 * Write buffer to filp inside kernel
 */
int
kern_file_write(cfs_file_t *fp, void *buf, size_t nbyte, loff_t *pos)
{
	struct uio auio;
	struct iovec aiov;
	struct proc *p = current_proc();
	long cnt, error = 0;
	int flags = 0;
	CFS_DECL_CONE_DATA;

	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	if (pos != NULL) {
		auio.uio_offset = *pos;
		/*
		 * Liang: If we don't set FOF_OFFSET, vn_write()
		 * will use fp->f_offset as the real offset.
		 * Same in vn_read().
		 */
		flags |= FOF_OFFSET;
	} else
		auio.uio_offset = (off_t)-1;
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_procp = p;

	cnt = nbyte;
	CFS_CONE_IN;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();	/* empty stuff now */
	if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
		if (auio.uio_resid != cnt &&
		    (error == ERESTART || error == EINTR ||
		     error == EWOULDBLOCK))
			error = 0;
		/* The socket layer handles SIGPIPE */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
			psignal(p, SIGPIPE);
	}
	CFS_CONE_EX;
	if (error != 0)
		cnt = -error;
	else
		cnt -= auio.uio_resid;
	if (pos != NULL)
		*pos += cnt;

	return cnt;
}
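/*
 * Illustration only: the FOF_OFFSET split above mirrors the userland
 * pwrite()/write() distinction.  A minimal sketch, assuming a writable
 * /tmp/demo: pwrite() takes an explicit offset and leaves the file
 * offset alone (the pos != NULL branch), while write() uses and advances
 * the implicit file offset (the pos == NULL branch using fp->f_offset).
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/tmp/demo", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return 1;

	/* Explicit offset, like pos != NULL with FOF_OFFSET set. */
	(void)pwrite(fd, "abc", 3, 100);
	printf("after pwrite, offset = %lld\n",
	    (long long)lseek(fd, 0, SEEK_CUR));		/* still 0 */

	/* Implicit offset, like pos == NULL using fp->f_offset. */
	(void)write(fd, "abc", 3);
	printf("after write, offset = %lld\n",
	    (long long)lseek(fd, 0, SEEK_CUR));		/* now 3 */

	close(fd);
	return 0;
}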
/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call
 * lwkt_user_yield() to give other processes a chance to lock the vnode
 * (either other processes core'ing the same binary, or unrelated
 * processes scanning the directory).
 *
 * MPSAFE
 */
int
vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
		 off_t offset, enum uio_seg segflg, int ioflg,
		 struct ucred *cred, int *aresid)
{
	int error = 0;

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
		if (chunk > len)
			chunk = len;
		if (vp->v_type == VREG) {
			switch(rw) {
			case UIO_READ:
				bwillread(chunk);
				break;
			case UIO_WRITE:
				bwillwrite(chunk);
				break;
			}
		}
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
				ioflg, cred, aresid);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base += chunk;
		lwkt_user_yield();
	} while (len);
	if (aresid)
		*aresid += len;
	return (error);
}
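/*
 * Illustration only: a tiny userland demo of the chunking arithmetic in
 * vn_rdwr_inchunks(), assuming MAXBSIZE is 65536 as in the BSD headers.
 * The hypothetical offset/len values show that every chunk after the
 * first starts on a MAXBSIZE boundary, so only the first and last chunks
 * can be partial blocks.
 */
#include <stdio.h>

#define MAXBSIZE	65536

int
main(void)
{
	long long offset = 1000;	/* hypothetical unaligned start */
	int len = 200000;		/* hypothetical request length */

	while (len > 0) {
		/* Same computation as in vn_rdwr_inchunks() above. */
		int chunk = MAXBSIZE - (unsigned long long)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		printf("write %6d bytes at offset %lld\n", chunk, offset);
		offset += chunk;
		len -= chunk;
	}
	return 0;
}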
static int
write_bytes(struct diffarg *da)
{
	struct uio auio;
	struct iovec aiov;

	aiov.iov_base = (caddr_t)&da->da_ddr;
	aiov.iov_len = sizeof (da->da_ddr);
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = aiov.iov_len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_offset = (off_t)-1;
	auio.uio_td = da->da_td;
#ifdef _KERNEL
	if (da->da_fp->f_type == DTYPE_VNODE)
		bwillwrite();
	return (fo_write(da->da_fp, &auio, da->da_td->td_ucred, 0,
	    da->da_td));
#else
	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
	return (EOPNOTSUPP);
#endif
}
static int
tmpfs_write(struct vop_write_args *ap)
{
	struct buf *bp;
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct tmpfs_node *node;
	boolean_t extended;
	off_t oldsize;
	int error;
	off_t base_offset;
	size_t offset;
	size_t len;
	struct rlimit limit;
	int trivial = 0;
	int kflags = 0;
	int seqcount;

	error = 0;
	if (uio->uio_resid == 0) {
		return error;
	}

	node = VP_TO_TMPFS_NODE(vp);

	if (vp->v_type != VREG)
		return (EINVAL);
	seqcount = ap->a_ioflag >> 16;

	TMPFS_NODE_LOCK(node);

	oldsize = node->tn_size;
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = node->tn_size;

	/*
	 * Check for illegal write offsets.
	 */
	if (uio->uio_offset + uio->uio_resid >
	    VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) {
		error = EFBIG;
		goto done;
	}

	/*
	 * NOTE: Ignore if UIO does not come from a user thread (e.g. VN).
	 */
	if (vp->v_type == VREG && td != NULL && td->td_lwp != NULL) {
		error = kern_getrlimit(RLIMIT_FSIZE, &limit);
		if (error)
			goto done;
		if (uio->uio_offset + uio->uio_resid > limit.rlim_cur) {
			ksignal(td->td_proc, SIGXFSZ);
			error = EFBIG;
			goto done;
		}
	}

	/*
	 * Extend the file's size if necessary
	 */
	extended = ((uio->uio_offset + uio->uio_resid) > node->tn_size);

	while (uio->uio_resid > 0) {
		/*
		 * Don't completely blow out running buffer I/O
		 * when being hit from the pageout daemon.
		 */
		if (uio->uio_segflg == UIO_NOCOPY &&
		    (ap->a_ioflag & IO_RECURSE) == 0) {
			bwillwrite(TMPFS_BLKSIZE);
		}

		/*
		 * Use buffer cache I/O (via tmpfs_strategy)
		 */
		offset = (size_t)uio->uio_offset & TMPFS_BLKMASK64;
		base_offset = (off_t)uio->uio_offset - offset;
		len = TMPFS_BLKSIZE - offset;
		if (len > uio->uio_resid)
			len = uio->uio_resid;

		if ((uio->uio_offset + len) > node->tn_size) {
			trivial = (uio->uio_offset <= node->tn_size);
			error = tmpfs_reg_resize(vp, uio->uio_offset + len,
						 trivial);
			if (error)
				break;
		}

		/*
		 * Read to fill in any gaps.  Theoretically we could
		 * optimize this if the write covers the entire buffer
		 * and is not a UIO_NOCOPY write, however this can lead
		 * to a security violation exposing random kernel memory
		 * (whatever junk was in the backing VM pages before).
		 *
		 * So just use bread() to do the right thing.
		 */
		error = bread(vp, base_offset, TMPFS_BLKSIZE, &bp);
		if (error) {
			kprintf("tmpfs_write bread error %d\n", error);
			break;
		}
		error = uiomovebp(bp, (char *)bp->b_data + offset, len, uio);
		if (error) {
			kprintf("tmpfs_write uiomove error %d\n", error);
			brelse(bp);
			break;
		}

		if (uio->uio_offset > node->tn_size) {
			node->tn_size = uio->uio_offset;
			kflags |= NOTE_EXTEND;
		}
		kflags |= NOTE_WRITE;

		/*
		 * Always try to flush the page in the UIO_NOCOPY case.  This
		 * can come from the pageout daemon or during vnode eviction.
		 * It is not necessarily going to be marked IO_ASYNC/IO_SYNC.
		 *
		 * For the normal case we buwrite(), dirtying the underlying
		 * VM pages instead of dirtying the buffer and releasing the
		 * buffer as a clean buffer.  This allows tmpfs to use
		 * essentially all available memory to cache file data.
		 * If we used bdwrite() the buffer cache would wind up
		 * flushing the data to swap too quickly.
		 *
		 * But because tmpfs can seriously load the VM system we
		 * fall-back to using bdwrite() when free memory starts
		 * to get low.  This shifts the load away from the VM system
		 * and makes tmpfs act more like a normal filesystem with
		 * regards to disk activity.
		 *
		 * tmpfs pretty much fiddles directly with the VM
		 * system, don't let it exhaust it or we won't play
		 * nice with other processes.  Only do this if the
		 * VOP is coming from a normal read/write.  The VM system
		 * handles the case for UIO_NOCOPY.
		 */
		bp->b_flags |= B_CLUSTEROK;
		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Flush from the pageout daemon, deal with
			 * potentially very heavy tmpfs write activity
			 * causing long stalls in the pageout daemon
			 * before pages get to free/cache.
			 *
			 * (a) Under severe pressure setting B_DIRECT will
			 *     cause a buffer release to try to free the
			 *     underlying pages.
			 *
			 * (b) Under modest memory pressure the B_RELBUF
			 *     alone is sufficient to get the pages moved
			 *     to the cache.  We could also force this by
			 *     setting B_NOTMETA but that might have other
			 *     unintended side-effects (e.g. setting
			 *     PG_NOTMETA on the VM page).
			 *
			 * Hopefully this will unblock the VM system more
			 * quickly under extreme tmpfs write load.
			 */
			if (vm_page_count_min(vm_page_free_hysteresis))
				bp->b_flags |= B_DIRECT;
			bp->b_flags |= B_AGE | B_RELBUF;
			bp->b_act_count = 0;	/* buffer->deactivate pgs */
			cluster_awrite(bp);
		} else if (vm_page_count_target()) {
			/*
			 * Normal (userland) write but we are low on memory,
			 * run the buffer through the buffer cache.
			 */
			bp->b_act_count = 0;	/* buffer->deactivate pgs */
			bdwrite(bp);
		} else {
			/*
			 * Otherwise run the buffer directly through to the
			 * backing VM store.
			 */
			buwrite(bp);
			/*vm_wait_nominal();*/
		}

		if (bp->b_error) {
			kprintf("tmpfs_write bwrite error %d\n", bp->b_error);
			break;
		}
	}

	if (error) {
		if (extended) {
			(void)tmpfs_reg_resize(vp, oldsize, trivial);
			kflags &= ~NOTE_EXTEND;
		}
		goto done;
	}

	/*
	 * Currently we don't set the mtime on files modified via mmap()
	 * because we can't tell the difference between those modifications
	 * and an attempt by the pageout daemon to flush tmpfs pages to
	 * swap.
	 *
	 * This is because in order to defer flushes as long as possible
	 * buwrite() works by marking the underlying VM pages dirty in
	 * order to be able to dispose of the buffer cache buffer without
	 * flushing it.
	 */
	if (uio->uio_segflg != UIO_NOCOPY)
		node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED;
	if (extended)
		node->tn_status |= TMPFS_NODE_CHANGED;

	if (node->tn_mode & (S_ISUID | S_ISGID)) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0))
			node->tn_mode &= ~(S_ISUID | S_ISGID);
	}
done:
	TMPFS_NODE_UNLOCK(node);
	if (kflags)
		tmpfs_knote(vp, kflags);

	return(error);
}
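/*
 * Illustration only: a minimal userland sketch of the three-way buffer
 * disposition above, with hypothetical stand-ins for UIO_NOCOPY and
 * vm_page_count_target().  It encodes just the decision order: pageout-
 * driven writes are flushed immediately (cluster_awrite analog), memory
 * pressure falls back to a delayed buffer-cache write (bdwrite analog),
 * and the normal path only dirties the backing VM pages (buwrite analog).
 */
#include <stdbool.h>
#include <stdio.h>

enum disposition { FLUSH_ASYNC, DELAYED_WRITE, DIRTY_VM_PAGES };

static enum disposition
choose_write_path(bool pageout_driven, bool memory_low)
{
	if (pageout_driven)
		return FLUSH_ASYNC;	/* cluster_awrite() */
	if (memory_low)
		return DELAYED_WRITE;	/* bdwrite() */
	return DIRTY_VM_PAGES;		/* buwrite() */
}

int
main(void)
{
	printf("pageout: %d, low mem: %d, normal: %d\n",
	    choose_write_path(true, false),
	    choose_write_path(false, true),
	    choose_write_path(false, false));
	return 0;
}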
int
hammer_ioc_reblock(hammer_transaction_t trans, hammer_inode_t ip,
		   struct hammer_ioc_reblock *reblock)
{
	struct hammer_cursor cursor;
	hammer_btree_elm_t elm;
	int checkspace_count;
	int error;
	int seq;
	int slop;

	/*
	 * A fill level <= 20% is considered an emergency.  free_level is
	 * inverted from fill_level.
	 */
	if (reblock->free_level >= HAMMER_LARGEBLOCK_SIZE * 8 / 10)
		slop = HAMMER_CHKSPC_EMERGENCY;
	else
		slop = HAMMER_CHKSPC_REBLOCK;

	if ((reblock->key_beg.localization | reblock->key_end.localization) &
	    HAMMER_LOCALIZE_PSEUDOFS_MASK) {
		return(EINVAL);
	}
	if (reblock->key_beg.obj_id >= reblock->key_end.obj_id)
		return(EINVAL);
	if (reblock->free_level < 0)
		return(EINVAL);

	reblock->key_cur = reblock->key_beg;
	reblock->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	reblock->key_cur.localization += ip->obj_localization;

	checkspace_count = 0;
	seq = trans->hmp->flusher.done;

retry:
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
	if (error) {
		hammer_done_cursor(&cursor);
		goto failed;
	}
	cursor.key_beg.localization = reblock->key_cur.localization;
	cursor.key_beg.obj_id = reblock->key_cur.obj_id;
	cursor.key_beg.key = HAMMER_MIN_KEY;
	cursor.key_beg.create_tid = 1;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_MIN_RECTYPE;
	cursor.key_beg.obj_type = 0;

	cursor.key_end.localization = (reblock->key_end.localization &
					HAMMER_LOCALIZE_MASK) +
				      ip->obj_localization;
	cursor.key_end.obj_id = reblock->key_end.obj_id;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.key_end.create_tid = HAMMER_MAX_TID - 1;
	cursor.key_end.delete_tid = 0;
	cursor.key_end.rec_type = HAMMER_MAX_RECTYPE;
	cursor.key_end.obj_type = 0;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
	cursor.flags |= HAMMER_CURSOR_BACKEND;
	cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;

	/*
	 * This flag allows the btree scan code to return internal nodes,
	 * so we can reblock them in addition to the leafs.  Only specify it
	 * if we intend to reblock B-Tree nodes.
	 */
	if (reblock->head.flags & HAMMER_IOC_DO_BTREE)
		cursor.flags |= HAMMER_CURSOR_REBLOCKING;

	error = hammer_btree_first(&cursor);
	while (error == 0) {
		/*
		 * Internal or Leaf node
		 */
		KKASSERT(cursor.index < cursor.node->ondisk->count);
		elm = &cursor.node->ondisk->elms[cursor.index];
		reblock->key_cur.obj_id = elm->base.obj_id;
		reblock->key_cur.localization = elm->base.localization;

		/*
		 * Yield to more important tasks
		 */
		if ((error = hammer_signal_check(trans->hmp)) != 0)
			break;

		/*
		 * If there is insufficient free space it may be due to
		 * reserved bigblocks, which flushing might fix.
		 *
		 * We must force a retest in case the unlocked cursor is
		 * moved to the end of the leaf, or moved to an internal
		 * node.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor() function.
		 */
		if (hammer_checkspace(trans->hmp, slop)) {
			if (++checkspace_count == 10) {
				error = ENOSPC;
				break;
			}
			hammer_unlock_cursor(&cursor);
			cursor.flags |= HAMMER_CURSOR_RETEST;
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async(trans->hmp, NULL);
			goto skip;
		}

		/*
		 * Acquiring the sync_lock prevents the operation from
		 * crossing a synchronization boundary.
		 *
		 * NOTE: cursor.node may have changed on return.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor() function.
		 */
		hammer_sync_lock_sh(trans);
		error = hammer_reblock_helper(reblock, &cursor, elm);
		hammer_sync_unlock(trans);

		while (hammer_flusher_meta_halflimit(trans->hmp) ||
		       hammer_flusher_undo_exhausted(trans, 2)) {
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async_one(trans->hmp);
		}

		/*
		 * Setup for iteration, our cursor flags may be modified by
		 * other threads while we are unlocked.
		 */
		cursor.flags |= HAMMER_CURSOR_ATEDISK;

		/*
		 * We allocate data buffers, which atm we don't track
		 * dirty levels for because we allow the kernel to write
		 * them.  But if we allocate too many we can still deadlock
		 * the buffer cache.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor() function.
		 *	    (The cursor's node and element may change!)
		 */
		if (bd_heatup()) {
			hammer_unlock_cursor(&cursor);
			bwillwrite(HAMMER_XBUFSIZE);
			hammer_lock_cursor(&cursor);
		}
		/* XXX vm_wait_nominal(); */
skip:
		if (error == 0) {
			error = hammer_btree_iterate(&cursor);
		}
	}
	if (error == ENOENT)
		error = 0;
	hammer_done_cursor(&cursor);
	if (error == EWOULDBLOCK) {
		hammer_flusher_sync(trans->hmp);
		goto retry;
	}
	if (error == EDEADLK)
		goto retry;
	if (error == EINTR) {
		reblock->head.flags |= HAMMER_IOC_HEAD_INTR;
		error = 0;
	}
failed:
	reblock->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	return(error);
}
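/*
 * Illustration only: the unlock/wait/relock dance above is the generic
 * "revalidate after reacquiring" idiom.  A minimal pthread sketch with
 * hypothetical names (shared_pos, wait_unlocked); the point is that any
 * state observed before dropping the lock must be re-tested after taking
 * it back, which is what HAMMER_CURSOR_RETEST forces for the cursor.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int shared_pos;		/* stand-in for the cursor position */

/* Drop the lock while blocked, the way hammer_flusher_wait() allows. */
static void
wait_unlocked(void)
{
	pthread_mutex_unlock(&lock);
	/* ... block until the flusher (another thread) catches up ... */
	pthread_mutex_lock(&lock);
}

int
main(void)
{
	pthread_mutex_lock(&lock);
	int before = shared_pos;
	wait_unlocked();
	if (shared_pos != before)	/* re-test: it may have moved */
		printf("position changed while unlocked, retesting\n");
	pthread_mutex_unlock(&lock);
	return 0;
}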
static int
devfs_fo_write(struct file *fp, struct uio *uio,
	       struct ucred *cred, int flags)
{
	struct devfs_node *node;
	struct vnode *vp;
	int ioflag;
	int error;
	cdev_t dev;

	KASSERT(uio->uio_td == curthread,
		("uio_td %p is not p %p", uio->uio_td, curthread));

	vp = (struct vnode *)fp->f_data;
	if (vp == NULL || vp->v_type == VBAD)
		return EBADF;

	node = DEVFS_NODE(vp);

	if (vp->v_type == VREG)
		bwillwrite(uio->uio_resid);

	vp = (struct vnode *)fp->f_data;

	if ((dev = vp->v_rdev) == NULL)
		return EBADF;

	reference_dev(dev);

	if ((flags & O_FOFFSET) == 0)
		uio->uio_offset = fp->f_offset;

	ioflag = IO_UNIT;
	if (vp->v_type == VREG &&
	    ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
		ioflag |= IO_APPEND;
	}

	if (flags & O_FBLOCKING) {
		/* ioflag &= ~IO_NDELAY; */
	} else if (flags & O_FNONBLOCKING) {
		ioflag |= IO_NDELAY;
	} else if (fp->f_flag & FNONBLOCK) {
		ioflag |= IO_NDELAY;
	}
	if (flags & O_FBUFFERED) {
		/* ioflag &= ~IO_DIRECT; */
	} else if (flags & O_FUNBUFFERED) {
		ioflag |= IO_DIRECT;
	} else if (fp->f_flag & O_DIRECT) {
		ioflag |= IO_DIRECT;
	}
	if (flags & O_FASYNCWRITE) {
		/* ioflag &= ~IO_SYNC; */
	} else if (flags & O_FSYNCWRITE) {
		ioflag |= IO_SYNC;
	} else if (fp->f_flag & O_FSYNC) {
		ioflag |= IO_SYNC;
	}

	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
		ioflag |= IO_SYNC;

	ioflag |= sequential_heuristic(uio, fp);

	error = dev_dwrite(dev, uio, ioflag, fp);

	release_dev(dev);
	if (node) {
		nanotime(&node->atime);
		nanotime(&node->mtime);
	}

	if ((flags & O_FOFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;

	return (error);
}