int sysvbfs_strategy(void *arg) { struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *a = arg; struct buf *b = a->a_bp; struct vnode *v = a->a_vp; struct sysvbfs_node *bnode = v->v_data; struct sysvbfs_mount *bmp = bnode->bmp; int error; DPRINTF("%s:\n", __func__); KDASSERT(v->v_type == VREG); if (b->b_blkno == b->b_lblkno) { error = VOP_BMAP(v, b->b_lblkno, NULL, &b->b_blkno, NULL); if (error) { b->b_error = error; biodone(b); return error; } if ((long)b->b_blkno == -1) clrbuf(b); } if ((long)b->b_blkno == -1) { biodone(b); return 0; } return VOP_STRATEGY(bmp->devvp, b); }
/* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ int cd9660_strategy(void *v) { struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap = v; struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; struct iso_node *ip; int error; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("cd9660_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL); if (error) { bp->b_error = error; biodone(bp); return (error); } if ((long)bp->b_blkno == -1) clrbuf(bp); } if ((long)bp->b_blkno == -1) { biodone(bp); return (0); } vp = ip->i_mnt->im_devvp; return (VOP_STRATEGY(vp, bp)); }
int v7fs_strategy(void *v) { struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *a = v; struct buf *b = a->a_bp; struct vnode *vp = a->a_vp; struct v7fs_node *v7node = vp->v_data; struct v7fs_mount *v7fsmount = v7node->v7fsmount; int error; DPRINTF("%p\n", vp); KDASSERT(vp->v_type == VREG); if (b->b_blkno == b->b_lblkno) { error = VOP_BMAP(vp, b->b_lblkno, NULL, &b->b_blkno, NULL); if (error) { b->b_error = error; biodone(b); return error; } if ((long)b->b_blkno == -1) clrbuf(b); } if ((long)b->b_blkno == -1) { biodone(b); return 0; } return VOP_STRATEGY(v7fsmount->devvp, b); }
/* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ int filecore_strategy(void *v) { struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap = v; struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; struct filecore_node *ip; int error; ip = VTOI(vp); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL); if (error) { bp->b_error = error; biodone(bp); return (error); } if ((long)bp->b_blkno == -1) clrbuf(bp); } if ((long)bp->b_blkno == -1) { biodone(bp); return (0); } vp = ip->i_devvp; return (VOP_STRATEGY(vp, bp)); }
/* * calculate the linear (byte) disk address of specified virtual * file address */ static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run) { int bsize; int err; daddr_t vblock; daddr_t voffset; if (address < 0) return -1; if (vp->v_iflag & VI_DOOMED) return -1; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL); if (err == 0) { if (*rtaddress != -1) *rtaddress += voffset / DEV_BSIZE; if (run) { *run += 1; *run *= bsize/PAGE_SIZE; *run -= voffset/PAGE_SIZE; } } return (err); }
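/*
 * Illustrative sketch (not FreeBSD kernel code) of the arithmetic that
 * vnode_pager_addr() above performs: split a byte offset into a logical
 * filesystem block plus a remainder, then advance the device block that
 * VOP_BMAP reported by whole 512-byte sectors.  All values below are
 * hypothetical; EX_DEV_BSIZE stands in for DEV_BSIZE.
 */
#include <stdio.h>

#define EX_DEV_BSIZE	512

int
main(void)
{
	long long address = 40960;	/* byte offset into the file */
	int bsize = 16384;		/* mnt_stat.f_iosize of the mount */
	long long vblock = address / bsize;	/* logical block 2 */
	long long voffset = address % bsize;	/* 8192 bytes into it */
	long long rtaddress = 1000;	/* pretend VOP_BMAP mapped block 2 here */

	rtaddress += voffset / EX_DEV_BSIZE;	/* 1000 + 16 = 1016 */
	printf("vblock %lld, voffset %lld -> device block %lld\n",
	    vblock, voffset, rtaddress);
	return 0;
}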
static int ReadMultipleNodes( BTScanState *theScanStatePtr ) { int myErr = E_NONE; BTreeControlBlockPtr myBTreeCBPtr; daddr_t myPhyBlockNum; u_int32_t myBufferSize; struct vnode * myDevPtr; int myBlockRun; u_int32_t myBlocksInBufferCount; // release old buffer if we have one if ( theScanStatePtr->bufferPtr != NULL ) { theScanStatePtr->bufferPtr->b_flags |= (B_INVAL | B_AGE); brelse( theScanStatePtr->bufferPtr ); theScanStatePtr->bufferPtr = NULL; theScanStatePtr->currentNodePtr = NULL; } myBTreeCBPtr = theScanStatePtr->btcb; // map logical block in catalog btree file to physical block on volume myErr = VOP_BMAP( myBTreeCBPtr->fileRefNum, theScanStatePtr->nodeNum, &myDevPtr, &myPhyBlockNum, &myBlockRun ); if ( myErr != E_NONE ) { goto ExitThisRoutine; } // bmap block run gives us the remaining number of valid blocks (number of blocks // minus the first). so if there are 10 valid blocks our run number will be 9. // blocks, in our case is the same as nodes (both are 4K) myBlocksInBufferCount = (theScanStatePtr->bufferSize / myBTreeCBPtr->nodeSize ); myBufferSize = theScanStatePtr->bufferSize; if ( (myBlockRun + 1) < myBlocksInBufferCount ) { myBufferSize = (myBlockRun + 1) * myBTreeCBPtr->nodeSize; } // now read blocks from the device myErr = bread( myDevPtr, myPhyBlockNum, myBufferSize, NOCRED, &theScanStatePtr->bufferPtr ); if ( myErr != E_NONE ) { goto ExitThisRoutine; } theScanStatePtr->nodesLeftInBuffer = theScanStatePtr->bufferPtr->b_bcount / theScanStatePtr->btcb->nodeSize; theScanStatePtr->currentNodePtr = (BTNodeDescriptor *) theScanStatePtr->bufferPtr->b_data; ExitThisRoutine: return myErr; } /* ReadMultipleNodes */
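/*
 * Worked example (hypothetical values) of the block-run clamping done in
 * ReadMultipleNodes() above: a run of 9 from VOP_BMAP means 10 contiguous
 * blocks including the first, so a 64 KB scan buffer of 4 KB nodes has to
 * be trimmed to 10 nodes' worth before calling bread().
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint32_t nodeSize = 4096;
	uint32_t bufferSize = 65536;		/* room for 16 nodes */
	int blockRun = 9;			/* run reported by VOP_BMAP */
	uint32_t blocksInBuffer = bufferSize / nodeSize;	/* 16 */

	if ((uint32_t)(blockRun + 1) < blocksInBuffer)
		bufferSize = (uint32_t)(blockRun + 1) * nodeSize;
	printf("read %u bytes (%u nodes)\n", bufferSize, bufferSize / nodeSize);
	return 0;
}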
static int unionfs_bmap(void *v) { struct vop_bmap_args *ap = v; struct unionfs_node *unp; struct vnode *tvp; unp = VTOUNIONFS(ap->a_vp); tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp); return VOP_BMAP(tvp, ap->a_bn, ap->a_vpp, ap->a_bnp, ap->a_runp); }
int RUMP_VOP_BMAP(struct vnode *vp, int64_t bn, struct vnode **vpp, int64_t *bnp, int *runp) { int error; rump_schedule(); error = VOP_BMAP(vp, bn, vpp, bnp, runp); rump_unschedule(); return error; }
/* * Return whether the vnode pager has the requested page. Return the * number of disk-contiguous pages before and after the requested page, * not including the requested page. */ static boolean_t vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex) { struct vnode *vp = object->handle; off_t loffset; off_t doffset; int voff; int bsize; int error; /* * If no vp or vp is doomed or marked transparent to VM, we do not * have the page. */ if ((vp == NULL) || (vp->v_flag & VRECLAIMED)) return FALSE; /* * If filesystem no longer mounted or offset beyond end of file we do * not have the page. */ loffset = IDX_TO_OFF(pindex); if (vp->v_mount == NULL || loffset >= vp->v_filesize) return FALSE; bsize = vp->v_mount->mnt_stat.f_iosize; voff = loffset % bsize; /* * XXX * * BMAP returns byte counts before and after, where after * is inclusive of the base page. haspage must return page * counts before and after where after does not include the * base page. * * BMAP is allowed to return a *after of 0 for backwards * compatibility. The base page is still considered valid if * no error is returned. */ error = VOP_BMAP(vp, loffset - voff, &doffset, NULL, NULL, 0); if (error) return TRUE; if (doffset == NOOFFSET) return FALSE; return TRUE; }
/* * Just call the device strategy routine */ int adosfs_strategy(void *v) { struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *sp = v; struct buf *bp; struct anode *ap; struct vnode *vp; int error; #ifdef ADOSFS_DIAGNOSTIC advopprint(sp); #endif bp = sp->a_bp; if (bp->b_vp == NULL) { bp->b_error = EIO; biodone(bp); error = EIO; goto reterr; } vp = sp->a_vp; ap = VTOA(vp); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL); if (error) { bp->b_error = error; biodone(bp); goto reterr; } } if ((long)bp->b_blkno == -1) { biodone(bp); error = 0; goto reterr; } vp = ap->amp->devvp; error = VOP_STRATEGY(vp, bp); reterr: #ifdef ADOSFS_DIAGNOSTIC printf(" %d)", error); #endif return(error); }
static int _xfs_strategy( struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap) { daddr_t blkno; struct buf *bp; struct bufobj *bo; struct vnode *vp; struct xfs_mount *xmp; int error; bp = ap->a_bp; vp = ap->a_vp; KASSERT(ap->a_vp == ap->a_bp->b_vp, ("%s(%p != %p)", __func__, ap->a_vp, ap->a_bp->b_vp)); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, NULL, &blkno, NULL, NULL); bp->b_blkno = blkno; bp->b_iooffset = (blkno << BBSHIFT); if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (0); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } xmp = XFS_VFSTOM(MNTTOVFS(vp->v_mount)); bo = &xmp->m_ddev_targp->specvp->v_bufobj; bo->bo_ops->bop_strategy(bo, bp); return (0); }
/* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the VOP_BMAP operation may not * deadlock on memory. See hpfs_bmap() for details. XXXXXXX (not impl) * * hpfs_strategy(struct vnode *a_vp, struct bio *a_bio) */ int hpfs_strategy(struct vop_strategy_args *ap) { struct bio *bio = ap->a_bio; struct bio *nbio; struct buf *bp = bio->bio_buf; struct vnode *vp = ap->a_vp; struct hpfsnode *hp; int error; dprintf(("hpfs_strategy(): \n")); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("hpfs_strategy: spec"); nbio = push_bio(bio); if (nbio->bio_offset == NOOFFSET) { error = VOP_BMAP(vp, bio->bio_offset, &nbio->bio_offset, NULL, NULL, bp->b_cmd); if (error) { kprintf("hpfs_strategy: VOP_BMAP FAILED %d\n", error); bp->b_error = error; bp->b_flags |= B_ERROR; /* I/O was never started on nbio, must biodone(bio) */ biodone(bio); return (error); } if (nbio->bio_offset == NOOFFSET) vfs_bio_clrbuf(bp); } if (nbio->bio_offset == NOOFFSET) { /* I/O was never started on nbio, must biodone(bio) */ biodone(bio); return (0); } hp = VTOHP(ap->a_vp); vn_strategy(hp->h_devvp, nbio); return (0); }
/* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ int exfs_strategy(void *v) { struct vop_strategy_args *ap = v; struct buf *bp = ap->a_bp; struct vnode *vp = bp->b_vp; struct iso_node *ip; int error; int s; printf("vowel v exfs_strategy\n"); ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("exfs_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL); if (error) { bp->b_error = error; bp->b_flags |= B_ERROR; s = splbio(); biodone(bp); splx(s); return (error); } if ((long)bp->b_blkno == -1) clrbuf(bp); } if ((long)bp->b_blkno == -1) { s = splbio(); biodone(bp); splx(s); return (0); } vp = ip->i_devvp; bp->b_dev = vp->v_rdev; (vp->v_op->vop_strategy)(ap); return (0); }
static bool vnode_strategy_probe(struct vnd_softc *vnd) { int error; daddr_t nbn; if (!vnode_has_strategy(vnd)) return false; /* Convert the first logical block number to its * physical block number. */ error = 0; vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_BMAP(vnd->sc_vp, 0, NULL, &nbn, NULL); VOP_UNLOCK(vnd->sc_vp); /* Test if that worked. */ if (error == 0 && (long)nbn == -1) return false; return true; }
/* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ int ufs_strategy(void *v) { struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap = v; struct buf *bp; struct vnode *vp; struct inode *ip; struct mount *mp; int error; bp = ap->a_bp; vp = ap->a_vp; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("ufs_strategy: spec"); KASSERT(bp->b_bcount != 0); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL); if (error) { bp->b_error = error; biodone(bp); return (error); } if (bp->b_blkno == -1) /* no valid data */ clrbuf(bp); } if (bp->b_blkno < 0) { /* block is not on disk */ biodone(bp); return (0); } vp = ip->i_devvp; error = VOP_STRATEGY(vp, bp); if (error) return error; if (!BUF_ISREAD(bp)) return 0; mp = wapbl_vptomp(vp); if (mp == NULL || mp->mnt_wapbl_replay == NULL || !WAPBL_REPLAY_ISOPEN(mp) || !WAPBL_REPLAY_CAN_READ(mp, bp->b_blkno, bp->b_bcount)) return 0; error = biowait(bp); if (error) return error; error = WAPBL_REPLAY_READ(mp, bp->b_data, bp->b_blkno, bp->b_bcount); if (error) { mutex_enter(&bufcache_lock); SET(bp->b_cflags, BC_INVAL); mutex_exit(&bufcache_lock); } return error; }
static boolean_t vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { struct vnode *vp = object->handle; daddr_t bn; int err; daddr_t reqblock; int poff; int bsize; int pagesperblock, blocksperpage; VM_OBJECT_ASSERT_WLOCKED(object); /* * If no vp or vp is doomed or marked transparent to VM, we do not * have the page. */ if (vp == NULL || vp->v_iflag & VI_DOOMED) return FALSE; /* * If the offset is beyond end of file we do * not have the page. */ if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size) return FALSE; bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; blocksperpage = 0; if (pagesperblock > 0) { reqblock = pindex / pagesperblock; } else { blocksperpage = (PAGE_SIZE / bsize); reqblock = pindex * blocksperpage; } VM_OBJECT_WUNLOCK(object); err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before); VM_OBJECT_WLOCK(object); if (err) return TRUE; if (bn == -1) return FALSE; if (pagesperblock > 0) { poff = pindex - (reqblock * pagesperblock); if (before) { *before *= pagesperblock; *before += poff; } if (after) { /* * The BMAP vop can report a partial block in the * 'after', but must not report blocks after EOF. * Assert the latter, and truncate 'after' in case * of the former. */ KASSERT((reqblock + *after) * pagesperblock < roundup2(object->size, pagesperblock), ("%s: reqblock %jd after %d size %ju", __func__, (intmax_t )reqblock, *after, (uintmax_t )object->size)); *after *= pagesperblock; *after += pagesperblock - (poff + 1); if (pindex + *after >= object->size) *after = object->size - 1 - pindex; } } else { if (before) { *before /= blocksperpage; } if (after) { *after /= blocksperpage; } } return TRUE; }
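/*
 * Worked example (hypothetical numbers) of the block-to-page conversion
 * vnode_pager_haspage() above applies to VOP_BMAP's 'before' and 'after'
 * results when the filesystem block is larger than a page: with 16 KB
 * blocks and 4 KB pages, one contiguous block on each side of the block
 * holding page 10 becomes 6 pages before and 5 pages after it.
 */
#include <stdio.h>

#define EX_PAGE_SIZE	4096

int
main(void)
{
	int bsize = 16384;				/* f_iosize */
	int pagesperblock = bsize / EX_PAGE_SIZE;	/* 4 */
	long pindex = 10;				/* requested page */
	long reqblock = pindex / pagesperblock;		/* fs block 2 */
	int before = 1, after = 1;			/* from VOP_BMAP, in blocks */
	int poff = (int)(pindex - reqblock * pagesperblock);	/* page 2 of block */

	before = before * pagesperblock + poff;
	after = after * pagesperblock + pagesperblock - (poff + 1);
	printf("%d contiguous pages before, %d after page %ld\n",
	    before, after, pindex);
	return 0;
}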
/* * Handles the read/write request given in 'bp' using the vnode's VOP_BMAP * and VOP_STRATEGY operations. * * 'obp' is a pointer to the original request fed to the vnd device. */ static void handle_with_strategy(struct vnd_softc *vnd, const struct buf *obp, struct buf *bp) { int bsize, error, flags, skipped; size_t resid, sz; off_t bn, offset; struct vnode *vp; flags = obp->b_flags; if (!(flags & B_READ)) { vp = bp->b_vp; mutex_enter(vp->v_interlock); vp->v_numoutput++; mutex_exit(vp->v_interlock); } /* convert to a byte offset within the file. */ bn = obp->b_rawblkno * vnd->sc_dkdev.dk_label->d_secsize; bsize = vnd->sc_vp->v_mount->mnt_stat.f_iosize; skipped = 0; /* * Break the request into bsize pieces and feed them * sequentially using VOP_BMAP/VOP_STRATEGY. * We do it this way to keep from flooding NFS servers if we * are connected to an NFS file. This places the burden on * the client rather than the server. */ error = 0; bp->b_resid = bp->b_bcount; for (offset = 0, resid = bp->b_resid; resid; resid -= sz, offset += sz) { struct buf *nbp; daddr_t nbn; int off, nra; nra = 0; vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_BMAP(vnd->sc_vp, bn / bsize, &vp, &nbn, &nra); VOP_UNLOCK(vnd->sc_vp); if (error == 0 && (long)nbn == -1) error = EIO; /* * If there was an error or a hole in the file...punt. * Note that we may have to wait for any operations * that we have already fired off before releasing * the buffer. * * XXX we could deal with holes here but it would be * a hassle (in the write case). */ if (error) { skipped += resid; break; } #ifdef DEBUG if (!dovndcluster) nra = 0; #endif off = bn % bsize; sz = MIN(((off_t)1 + nra) * bsize - off, resid); #ifdef DEBUG if (vnddebug & VDB_IO) printf("vndstrategy: vp %p/%p bn 0x%qx/0x%" PRIx64 " sz 0x%zx\n", vnd->sc_vp, vp, (long long)bn, nbn, sz); #endif nbp = getiobuf(vp, true); nestiobuf_setup(bp, nbp, offset, sz); nbp->b_blkno = nbn + btodb(off); #if 0 /* XXX #ifdef DEBUG */ if (vnddebug & VDB_IO) printf("vndstart(%ld): bp %p vp %p blkno " "0x%" PRIx64 " flags %x addr %p cnt 0x%x\n", (long) (vnd-vnd_softc), &nbp->vb_buf, nbp->vb_buf.b_vp, nbp->vb_buf.b_blkno, nbp->vb_buf.b_flags, nbp->vb_buf.b_data, nbp->vb_buf.b_bcount); #endif VOP_STRATEGY(vp, nbp); bn += sz; } nestiobuf_done(bp, skipped, error); }
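/*
 * Worked example (hypothetical values) of the per-chunk size computation in
 * handle_with_strategy() above: VOP_BMAP reports 'nra' further contiguous
 * blocks, so one VOP_STRATEGY call can cover (1 + nra) filesystem blocks
 * minus the partial block at the start, capped by the bytes left to move.
 */
#include <stdio.h>

#define EX_MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	long long bn = 10240;		/* byte offset of this chunk in the file */
	int bsize = 8192;		/* f_iosize of the backing filesystem */
	int nra = 3;			/* read-ahead run from VOP_BMAP */
	long long resid = 65536;	/* bytes remaining in the request */
	long long off = bn % bsize;	/* 2048 bytes into the fs block */
	long long sz = EX_MIN((long long)(1 + nra) * bsize - off, resid);

	printf("this VOP_STRATEGY call moves %lld bytes\n", sz);	/* 30720 */
	return 0;
}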
int lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt) { BLOCK_INFO *blkp; IFILE *ifp; struct buf *bp; struct inode *ip = NULL; struct lfs *fs; struct mount *mntp; struct ulfsmount *ump; struct vnode *vp; ino_t lastino; daddr_t v_daddr; int cnt, error; int numrefed = 0; lfs_cleaner_pid = p->p_pid; if ((mntp = vfs_getvfs(fsidp)) == NULL) return (ENOENT); ump = VFSTOULFS(mntp); if ((error = vfs_busy(mntp, NULL)) != 0) return (error); cnt = blkcnt; fs = VFSTOULFS(mntp)->um_lfs; error = 0; /* these were inside the initialization for the for loop */ v_daddr = LFS_UNUSED_DADDR; lastino = LFS_UNUSED_INUM; for (blkp = blkiov; cnt--; ++blkp) { /* * Get the IFILE entry (only once) and see if the file still * exists. */ if (lastino != blkp->bi_inode) { /* * Finish the old file, if there was one. The presence * of a usable vnode in vp is signaled by a valid * v_daddr. */ if (v_daddr != LFS_UNUSED_DADDR) { lfs_vunref(vp); if (VTOI(vp)->i_lfs_iflags & LFSI_BMAP) { mutex_enter(vp->v_interlock); if (vget(vp, LK_NOWAIT) == 0) { if (! vrecycle(vp)) vrele(vp); } } numrefed--; } /* * Start a new file */ lastino = blkp->bi_inode; if (blkp->bi_inode == LFS_IFILE_INUM) v_daddr = fs->lfs_idaddr; else { LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); v_daddr = ifp->if_daddr; brelse(bp, 0); } if (v_daddr == LFS_UNUSED_DADDR) { blkp->bi_daddr = LFS_UNUSED_DADDR; continue; } /* * A regular call to VFS_VGET could deadlock * here. Instead, we try an unlocked access. */ mutex_enter(&ulfs_ihash_lock); vp = ulfs_ihashlookup(ump->um_dev, blkp->bi_inode); if (vp != NULL && !(vp->v_iflag & VI_XLOCK)) { ip = VTOI(vp); mutex_enter(vp->v_interlock); mutex_exit(&ulfs_ihash_lock); if (lfs_vref(vp)) { v_daddr = LFS_UNUSED_DADDR; continue; } numrefed++; } else { mutex_exit(&ulfs_ihash_lock); /* * Don't VFS_VGET if we're being unmounted, * since we hold vfs_busy(). */ if (mntp->mnt_iflag & IMNT_UNMOUNT) { v_daddr = LFS_UNUSED_DADDR; continue; } error = VFS_VGET(mntp, blkp->bi_inode, &vp); if (error) { DLOG((DLOG_CLEAN, "lfs_bmapv: vget ino" "%d failed with %d", blkp->bi_inode,error)); v_daddr = LFS_UNUSED_DADDR; continue; } else { KASSERT(VOP_ISLOCKED(vp)); VTOI(vp)->i_lfs_iflags |= LFSI_BMAP; VOP_UNLOCK(vp); numrefed++; } } ip = VTOI(vp); } else if (v_daddr == LFS_UNUSED_DADDR) { /* * This can only happen if the vnode is dead. * Keep going. Note that we DO NOT set the * bi_addr to anything -- if we failed to get * the vnode, for example, we want to assume * conservatively that all of its blocks *are* * located in the segment in question. * lfs_markv will throw them out if we are * wrong. */ /* blkp->bi_daddr = LFS_UNUSED_DADDR; */ continue; } /* Past this point we are guaranteed that vp, ip are valid. */ if (blkp->bi_lbn == LFS_UNUSED_LBN) { /* * We just want the inode address, which is * conveniently in v_daddr. */ blkp->bi_daddr = v_daddr; } else { daddr_t bi_daddr; /* XXX ondisk32 */ error = VOP_BMAP(vp, blkp->bi_lbn, NULL, &bi_daddr, NULL); if (error) { blkp->bi_daddr = LFS_UNUSED_DADDR; continue; } blkp->bi_daddr = LFS_DBTOFSB(fs, bi_daddr); /* Fill in the block size, too */ if (blkp->bi_lbn >= 0) blkp->bi_size = lfs_blksize(fs, ip, blkp->bi_lbn); else blkp->bi_size = fs->lfs_bsize; } } /* * Finish the old file, if there was one. The presence * of a usable vnode in vp is signaled by a valid v_daddr. */ if (v_daddr != LFS_UNUSED_DADDR) { lfs_vunref(vp); /* Recycle as above. */ if (ip->i_lfs_iflags & LFSI_BMAP) { mutex_enter(vp->v_interlock); if (vget(vp, LK_NOWAIT) == 0) { if (! 
vrecycle(vp)) vrele(vp); } } numrefed--; } #ifdef DIAGNOSTIC if (numrefed != 0) panic("lfs_bmapv: numrefed=%d", numrefed); #endif vfs_unbusy(mntp, false, NULL); return 0; }
int lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt) { BLOCK_INFO *blkp; IFILE *ifp; struct buf *bp; struct inode *ip = NULL; struct lfs *fs; struct mount *mntp; struct vnode *vp = NULL; ino_t lastino; daddr_t b_daddr, v_daddr; int cnt, error; int do_again = 0; int numrefed = 0; ino_t maxino; size_t obsize; /* number of blocks/inodes that we have already bwrite'ed */ int nblkwritten, ninowritten; if ((mntp = vfs_getvfs(fsidp)) == NULL) return (ENOENT); fs = VFSTOULFS(mntp)->um_lfs; if (fs->lfs_ronly) return EROFS; maxino = (lfs_fragstoblks(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks) - fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb; cnt = blkcnt; if ((error = vfs_busy(mntp, NULL)) != 0) return (error); /* * This seglock is just to prevent the fact that we might have to sleep * from allowing the possibility that our blocks might become * invalid. * * It is also important to note here that unless we specify SEGM_CKP, * any Ifile blocks that we might be asked to clean will never get * to the disk. */ lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC); /* Mark blocks/inodes dirty. */ error = 0; /* these were inside the initialization for the for loop */ v_daddr = LFS_UNUSED_DADDR; lastino = LFS_UNUSED_INUM; nblkwritten = ninowritten = 0; for (blkp = blkiov; cnt--; ++blkp) { /* Bounds-check incoming data, avoid panic for failed VGET */ if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) { error = EINVAL; goto err3; } /* * Get the IFILE entry (only once) and see if the file still * exists. */ if (lastino != blkp->bi_inode) { /* * Finish the old file, if there was one. The presence * of a usable vnode in vp is signaled by a valid v_daddr. */ if (v_daddr != LFS_UNUSED_DADDR) { lfs_vunref(vp); numrefed--; } /* * Start a new file */ lastino = blkp->bi_inode; if (blkp->bi_inode == LFS_IFILE_INUM) v_daddr = fs->lfs_idaddr; else { LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); /* XXX fix for force write */ v_daddr = ifp->if_daddr; brelse(bp, 0); } if (v_daddr == LFS_UNUSED_DADDR) continue; /* Get the vnode/inode. */ error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr, &vp, (blkp->bi_lbn == LFS_UNUSED_LBN ? blkp->bi_bp : NULL)); if (!error) { numrefed++; } if (error) { DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget" " failed with %d (ino %d, segment %d)\n", error, blkp->bi_inode, lfs_dtosn(fs, blkp->bi_daddr))); /* * If we got EAGAIN, that means that the * Inode was locked. This is * recoverable: just clean the rest of * this segment, and let the cleaner try * again with another. (When the * cleaner runs again, this segment will * sort high on the list, since it is * now almost entirely empty.) But, we * still set v_daddr = LFS_UNUSED_ADDR * so as not to test this over and over * again. */ if (error == EAGAIN) { error = 0; do_again++; } #ifdef DIAGNOSTIC else if (error != ENOENT) panic("lfs_markv VFS_VGET FAILED"); #endif /* lastino = LFS_UNUSED_INUM; */ v_daddr = LFS_UNUSED_DADDR; vp = NULL; ip = NULL; continue; } ip = VTOI(vp); ninowritten++; } else if (v_daddr == LFS_UNUSED_DADDR) { /* * This can only happen if the vnode is dead (or * in any case we can't get it...e.g., it is * inlocked). Keep going. */ continue; } /* Past this point we are guaranteed that vp, ip are valid. */ /* Can't clean VU_DIROP directories in case of truncation */ /* XXX - maybe we should mark removed dirs specially? */ if (vp->v_type == VDIR && (vp->v_uflag & VU_DIROP)) { do_again++; continue; } /* If this BLOCK_INFO didn't contain a block, keep going. 
*/ if (blkp->bi_lbn == LFS_UNUSED_LBN) { /* XXX need to make sure that the inode gets written in this case */ /* XXX but only write the inode if it's the right one */ if (blkp->bi_inode != LFS_IFILE_INUM) { LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); if (ifp->if_daddr == blkp->bi_daddr) { mutex_enter(&lfs_lock); LFS_SET_UINO(ip, IN_CLEANING); mutex_exit(&lfs_lock); } brelse(bp, 0); } continue; } b_daddr = 0; if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) || LFS_DBTOFSB(fs, b_daddr) != blkp->bi_daddr) { if (lfs_dtosn(fs, LFS_DBTOFSB(fs, b_daddr)) == lfs_dtosn(fs, blkp->bi_daddr)) { DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %llx vs %llx\n", (long long)blkp->bi_daddr, (long long)LFS_DBTOFSB(fs, b_daddr))); } do_again++; continue; } /* * Check block sizes. The blocks being cleaned come from * disk, so they should have the same size as their on-disk * counterparts. */ if (blkp->bi_lbn >= 0) obsize = lfs_blksize(fs, ip, blkp->bi_lbn); else obsize = fs->lfs_bsize; /* Check for fragment size change */ if (blkp->bi_lbn >= 0 && blkp->bi_lbn < ULFS_NDADDR) { obsize = ip->i_lfs_fragsize[blkp->bi_lbn]; } if (obsize != blkp->bi_size) { DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %lld wrong" " size (%ld != %d), try again\n", blkp->bi_inode, (long long)blkp->bi_lbn, (long) obsize, blkp->bi_size)); do_again++; continue; } /* * If we get to here, then we are keeping the block. If * it is an indirect block, we want to actually put it * in the buffer cache so that it can be updated in the * finish_meta section. If it's not, we need to * allocate a fake buffer so that writeseg can perform * the copyin and write the buffer. */ if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) { /* Data Block */ bp = lfs_fakebuf(fs, vp, blkp->bi_lbn, blkp->bi_size, blkp->bi_bp); /* Pretend we used bread() to get it */ bp->b_blkno = LFS_FSBTODB(fs, blkp->bi_daddr); } else { /* Indirect block or ifile */ if (blkp->bi_size != fs->lfs_bsize && ip->i_number != LFS_IFILE_INUM) panic("lfs_markv: partial indirect block?" " size=%d\n", blkp->bi_size); bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0); if (!(bp->b_oflags & (BO_DONE|BO_DELWRI))) { /* * The block in question was not found * in the cache; i.e., the block that * getblk() returned is empty. So, we * can (and should) copy in the * contents, because we've already * determined that this was the right * version of this block on disk. * * And, it can't have changed underneath * us, because we have the segment lock. */ error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size); if (error) goto err2; } } if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0) goto err2; nblkwritten++; /* * XXX should account indirect blocks and ifile pages as well */ if (nblkwritten + lfs_lblkno(fs, ninowritten * sizeof (struct ulfs1_dinode)) > LFS_MARKV_MAX_BLOCKS) { DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n", nblkwritten, ninowritten)); lfs_segwrite(mntp, SEGM_CLEAN); nblkwritten = ninowritten = 0; } } /* * Finish the old file, if there was one */ if (v_daddr != LFS_UNUSED_DADDR) { lfs_vunref(vp); numrefed--; } #ifdef DIAGNOSTIC if (numrefed != 0) panic("lfs_markv: numrefed=%d", numrefed); #endif DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n", nblkwritten, ninowritten)); /* * The last write has to be SEGM_SYNC, because of calling semantics. * It also has to be SEGM_CKP, because otherwise we could write * over the newly cleaned data contained in a checkpoint, and then * we'd be unhappy at recovery time. 
*/ lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC); lfs_segunlock(fs); vfs_unbusy(mntp, false, NULL); if (error) return (error); else if (do_again) return EAGAIN; return 0; err2: DLOG((DLOG_CLEAN, "lfs_markv err2\n")); /* * XXX we're here because copyin() failed. * XXX it means that we can't trust the cleanerd. too bad. * XXX how can we recover from this? */ err3: /* * XXX should do segwrite here anyway? */ if (v_daddr != LFS_UNUSED_DADDR) { lfs_vunref(vp); --numrefed; } lfs_segunlock(fs); vfs_unbusy(mntp, false, NULL); #ifdef DIAGNOSTIC if (numrefed != 0) panic("lfs_markv: numrefed=%d", numrefed); #endif return (error); }
static int ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t loffset, size_t len, struct buf *bp) { int error; int iolen; int blockoff; int bsize; struct vnode *dp; int bforwards; bsize = vp->v_mount->mnt_stat.f_iosize; /* * Make sure it fits into the pbuf */ iolen = (int)(intptr_t)udata & PAGE_MASK; if (len + iolen > bp->b_kvasize) { len = bp->b_kvasize; if (iolen != 0) len -= PAGE_SIZE; } /* * Raw disk address is in bio2, but we wait for it to * chain to bio1. */ bp->b_flags &= ~B_ERROR; bp->b_loffset = loffset; bp->b_bio2.bio_offset = NOOFFSET; bp->b_bio1.bio_done = biodone_sync; bp->b_bio1.bio_flags |= BIO_SYNC; blockoff = (loffset % bsize) / DEV_BSIZE; error = VOP_BMAP(vp, bp->b_loffset, &bp->b_bio2.bio_offset, &bforwards, NULL, BUF_CMD_READ); if (error != 0) return error; dp = VTOI(vp)->i_devvp; if (bp->b_bio2.bio_offset == NOOFFSET) { /* * Fill holes with NULs to preserve semantics */ if (len + blockoff * DEV_BSIZE > bsize) len = bsize - blockoff * DEV_BSIZE; if (vmapbuf(bp, udata, len) < 0) return EFAULT; lwkt_user_yield(); bzero(bp->b_data, bp->b_bcount); /* Mark operation completed (similar to bufdone()) */ bp->b_resid = 0; return 0; } if (len + blockoff * DEV_BSIZE > bforwards) len = bforwards - blockoff * DEV_BSIZE; bp->b_bio2.bio_offset += blockoff * DEV_BSIZE; if (vmapbuf(bp, udata, len) < 0) return EFAULT; /* * Access the block device layer using the device vnode (dp) and * the translated block number (bio2) instead of the logical block * number (bio1). * * Even though we are bypassing the vnode layer, we still * want the vnode state to indicate that an I/O on its behalf * is in progress. */ bp->b_cmd = BUF_CMD_READ; bio_start_transaction(&bp->b_bio1, &vp->v_track_read); vn_strategy(dp, &bp->b_bio2); return 0; }
int bread_cluster(struct vnode *vp, daddr_t blkno, int size, struct buf **rbpp) { struct buf *bp, **xbpp; int howmany, maxra, i, inc; daddr_t sblkno; *rbpp = bio_doread(vp, blkno, size, 0); if (size != round_page(size)) goto out; if (VOP_BMAP(vp, blkno + 1, NULL, &sblkno, &maxra)) goto out; maxra++; if (sblkno == -1 || maxra < 2) goto out; howmany = MAXPHYS / size; if (howmany > maxra) howmany = maxra; xbpp = malloc((howmany + 1) * sizeof(struct buf *), M_TEMP, M_NOWAIT); if (xbpp == NULL) goto out; for (i = howmany - 1; i >= 0; i--) { size_t sz; /* * First buffer allocates big enough size to cover what * all the other buffers need. */ sz = i == 0 ? howmany * size : 0; xbpp[i] = buf_get(vp, blkno + i + 1, sz); if (xbpp[i] == NULL) { for (++i; i < howmany; i++) { SET(xbpp[i]->b_flags, B_INVAL); brelse(xbpp[i]); } free(xbpp, M_TEMP); goto out; } } bp = xbpp[0]; xbpp[howmany] = 0; inc = btodb(size); for (i = 1; i < howmany; i++) { bcstats.pendingreads++; bcstats.numreads++; SET(xbpp[i]->b_flags, B_READ | B_ASYNC); xbpp[i]->b_blkno = sblkno + (i * inc); xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size; xbpp[i]->b_data = NULL; xbpp[i]->b_pobj = bp->b_pobj; xbpp[i]->b_poffs = bp->b_poffs + (i * size); } KASSERT(bp->b_lblkno == blkno + 1); KASSERT(bp->b_vp == vp); bp->b_blkno = sblkno; SET(bp->b_flags, B_READ | B_ASYNC | B_CALL); bp->b_saveaddr = (void *)xbpp; bp->b_iodone = bread_cluster_callback; bcstats.pendingreads++; bcstats.numreads++; VOP_STRATEGY(bp); curproc->p_ru.ru_inblock++; out: return (biowait(*rbpp)); }
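/*
 * Small sketch (hypothetical values) of how bread_cluster() above sizes a
 * cluster: the run from VOP_BMAP(vp, blkno + 1, ...) counts the contiguous
 * blocks after the first, and the cluster is further capped by how many
 * buffers of this size fit in one MAXPHYS-sized transfer.
 */
#include <stdio.h>

#define EX_MAXPHYS	65536

int
main(void)
{
	int size = 8192;	/* size of each buffer being read */
	int maxra = 5;		/* run reported by VOP_BMAP for blkno + 1 */
	int howmany;

	maxra++;			/* include the first block: 6 total */
	howmany = EX_MAXPHYS / size;	/* at most 8 buffers per transfer */
	if (howmany > maxra)
		howmany = maxra;
	printf("cluster of %d buffers, %d bytes\n", howmany, howmany * size);
	return 0;
}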
/* * This is a slightly strangely structured routine. It always puts * all the pages for a vnode. It starts by releasing pages which * are clean and simultaneously looks up the smallest offset for a * dirty page beloning to the object. If there is no smallest offset, * all pages have been cleaned. Otherwise, it finds a contiguous range * of dirty pages starting from the smallest offset and writes them out. * After this the scan is restarted. */ int genfs_do_putpages(struct vnode *vp, off_t startoff, off_t endoff, int flags, struct vm_page **busypg) { char databuf[MAXPHYS]; struct uvm_object *uobj = &vp->v_uobj; struct vm_page *pg, *pg_next; voff_t smallest; voff_t curoff, bufoff; off_t eof; size_t xfersize; int bshift = vp->v_mount->mnt_fs_bshift; int bsize = 1 << bshift; #if 0 int async = (flags & PGO_SYNCIO) == 0; #else int async = 0; #endif restart: /* check if all pages are clean */ smallest = -1; for (pg = TAILQ_FIRST(&uobj->memq); pg; pg = pg_next) { pg_next = TAILQ_NEXT(pg, listq.queue); /* * XXX: this is not correct at all. But it's based on * assumptions we can make when accessing the pages * only through the file system and not through the * virtual memory subsystem. Well, at least I hope * so ;) */ KASSERT((pg->flags & PG_BUSY) == 0); /* If we can just dump the page, do so */ if (pg->flags & PG_CLEAN || flags & PGO_FREE) { uvm_pagefree(pg); continue; } if (pg->offset < smallest || smallest == -1) smallest = pg->offset; } /* all done? */ if (TAILQ_EMPTY(&uobj->memq)) { vp->v_iflag &= ~VI_ONWORKLST; mutex_exit(&uobj->vmobjlock); return 0; } /* we need to flush */ GOP_SIZE(vp, vp->v_writesize, &eof, 0); for (curoff = smallest; curoff < eof; curoff += PAGE_SIZE) { void *curva; if (curoff - smallest >= MAXPHYS) break; pg = uvm_pagelookup(uobj, curoff); if (pg == NULL) break; /* XXX: see comment about above KASSERT */ KASSERT((pg->flags & PG_BUSY) == 0); curva = databuf + (curoff-smallest); memcpy(curva, (void *)pg->uanon, PAGE_SIZE); rumpvm_enterva((vaddr_t)curva, pg); pg->flags |= PG_CLEAN; } KASSERT(curoff > smallest); mutex_exit(&uobj->vmobjlock); /* then we write */ for (bufoff = 0; bufoff < MIN(curoff-smallest,eof); bufoff+=xfersize) { struct buf *bp; struct vnode *devvp; daddr_t bn, lbn; int run, error; lbn = (smallest + bufoff) >> bshift; error = VOP_BMAP(vp, lbn, &devvp, &bn, &run); if (error) panic("%s: VOP_BMAP failed: %d", __func__, error); xfersize = MIN(((lbn+1+run) << bshift) - (smallest+bufoff), curoff - (smallest+bufoff)); /* * We might run across blocks which aren't allocated yet. * A reason might be e.g. the write operation being still * in the kernel page cache while truncate has already * enlarged the file. So just ignore those ranges. 
*/ if (bn == -1) continue; bp = getiobuf(vp, true); /* only write max what we are allowed to write */ bp->b_bcount = xfersize; if (smallest + bufoff + xfersize > eof) bp->b_bcount -= (smallest+bufoff+xfersize) - eof; bp->b_bcount = (bp->b_bcount + DEV_BSIZE-1) & ~(DEV_BSIZE-1); KASSERT(bp->b_bcount > 0); KASSERT(smallest >= 0); DPRINTF(("putpages writing from %x to %x (vp size %x)\n", (int)(smallest + bufoff), (int)(smallest + bufoff + bp->b_bcount), (int)eof)); bp->b_bufsize = round_page(bp->b_bcount); bp->b_lblkno = 0; bp->b_blkno = bn + (((smallest+bufoff)&(bsize-1))>>DEV_BSHIFT); bp->b_data = databuf + bufoff; bp->b_flags = B_WRITE; bp->b_cflags |= BC_BUSY; if (async) { bp->b_flags |= B_ASYNC; bp->b_iodone = uvm_aio_biodone; } vp->v_numoutput++; VOP_STRATEGY(devvp, bp); if (bp->b_error) panic("%s: VOP_STRATEGY lazy bum %d", __func__, bp->b_error); if (!async) putiobuf(bp); } rumpvm_flushva(); mutex_enter(&uobj->vmobjlock); goto restart; }
/* * miscfs/genfs getpages routine. This is a fair bit simpler than the * kernel counterpart since we're not being executed from a fault handler * and generally don't need to care about PGO_LOCKED or other cruft. * We do, however, need to care about page locking and we keep trying until * we get all the pages within the range. The object locking protocol * is the same as for the kernel: enter with the object lock held, * return with it released. */ int genfs_getpages(void *v) { struct vop_getpages_args /* { struct vnode *a_vp; voff_t a_offset; struct vm_page **a_m; int *a_count; int a_centeridx; vm_prot_t a_access_type; int a_advice; int a_flags; } */ *ap = v; struct vnode *vp = ap->a_vp; struct uvm_object *uobj = (struct uvm_object *)vp; struct vm_page *pg; voff_t curoff, endoff; off_t diskeof; size_t bufsize, remain, bufoff, xfersize; uint8_t *tmpbuf; int bshift = vp->v_mount->mnt_fs_bshift; int bsize = 1<<bshift; int count = *ap->a_count; int async; int i, error; /* * Ignore async for now, the structure of this routine * doesn't exactly allow for it ... */ async = 0; if (ap->a_centeridx != 0) panic("%s: centeridx != not supported", __func__); if (ap->a_access_type & VM_PROT_WRITE) vp->v_iflag |= VI_ONWORKLST; curoff = ap->a_offset & ~PAGE_MASK; for (i = 0; i < count; i++, curoff += PAGE_SIZE) { retrylookup: pg = uvm_pagelookup(uobj, curoff); if (pg == NULL) break; /* page is busy? we need to wait until it's released */ if (pg->flags & PG_BUSY) { pg->flags |= PG_WANTED; UVM_UNLOCK_AND_WAIT(pg, &uobj->vmobjlock, 0, "getpg",0); mutex_enter(&uobj->vmobjlock); goto retrylookup; } pg->flags |= PG_BUSY; if (pg->flags & PG_FAKE) break; ap->a_m[i] = pg; } /* got everything? if so, just return */ if (i == count) { mutex_exit(&uobj->vmobjlock); return 0; } /* * didn't? Ok, allocate backing pages. Start from the first * one we missed. */ for (; i < count; i++, curoff += PAGE_SIZE) { retrylookup2: pg = uvm_pagelookup(uobj, curoff); /* found? busy it and be happy */ if (pg) { if (pg->flags & PG_BUSY) { pg->flags = PG_WANTED; UVM_UNLOCK_AND_WAIT(pg, &uobj->vmobjlock, 0, "getpg2", 0); mutex_enter(&uobj->vmobjlock); goto retrylookup2; } else { pg->flags |= PG_BUSY; } /* not found? make a new page */ } else { pg = rumpvm_makepage(uobj, curoff); } ap->a_m[i] = pg; } /* * We have done all the clerical work and have all pages busied. * Release the vm object for other consumers. */ mutex_exit(&uobj->vmobjlock); /* * Now, we have all the pages here & busy. Transfer the range * starting from the missing offset and transfer into the * page buffers. 
*/ GOP_SIZE(vp, vp->v_size, &diskeof, 0); /* align to boundaries */ endoff = trunc_page(ap->a_offset) + (count << PAGE_SHIFT); endoff = MIN(endoff, ((vp->v_writesize+bsize-1) & ~(bsize-1))); curoff = ap->a_offset & ~(MAX(bsize,PAGE_SIZE)-1); remain = endoff - curoff; if (diskeof > curoff) remain = MIN(remain, diskeof - curoff); DPRINTF(("a_offset: %llx, startoff: 0x%llx, endoff 0x%llx\n", (unsigned long long)ap->a_offset, (unsigned long long)curoff, (unsigned long long)endoff)); /* read everything into a buffer */ bufsize = round_page(remain); tmpbuf = kmem_zalloc(bufsize, KM_SLEEP); for (bufoff = 0; remain; remain -= xfersize, bufoff+=xfersize) { struct buf *bp; struct vnode *devvp; daddr_t lbn, bn; int run; lbn = (curoff + bufoff) >> bshift; /* XXX: assume eof */ error = VOP_BMAP(vp, lbn, &devvp, &bn, &run); if (error) panic("%s: VOP_BMAP & lazy bum: %d", __func__, error); DPRINTF(("lbn %d (off %d) -> bn %d run %d\n", (int)lbn, (int)(curoff+bufoff), (int)bn, run)); xfersize = MIN(((lbn+1+run)<<bshift)-(curoff+bufoff), remain); /* hole? */ if (bn == -1) { memset(tmpbuf + bufoff, 0, xfersize); continue; } bp = getiobuf(vp, true); bp->b_data = tmpbuf + bufoff; bp->b_bcount = xfersize; bp->b_blkno = bn; bp->b_lblkno = 0; bp->b_flags = B_READ; bp->b_cflags = BC_BUSY; if (async) { bp->b_flags |= B_ASYNC; bp->b_iodone = uvm_aio_biodone; } VOP_STRATEGY(devvp, bp); if (bp->b_error) panic("%s: VOP_STRATEGY, lazy bum", __func__); if (!async) putiobuf(bp); } /* skip to beginning of pages we're interested in */ bufoff = 0; while (round_page(curoff + bufoff) < trunc_page(ap->a_offset)) bufoff += PAGE_SIZE; DPRINTF(("first page offset 0x%x\n", (int)(curoff + bufoff))); for (i = 0; i < count; i++, bufoff += PAGE_SIZE) { /* past our prime? */ if (curoff + bufoff >= endoff) break; pg = uvm_pagelookup(&vp->v_uobj, curoff + bufoff); KASSERT(pg); DPRINTF(("got page %p (off 0x%x)\n", pg, (int)(curoff+bufoff))); if (pg->flags & PG_FAKE) { memcpy((void *)pg->uanon, tmpbuf+bufoff, PAGE_SIZE); pg->flags &= ~PG_FAKE; pg->flags |= PG_CLEAN; } ap->a_m[i] = pg; } *ap->a_count = i; kmem_free(tmpbuf, bufsize); return 0; }
/* * Do clustered write for FFS. * * Three cases: * 1. Write is not sequential (write asynchronously) * Write is sequential: * 2. beginning of cluster - begin cluster * 3. middle of a cluster - add to cluster * 4. end of a cluster - asynchronously write cluster */ void cluster_write(struct buf *bp, struct cluster_info *ci, u_quad_t filesize) { struct vnode *vp; daddr64_t lbn; int maxclen, cursize; vp = bp->b_vp; lbn = bp->b_lblkno; /* Initialize vnode to beginning of file. */ if (lbn == 0) ci->ci_lasta = ci->ci_clen = ci->ci_cstart = ci->ci_lastw = 0; if (ci->ci_clen == 0 || lbn != ci->ci_lastw + 1 || (bp->b_blkno != ci->ci_lasta + btodb(bp->b_bcount))) { maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1; if (ci->ci_clen != 0) { /* * Next block is not sequential. * * If we are not writing at end of file, the process * seeked to another point in the file since its * last write, or we have reached our maximum * cluster size, then push the previous cluster. * Otherwise try reallocating to make it sequential. */ cursize = ci->ci_lastw - ci->ci_cstart + 1; if (((u_quad_t)(lbn + 1)) * bp->b_bcount != filesize || lbn != ci->ci_lastw + 1 || ci->ci_clen <= cursize) { cluster_wbuild(vp, NULL, bp->b_bcount, ci->ci_cstart, cursize, lbn); } else { struct buf **bpp, **endbp; struct cluster_save *buflist; buflist = cluster_collectbufs(vp, ci, bp); endbp = &buflist->bs_children [buflist->bs_nchildren - 1]; if (VOP_REALLOCBLKS(vp, buflist)) { /* * Failed, push the previous cluster. */ for (bpp = buflist->bs_children; bpp < endbp; bpp++) brelse(*bpp); free(buflist, M_VCLUSTER); cluster_wbuild(vp, NULL, bp->b_bcount, ci->ci_cstart, cursize, lbn); } else { /* * Succeeded, keep building cluster. */ for (bpp = buflist->bs_children; bpp <= endbp; bpp++) bdwrite(*bpp); free(buflist, M_VCLUSTER); ci->ci_lastw = lbn; ci->ci_lasta = bp->b_blkno; return; } } } /* * Consider beginning a cluster. * If at end of file, make cluster as large as possible, * otherwise find size of existing cluster. */ if ((u_quad_t)(lbn + 1) * (u_quad_t)bp->b_bcount != filesize && (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) || bp->b_blkno == -1)) { bawrite(bp); ci->ci_clen = 0; ci->ci_lasta = bp->b_blkno; ci->ci_cstart = lbn + 1; ci->ci_lastw = lbn; return; } ci->ci_clen = maxclen; if (maxclen == 0) { /* I/O not contiguous */ ci->ci_cstart = lbn + 1; bawrite(bp); } else { /* Wait for rest of cluster */ ci->ci_cstart = lbn; bdwrite(bp); } } else if (lbn == ci->ci_cstart + ci->ci_clen) { /* * At end of cluster, write it out. */ cluster_wbuild(vp, bp, bp->b_bcount, ci->ci_cstart, ci->ci_clen + 1, lbn); ci->ci_clen = 0; ci->ci_cstart = lbn + 1; } else /* * In the middle of a cluster, so just delay the * I/O for now. */ bdwrite(bp); ci->ci_lastw = lbn; ci->ci_lasta = bp->b_blkno; }
/* * small block filesystem vnode pager input */ static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m) { struct vnode *vp; struct bufobj *bo; struct buf *bp; struct sf_buf *sf; daddr_t fileaddr; vm_offset_t bsize; vm_page_bits_t bits; int error, i; error = 0; vp = object->handle; if (vp->v_iflag & VI_DOOMED) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; VOP_BMAP(vp, 0, &bo, 0, NULL, NULL); sf = sf_buf_alloc(m, 0); for (i = 0; i < PAGE_SIZE / bsize; i++) { vm_ooffset_t address; bits = vm_page_bits(i * bsize, bsize); if (m->valid & bits) continue; address = IDX_TO_OFF(m->pindex) + i * bsize; if (address >= object->un_pager.vnp.vnp_size) { fileaddr = -1; } else { error = vnode_pager_addr(vp, address, &fileaddr, NULL); if (error) break; } if (fileaddr != -1) { bp = getpbuf(&vnode_pbuf_freecnt); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; bp->b_iodone = bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize; bp->b_blkno = fileaddr; pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bsize; bp->b_bufsize = bsize; bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); /* do the input */ bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); bwait(bp, PVM, "vnsrd"); if ((bp->b_ioflags & BIO_ERROR) != 0) error = EIO; /* * free the buffer header back to the swap buffer pool */ bp->b_vp = NULL; pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); if (error) break; } else bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize); KASSERT((m->dirty & bits) == 0, ("vnode_pager_input_smlfs: page %p is dirty", m)); VM_OBJECT_WLOCK(object); m->valid |= bits; VM_OBJECT_WUNLOCK(object); } sf_buf_free(sf); if (error) { return VM_PAGER_ERROR; } return VM_PAGER_OK; }
/* * File table vnode ioctl routine. */ static int vn_ioctl(file_t *fp, u_long com, void *data) { struct vnode *vp = fp->f_data, *ovp; struct vattr vattr; int error; switch (vp->v_type) { case VREG: case VDIR: if (com == FIONREAD) { vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, kauth_cred_get()); VOP_UNLOCK(vp); if (error) return (error); *(int *)data = vattr.va_size - fp->f_offset; return (0); } if ((com == FIONWRITE) || (com == FIONSPACE)) { /* * Files don't have send queues, so there never * are any bytes in them, nor is there any * open space in them. */ *(int *)data = 0; return (0); } if (com == FIOGETBMAP) { daddr_t *block; if (*(daddr_t *)data < 0) return (EINVAL); block = (daddr_t *)data; return (VOP_BMAP(vp, *block, NULL, block, NULL)); } if (com == OFIOGETBMAP) { daddr_t ibn, obn; if (*(int32_t *)data < 0) return (EINVAL); ibn = (daddr_t)*(int32_t *)data; error = VOP_BMAP(vp, ibn, NULL, &obn, NULL); *(int32_t *)data = (int32_t)obn; return error; } if (com == FIONBIO || com == FIOASYNC) /* XXX */ return (0); /* XXX */ /* fall into ... */ case VFIFO: case VCHR: case VBLK: error = VOP_IOCTL(vp, com, data, fp->f_flag, kauth_cred_get()); if (error == 0 && com == TIOCSCTTY) { vref(vp); mutex_enter(proc_lock); ovp = curproc->p_session->s_ttyvp; curproc->p_session->s_ttyvp = vp; mutex_exit(proc_lock); if (ovp != NULL) vrele(ovp); } return (error); default: return (EPASSTHROUGH); } }
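/*
 * Minimal userland sketch of driving the FIOGETBMAP branch of vn_ioctl()
 * above.  It assumes NetBSD's FIOGETBMAP definition from <sys/filio.h>
 * (a daddr_t passed in with the logical block and returned holding the
 * physical block, or -1 for a hole); error handling is kept minimal.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/filio.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	daddr_t block = 0;	/* logical block in, physical block out */
	int fd;

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) == -1) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FIOGETBMAP, &block) == -1)
		perror("FIOGETBMAP");
	else if ((long long)block == -1)
		printf("block 0 is a hole\n");
	else
		printf("block 0 maps to device block %lld\n", (long long)block);
	close(fd);
	return 0;
}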
/* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_GETPAGES. */ int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg) { vm_object_t object; struct bufobj *bo; struct buf *bp; off_t foff; #ifdef INVARIANTS off_t blkno0; #endif int bsize, pagesperblock, *freecnt; int error, before, after, rbehind, rahead, poff, i; int bytecount, secmask; KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("%s does not support devices", __func__)); if (vp->v_iflag & VI_DOOMED) return (VM_PAGER_BAD); object = vp->v_object; foff = IDX_TO_OFF(m[0]->pindex); bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; KASSERT(foff < object->un_pager.vnp.vnp_size, ("%s: page %p offset beyond vp %p size", __func__, m[0], vp)); KASSERT(count <= sizeof(bp->b_pages), ("%s: requested %d pages", __func__, count)); /* * The last page has valid blocks. Invalid part can only * exist at the end of file, and the page is made fully valid * by zeroing in vm_pager_get_pages(). */ if (m[count - 1]->valid != 0 && --count == 0) { if (iodone != NULL) iodone(arg, m, 1, 0); return (VM_PAGER_OK); } /* * Synchronous and asynchronous paging operations use different * free pbuf counters. This is done to avoid asynchronous requests * to consume all pbufs. * Allocate the pbuf at the very beginning of the function, so that * if we are low on certain kind of pbufs don't even proceed to BMAP, * but sleep. */ freecnt = iodone != NULL ? &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt; bp = getpbuf(freecnt); /* * Get the underlying device blocks for the file with VOP_BMAP(). * If the file system doesn't support VOP_BMAP, use old way of * getting pages via VOP_READ. */ error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before); if (error == EOPNOTSUPP) { relpbuf(bp, freecnt); VM_OBJECT_WLOCK(object); for (i = 0; i < count; i++) { PCPU_INC(cnt.v_vnodein); PCPU_INC(cnt.v_vnodepgsin); error = vnode_pager_input_old(object, m[i]); if (error) break; } VM_OBJECT_WUNLOCK(object); return (error); } else if (error != 0) { relpbuf(bp, freecnt); return (VM_PAGER_ERROR); } /* * If the file system supports BMAP, but blocksize is smaller * than a page size, then use special small filesystem code. */ if (pagesperblock == 0) { relpbuf(bp, freecnt); for (i = 0; i < count; i++) { PCPU_INC(cnt.v_vnodein); PCPU_INC(cnt.v_vnodepgsin); error = vnode_pager_input_smlfs(object, m[i]); if (error) break; } return (error); } /* * A sparse file can be encountered only for a single page request, * which may not be preceded by call to vm_pager_haspage(). */ if (bp->b_blkno == -1) { KASSERT(count == 1, ("%s: array[%d] request to a sparse file %p", __func__, count, vp)); relpbuf(bp, freecnt); pmap_zero_page(m[0]); KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty", __func__, m[0])); VM_OBJECT_WLOCK(object); m[0]->valid = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(object); return (VM_PAGER_OK); } #ifdef INVARIANTS blkno0 = bp->b_blkno; #endif bp->b_blkno += (foff % bsize) / DEV_BSIZE; /* Recalculate blocks available after/before to pages. 
*/ poff = (foff % bsize) / PAGE_SIZE; before *= pagesperblock; before += poff; after *= pagesperblock; after += pagesperblock - (poff + 1); if (m[0]->pindex + after >= object->size) after = object->size - 1 - m[0]->pindex; KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d", __func__, count, after + 1)); after -= count - 1; /* Trim requested rbehind/rahead to possible values. */ rbehind = a_rbehind ? *a_rbehind : 0; rahead = a_rahead ? *a_rahead : 0; rbehind = min(rbehind, before); rbehind = min(rbehind, m[0]->pindex); rahead = min(rahead, after); rahead = min(rahead, object->size - m[count - 1]->pindex); /* * Check that total amount of pages fit into buf. Trim rbehind and * rahead evenly if not. */ if (rbehind + rahead + count > nitems(bp->b_pages)) { int trim, sum; trim = rbehind + rahead + count - nitems(bp->b_pages) + 1; sum = rbehind + rahead; if (rbehind == before) { /* Roundup rbehind trim to block size. */ rbehind -= roundup(trim * rbehind / sum, pagesperblock); if (rbehind < 0) rbehind = 0; } else rbehind -= trim * rbehind / sum; rahead -= trim * rahead / sum; } KASSERT(rbehind + rahead + count <= nitems(bp->b_pages), ("%s: behind %d ahead %d count %d", __func__, rbehind, rahead, count)); /* * Fill in the bp->b_pages[] array with requested and optional * read behind or read ahead pages. Read behind pages are looked * up in a backward direction, down to a first cached page. Same * for read ahead pages, but there is no need to shift the array * in case of encountering a cached page. */ i = bp->b_npages = 0; if (rbehind) { vm_pindex_t startpindex, tpindex; vm_page_t p; VM_OBJECT_WLOCK(object); startpindex = m[0]->pindex - rbehind; if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL && p->pindex >= startpindex) startpindex = p->pindex + 1; /* tpindex is unsigned; beware of numeric underflow. */ for (tpindex = m[0]->pindex - 1; tpindex >= startpindex && tpindex < m[0]->pindex; tpindex--, i++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) { /* Shift the array. */ for (int j = 0; j < i; j++) bp->b_pages[j] = bp->b_pages[j + tpindex + 1 - startpindex]; break; } bp->b_pages[tpindex - startpindex] = p; } bp->b_pgbefore = i; bp->b_npages += i; bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE; } else bp->b_pgbefore = 0; /* Requested pages. */ for (int j = 0; j < count; j++, i++) bp->b_pages[i] = m[j]; bp->b_npages += count; if (rahead) { vm_pindex_t endpindex, tpindex; vm_page_t p; if (!VM_OBJECT_WOWNED(object)) VM_OBJECT_WLOCK(object); endpindex = m[count - 1]->pindex + rahead + 1; if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL && p->pindex < endpindex) endpindex = p->pindex; if (endpindex > object->size) endpindex = object->size; for (tpindex = m[count - 1]->pindex + 1; tpindex < endpindex; i++, tpindex++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) break; bp->b_pages[i] = p; } bp->b_pgafter = i - bp->b_npages; bp->b_npages = i; } else bp->b_pgafter = 0; if (VM_OBJECT_WOWNED(object)) VM_OBJECT_WUNLOCK(object); /* Report back actual behind/ahead read. */ if (a_rbehind) *a_rbehind = bp->b_pgbefore; if (a_rahead) *a_rahead = bp->b_pgafter; #ifdef INVARIANTS KASSERT(bp->b_npages <= nitems(bp->b_pages), ("%s: buf %p overflowed", __func__, bp)); for (int j = 1; j < bp->b_npages; j++) KASSERT(bp->b_pages[j]->pindex - 1 == bp->b_pages[j - 1]->pindex, ("%s: pages array not consecutive, bp %p", __func__, bp)); #endif /* * Recalculate first offset and bytecount with regards to read behind. 
* Truncate bytecount to vnode real size and round up physical size * for real devices. */ foff = IDX_TO_OFF(bp->b_pages[0]->pindex); bytecount = bp->b_npages << PAGE_SHIFT; if ((foff + bytecount) > object->un_pager.vnp.vnp_size) bytecount = object->un_pager.vnp.vnp_size - foff; secmask = bo->bo_bsize - 1; KASSERT(secmask < PAGE_SIZE && secmask > 0, ("%s: sector size %d too large", __func__, secmask + 1)); bytecount = (bytecount + secmask) & ~secmask; /* * And map the pages to be read into the kva, if the filesystem * requires mapped buffers. */ if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } /* Build a minimal buffer header. */ bp->b_iocmd = BIO_READ; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount; bp->b_iooffset = dbtob(bp->b_blkno); KASSERT(IDX_TO_OFF(m[0]->pindex - bp->b_pages[0]->pindex) == (blkno0 - bp->b_blkno) * DEV_BSIZE + IDX_TO_OFF(m[0]->pindex) % bsize, ("wrong offsets bsize %d m[0] %ju b_pages[0] %ju " "blkno0 %ju b_blkno %ju", bsize, (uintmax_t)m[0]->pindex, (uintmax_t)bp->b_pages[0]->pindex, (uintmax_t)blkno0, (uintmax_t)bp->b_blkno)); atomic_add_long(&runningbufspace, bp->b_runningbufspace); PCPU_INC(cnt.v_vnodein); PCPU_ADD(cnt.v_vnodepgsin, bp->b_npages); if (iodone != NULL) { /* async */ bp->b_pgiodone = iodone; bp->b_caller1 = arg; bp->b_iodone = vnode_pager_generic_getpages_done_async; bp->b_flags |= B_ASYNC; BUF_KERNPROC(bp); bstrategy(bp); return (VM_PAGER_OK); } else { bp->b_iodone = bdone; bstrategy(bp); bwait(bp, PVM, "vnread"); error = vnode_pager_generic_getpages_done(bp); for (i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } }
int lfs_bmapv(struct lwp *l, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt) { BLOCK_INFO *blkp; IFILE *ifp; struct buf *bp; struct inode *ip = NULL; struct lfs *fs; struct mount *mntp; struct ulfsmount *ump; struct vnode *vp; ino_t lastino; daddr_t v_daddr; int cnt, error; int numrefed = 0; error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS, KAUTH_REQ_SYSTEM_LFS_BMAPV, NULL, NULL, NULL); if (error) return (error); if ((mntp = vfs_getvfs(fsidp)) == NULL) return (ENOENT); if ((error = vfs_busy(mntp, NULL)) != 0) return (error); ump = VFSTOULFS(mntp); fs = ump->um_lfs; if (fs->lfs_cleaner_thread == NULL) fs->lfs_cleaner_thread = curlwp; KASSERT(fs->lfs_cleaner_thread == curlwp); cnt = blkcnt; error = 0; /* these were inside the initialization for the for loop */ vp = NULL; v_daddr = LFS_UNUSED_DADDR; lastino = LFS_UNUSED_INUM; for (blkp = blkiov; cnt--; ++blkp) { /* * Get the IFILE entry (only once) and see if the file still * exists. */ if (lastino != blkp->bi_inode) { /* * Finish the old file, if there was one. */ if (vp != NULL) { vput(vp); vp = NULL; numrefed--; } /* * Start a new file */ lastino = blkp->bi_inode; if (blkp->bi_inode == LFS_IFILE_INUM) v_daddr = lfs_sb_getidaddr(fs); else { LFS_IENTRY(ifp, fs, blkp->bi_inode, bp); v_daddr = lfs_if_getdaddr(fs, ifp); brelse(bp, 0); } if (v_daddr == LFS_UNUSED_DADDR) { blkp->bi_daddr = LFS_UNUSED_DADDR; continue; } error = lfs_fastvget(mntp, blkp->bi_inode, NULL, LK_SHARED, &vp); if (error) { DLOG((DLOG_CLEAN, "lfs_bmapv: lfs_fastvget ino" "%d failed with %d", blkp->bi_inode,error)); KASSERT(vp == NULL); continue; } else { KASSERT(VOP_ISLOCKED(vp)); numrefed++; } ip = VTOI(vp); } else if (vp == NULL) { /* * This can only happen if the vnode is dead. * Keep going. Note that we DO NOT set the * bi_addr to anything -- if we failed to get * the vnode, for example, we want to assume * conservatively that all of its blocks *are* * located in the segment in question. * lfs_markv will throw them out if we are * wrong. */ continue; } /* Past this point we are guaranteed that vp, ip are valid. */ if (blkp->bi_lbn == LFS_UNUSED_LBN) { /* * We just want the inode address, which is * conveniently in v_daddr. */ blkp->bi_daddr = v_daddr; } else { daddr_t bi_daddr; error = VOP_BMAP(vp, blkp->bi_lbn, NULL, &bi_daddr, NULL); if (error) { blkp->bi_daddr = LFS_UNUSED_DADDR; continue; } blkp->bi_daddr = LFS_DBTOFSB(fs, bi_daddr); /* Fill in the block size, too */ if (blkp->bi_lbn >= 0) blkp->bi_size = lfs_blksize(fs, ip, blkp->bi_lbn); else blkp->bi_size = lfs_sb_getbsize(fs); } } /* * Finish the old file, if there was one. */ if (vp != NULL) { vput(vp); vp = NULL; numrefed--; } #ifdef DIAGNOSTIC if (numrefed != 0) panic("lfs_bmapv: numrefed=%d", numrefed); #endif vfs_unbusy(mntp, false, NULL); return 0; }
/* * Read data to a buf, including read-ahead if we find this to be beneficial. * cluster_read replaces bread. */ int cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, struct ucred *cred, long totread, int seqcount, int gbflags, struct buf **bpp) { struct buf *bp, *rbp, *reqbp; struct bufobj *bo; daddr_t blkno, origblkno; int maxra, racluster; int error, ncontig; int i; error = 0; bo = &vp->v_bufobj; if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; /* * Try to limit the amount of read-ahead by a few * ad-hoc parameters. This needs work!!! */ racluster = vp->v_mount->mnt_iosize_max / size; maxra = seqcount; maxra = min(read_max, maxra); maxra = min(nbuf/8, maxra); if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize) maxra = (filesize / size) - lblkno; /* * get the requested block */ *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags); if (bp == NULL) return (EBUSY); origblkno = lblkno; /* * if it is in the cache, then check to see if the reads have been * sequential. If they have, then try some read-ahead, otherwise * back-off on prospective read-aheads. */ if (bp->b_flags & B_CACHE) { if (!seqcount) { return 0; } else if ((bp->b_flags & B_RAM) == 0) { return 0; } else { bp->b_flags &= ~B_RAM; BO_RLOCK(bo); for (i = 1; i < maxra; i++) { /* * Stop if the buffer does not exist or it * is invalid (about to go away?) */ rbp = gbincore(&vp->v_bufobj, lblkno+i); if (rbp == NULL || (rbp->b_flags & B_INVAL)) break; /* * Set another read-ahead mark so we know * to check again. (If we can lock the * buffer without waiting) */ if ((((i % racluster) == (racluster - 1)) || (i == (maxra - 1))) && (0 == BUF_LOCK(rbp, LK_EXCLUSIVE | LK_NOWAIT, NULL))) { rbp->b_flags |= B_RAM; BUF_UNLOCK(rbp); } } BO_RUNLOCK(bo); if (i >= maxra) { return 0; } lblkno += i; } reqbp = bp = NULL; /* * If it isn't in the cache, then get a chunk from * disk if sequential, otherwise just get the block. */ } else { off_t firstread = bp->b_offset; int nblks; long minread; KASSERT(bp->b_offset != NOOFFSET, ("cluster_read: no buffer offset")); ncontig = 0; /* * Adjust totread if needed */ minread = read_min * size; if (minread > totread) totread = minread; /* * Compute the total number of blocks that we should read * synchronously. */ if (firstread + totread > filesize) totread = filesize - firstread; nblks = howmany(totread, size); if (nblks > racluster) nblks = racluster; /* * Now compute the number of contiguous blocks. */ if (nblks > 1) { error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); /* * If this failed to map just do the original block. */ if (error || blkno == -1) ncontig = 0; } /* * If we have contiguous data available do a cluster * otherwise just read the requested block. */ if (ncontig) { /* Account for our first block. */ ncontig = min(ncontig + 1, nblks); if (ncontig < nblks) nblks = ncontig; bp = cluster_rbuild(vp, filesize, lblkno, blkno, size, nblks, gbflags, bp); lblkno += (bp->b_bufsize / size); } else { bp->b_flags |= B_RAM; bp->b_iocmd = BIO_READ; lblkno += 1; } } /* * handle the synchronous read so that it is available ASAP. 
*/ if (bp) { if ((bp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(bp, 0); } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } /* * If we have been doing sequential I/O, then do some read-ahead. */ while (lblkno < (origblkno + maxra)) { error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); if (error) break; if (blkno == -1) break; /* * We could throttle ncontig here by maxra but we might as * well read the data if it is contiguous. We're throttled * by racluster anyway. */ if (ncontig) { ncontig = min(ncontig + 1, racluster); rbp = cluster_rbuild(vp, filesize, lblkno, blkno, size, ncontig, gbflags, NULL); lblkno += (rbp->b_bufsize / size); if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } } else { rbp = getblk(vp, lblkno, size, 0, 0, gbflags); lblkno += 1; if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } rbp->b_flags |= B_ASYNC | B_RAM; rbp->b_iocmd = BIO_READ; rbp->b_blkno = blkno; } if (rbp->b_flags & B_CACHE) { rbp->b_flags &= ~B_ASYNC; bqrelse(rbp); continue; } if ((rbp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(rbp, 0); } rbp->b_flags &= ~B_INVAL; rbp->b_ioflags &= ~BIO_ERROR; if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) BUF_KERNPROC(rbp); rbp->b_iooffset = dbtob(rbp->b_blkno); bstrategy(rbp); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, rbp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; } if (reqbp) { /* * Like bread, always brelse() the buffer when * returning an error. */ error = bufwait(reqbp); if (error != 0) { brelse(reqbp); *bpp = NULL; } } return (error); }
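/*
 * Editor's note: not FreeBSD code.  A small, hypothetical userland model of
 * the sizing arithmetic cluster_read performs before it touches the buffer
 * cache: how many blocks one cluster transfer may carry (racluster), how far
 * ahead it is willing to read (maxra), and how many blocks to read
 * synchronously right now (nblks).  The field names echo the kernel variables,
 * but plan_read itself is invented, and details such as the read_min
 * adjustment are deliberately omitted.
 */
#include <stdio.h>
#include <stdint.h>

static long min_l(long a, long b) { return a < b ? a : b; }

struct ra_plan {
	long racluster;	/* max blocks per cluster transfer */
	long maxra;	/* read-ahead window, in blocks */
	long nblks;	/* blocks to read synchronously now */
};

static struct ra_plan
plan_read(uint64_t filesize, long bsize, int64_t lblkno, long totread,
    int seqcount, long read_max, long nbuf, long iosize_max)
{
	struct ra_plan p;

	p.racluster = iosize_max / bsize;
	p.maxra = min_l(min_l(seqcount, read_max), nbuf / 8);
	if ((uint64_t)(lblkno + p.maxra + 1) * bsize > filesize)
		p.maxra = (long)(filesize / bsize - lblkno);

	/* Never read past EOF synchronously, and never more than one cluster. */
	if ((uint64_t)lblkno * bsize + totread > filesize)
		totread = (long)(filesize - (uint64_t)lblkno * bsize);
	p.nblks = (totread + bsize - 1) / bsize;	/* howmany(totread, bsize) */
	if (p.nblks > p.racluster)
		p.nblks = p.racluster;
	return p;
}

int
main(void)
{
	/* 10 MB file, 16 KB blocks, 64 KB requested at block 0, seqcount 16,
	 * read_max 64, 1024 buffers, 128 KB max transfer size. */
	struct ra_plan p = plan_read(10 << 20, 16384, 0, 65536, 16, 64, 1024, 131072);
	printf("racluster=%ld maxra=%ld nblks=%ld\n", p.racluster, p.maxra, p.nblks);
	return 0;
}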
/* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_GETPAGES. */ int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int bytecount, int reqpage, vop_getpages_iodone_t iodone, void *arg) { vm_object_t object; struct bufobj *bo; struct buf *bp; daddr_t firstaddr, reqblock; off_t foff, pib; int pbefore, pafter, i, size, bsize, first, last, *freecnt; int count, error, before, after, secmask; KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("vnode_pager_generic_getpages does not support devices")); if (vp->v_iflag & VI_DOOMED) return (VM_PAGER_BAD); object = vp->v_object; count = bytecount / PAGE_SIZE; bsize = vp->v_mount->mnt_stat.f_iosize; /* * Synchronous and asynchronous paging operations use different * free pbuf counters. This is done to avoid asynchronous requests * to consume all pbufs. * Allocate the pbuf at the very beginning of the function, so that * if we are low on certain kind of pbufs don't even proceed to BMAP, * but sleep. */ freecnt = iodone != NULL ? &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt; bp = getpbuf(freecnt); /* * Get the underlying device blocks for the file with VOP_BMAP(). * If the file system doesn't support VOP_BMAP, use old way of * getting pages via VOP_READ. */ error = VOP_BMAP(vp, IDX_TO_OFF(m[reqpage]->pindex) / bsize, &bo, &reqblock, &after, &before); if (error == EOPNOTSUPP) { relpbuf(bp, freecnt); VM_OBJECT_WLOCK(object); for (i = 0; i < count; i++) if (i != reqpage) { vm_page_lock(m[i]); vm_page_free(m[i]); vm_page_unlock(m[i]); } PCPU_INC(cnt.v_vnodein); PCPU_INC(cnt.v_vnodepgsin); error = vnode_pager_input_old(object, m[reqpage]); VM_OBJECT_WUNLOCK(object); return (error); } else if (error != 0) { relpbuf(bp, freecnt); vm_pager_free_nonreq(object, m, reqpage, count, FALSE); return (VM_PAGER_ERROR); /* * If the blocksize is smaller than a page size, then use * special small filesystem code. */ } else if ((PAGE_SIZE / bsize) > 1) { relpbuf(bp, freecnt); vm_pager_free_nonreq(object, m, reqpage, count, FALSE); PCPU_INC(cnt.v_vnodein); PCPU_INC(cnt.v_vnodepgsin); return (vnode_pager_input_smlfs(object, m[reqpage])); } /* * Since the caller has busied the requested page, that page's valid * field will not be changed by other threads. */ vm_page_assert_xbusied(m[reqpage]); /* * If we have a completely valid page available to us, we can * clean up and return. Otherwise we have to re-read the * media. */ if (m[reqpage]->valid == VM_PAGE_BITS_ALL) { relpbuf(bp, freecnt); vm_pager_free_nonreq(object, m, reqpage, count, FALSE); return (VM_PAGER_OK); } else if (reqblock == -1) { relpbuf(bp, freecnt); pmap_zero_page(m[reqpage]); KASSERT(m[reqpage]->dirty == 0, ("vnode_pager_generic_getpages: page %p is dirty", m)); VM_OBJECT_WLOCK(object); m[reqpage]->valid = VM_PAGE_BITS_ALL; vm_pager_free_nonreq(object, m, reqpage, count, TRUE); VM_OBJECT_WUNLOCK(object); return (VM_PAGER_OK); } else if (m[reqpage]->valid != 0) { VM_OBJECT_WLOCK(object); m[reqpage]->valid = 0; VM_OBJECT_WUNLOCK(object); } pib = IDX_TO_OFF(m[reqpage]->pindex) % bsize; pbefore = ((daddr_t)before * bsize + pib) / PAGE_SIZE; pafter = ((daddr_t)(after + 1) * bsize - pib) / PAGE_SIZE - 1; first = reqpage < pbefore ? 0 : reqpage - pbefore; last = reqpage + pafter >= count ? 
count - 1 : reqpage + pafter; if (first > 0 || last + 1 < count) { VM_OBJECT_WLOCK(object); for (i = 0; i < first; i++) { vm_page_lock(m[i]); vm_page_free(m[i]); vm_page_unlock(m[i]); } for (i = last + 1; i < count; i++) { vm_page_lock(m[i]); vm_page_free(m[i]); vm_page_unlock(m[i]); } VM_OBJECT_WUNLOCK(object); } /* * here on direct device I/O */ firstaddr = reqblock; firstaddr += pib / DEV_BSIZE; firstaddr -= IDX_TO_OFF(reqpage - first) / DEV_BSIZE; /* * The first and last page have been calculated now, move * input pages to be zero based, and adjust the count. */ m += first; reqpage -= first; count = last - first + 1; /* * calculate the file virtual address for the transfer */ foff = IDX_TO_OFF(m[0]->pindex); /* * calculate the size of the transfer */ size = count * PAGE_SIZE; KASSERT(count > 0, ("zero count")); if ((foff + size) > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - foff; KASSERT(size > 0, ("zero size")); /* * round up physical size for real devices. */ secmask = bo->bo_bsize - 1; KASSERT(secmask < PAGE_SIZE && secmask > 0, ("vnode_pager_generic_getpages: sector size %d too large", secmask + 1)); size = (size + secmask) & ~secmask; /* * and map the pages to be read into the kva, if the filesystem * requires mapped buffers. */ if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, m, count); } /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_blkno = firstaddr; pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = size; bp->b_bufsize = size; bp->b_runningbufspace = bp->b_bufsize; for (i = 0; i < count; i++) bp->b_pages[i] = m[i]; bp->b_npages = count; bp->b_pager.pg_reqpage = reqpage; atomic_add_long(&runningbufspace, bp->b_runningbufspace); PCPU_INC(cnt.v_vnodein); PCPU_ADD(cnt.v_vnodepgsin, count); /* do the input */ bp->b_iooffset = dbtob(bp->b_blkno); if (iodone != NULL) { /* async */ bp->b_pager.pg_iodone = iodone; bp->b_caller1 = arg; bp->b_iodone = vnode_pager_generic_getpages_done_async; bp->b_flags |= B_ASYNC; BUF_KERNPROC(bp); bstrategy(bp); /* Good bye! */ } else { bp->b_iodone = bdone; bstrategy(bp); bwait(bp, PVM, "vnread"); error = vnode_pager_generic_getpages_done(bp); for (i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); } return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); }
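/*
 * Editor's note: not FreeBSD code.  A hypothetical userland model of the
 * geometry worked out by vnode_pager_generic_getpages once VOP_BMAP has
 * answered: given the requested page, the filesystem block size, and the
 * contiguous run reported before/after the mapped block, it computes which
 * neighbouring pages can ride along in a single transfer and the device block
 * at which that transfer must start.  plan_getpages and pg_plan are invented
 * names; the PAGE_SIZE/DEV_BSIZE values are typical, not universal.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define DEV_BSIZE 512

struct pg_plan {
	int first, last;	/* page range [first, last] read in one I/O */
	int64_t firstaddr;	/* device block where the I/O starts */
};

static struct pg_plan
plan_getpages(int64_t pindex, int reqpage, int count, int bsize,
    int64_t reqblock, int before, int after)
{
	struct pg_plan p;
	/* Byte offset of the requested page within its filesystem block. */
	int64_t pib = (pindex * PAGE_SIZE) % bsize;
	int pbefore = (int)(((int64_t)before * bsize + pib) / PAGE_SIZE);
	int pafter = (int)(((int64_t)(after + 1) * bsize - pib) / PAGE_SIZE) - 1;

	p.first = reqpage < pbefore ? 0 : reqpage - pbefore;
	p.last = reqpage + pafter >= count ? count - 1 : reqpage + pafter;
	/* Start of the mapped block, plus the requested page's offset within
	 * it, minus the pages pulled in ahead of it. */
	p.firstaddr = reqblock + pib / DEV_BSIZE -
	    (int64_t)(reqpage - p.first) * PAGE_SIZE / DEV_BSIZE;
	return p;
}

int
main(void)
{
	/* 16 KB fs blocks; the requested page is pindex 10 (slot 2 of a
	 * 5-page array), mapped to device block 2048, with 1 contiguous fs
	 * block before and 2 after the one that contains it. */
	struct pg_plan p = plan_getpages(10, 2, 5, 16384, 2048, 1, 2);
	printf("pages [%d..%d], firstaddr=%lld\n", p.first, p.last,
	    (long long)p.firstaddr);
	return 0;
}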