static int
nvtruncbuf_bp_trunc(struct buf *bp, void *data)
{
	struct truncbuf_info *info = data;

	/*
	 * Do not try to use a buffer we cannot immediately lock,
	 * but sleep anyway to prevent a livelock.  The code will
	 * loop until all buffers can be acted upon.
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		atomic_add_int(&bp->b_refs, 1);
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0)
			BUF_UNLOCK(bp);
		atomic_subtract_int(&bp->b_refs, 1);
	} else if ((info->clean && (bp->b_flags & B_DELWRI)) ||
		   (info->clean == 0 && (bp->b_flags & B_DELWRI) == 0) ||
		   bp->b_vp != info->vp ||
		   nvtruncbuf_bp_trunc_cmp(bp, data)) {
		BUF_UNLOCK(bp);
	} else {
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
		brelse(bp);
	}
	lwkt_yield();
	return (1);
}
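/*
 * The callback above receives its context through an opaque pointer; a
 * minimal sketch of the structure it appears to assume is shown below.
 * The field set is inferred from the accesses in nvtruncbuf_bp_trunc()
 * and its _cmp helper and is an assumption, not the actual DragonFly
 * declaration, which may carry additional bookkeeping.
 */
struct truncbuf_info {
	struct vnode	*vp;		/* vnode whose buffer lists are being scanned */
	off_t		truncloc;	/* truncation offset tested by nvtruncbuf_bp_trunc_cmp() */
	int		clean;		/* nonzero: scanning the clean list, else the dirty list */
};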
/*
 * Assign a buffer for the given block.
 *
 * If a buffer for the block already exists in the cache, it is
 * returned.  Otherwise the least recently used buffer is taken from
 * the list and reassigned to the block.
 */
struct buf *
getblk(dev_t dev, int blkno)
{
	struct buf *bp;

	DPRINTF(VFSDB_BIO, ("getblk: dev=%llx blkno=%d\n",
			    (long long)dev, blkno));
 start:
	BIO_LOCK();
	bp = incore(dev, blkno);
	if (bp != NULL) {
		/* Block found in cache. */
		if (ISSET(bp->b_flags, B_BUSY)) {
			/*
			 * Wait until the buffer becomes ready.
			 */
			BIO_UNLOCK();
			BUF_LOCK(bp);
			BUF_UNLOCK(bp);
			/* Scan again if it's busy */
			goto start;
		}
		bio_remove(bp);
		SET(bp->b_flags, B_BUSY);
	} else {
		bp = bio_remove_head();
		if (ISSET(bp->b_flags, B_DELWRI)) {
			BIO_UNLOCK();
			bwrite(bp);
			goto start;
		}
		bp->b_flags = B_BUSY;
		bp->b_dev = dev;
		bp->b_blkno = blkno;
	}
	BUF_LOCK(bp);
	BIO_UNLOCK();
	DPRINTF(VFSDB_BIO, ("getblk: done bp=%p\n", bp));
	return bp;
}
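/*
 * A minimal usage sketch (an assumption for illustration, not part of the
 * code above): a bread()-style helper built on getblk().  The disk_read()
 * helper, the BSIZE transfer size, and the B_DONE/B_DELWRI bookkeeping are
 * hypothetical simplifications; the real read path of this buffer cache
 * may differ.
 */
static int
bread_sketch(dev_t dev, int blkno, struct buf **bpp)
{
	struct buf *bp;
	int error = 0;

	bp = getblk(dev, blkno);		/* returns a locked, busy buffer */
	if (!ISSET(bp->b_flags, B_DONE | B_DELWRI))
		/* Not valid in the cache: fill from the device (hypothetical helper). */
		error = disk_read(dev, bp->b_data, BSIZE, blkno);
	if (error == 0)
		SET(bp->b_flags, B_DONE);
	*bpp = bp;
	return error;
}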
/*
 * Flush all dirty (delayed-write) buffers to disk.
 * This is called when unmounting.
 */
void
bio_sync(void)
{
	struct buf *bp;
	int i;

 start:
	BIO_LOCK();
	for (i = 0; i < NBUFS; i++) {
		bp = &buf_table[i];
		if (ISSET(bp->b_flags, B_BUSY)) {
			/* Wait for the busy buffer and rescan. */
			BIO_UNLOCK();
			BUF_LOCK(bp);
			BUF_UNLOCK(bp);
			goto start;
		}
		if (ISSET(bp->b_flags, B_DELWRI))
			bwrite(bp);
	}
	BIO_UNLOCK();
}
/*
 * Read data to a buf, including read-ahead if we find this to be beneficial.
 * cluster_read replaces bread.
 */
int
cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
    struct ucred *cred, long totread, int seqcount, int gbflags,
    struct buf **bpp)
{
	struct buf *bp, *rbp, *reqbp;
	struct bufobj *bo;
	daddr_t blkno, origblkno;
	int maxra, racluster;
	int error, ncontig;
	int i;

	error = 0;
	bo = &vp->v_bufobj;
	if (!unmapped_buf_allowed)
		gbflags &= ~GB_UNMAPPED;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 */
	racluster = vp->v_mount->mnt_iosize_max / size;
	maxra = seqcount;
	maxra = min(read_max, maxra);
	maxra = min(nbuf/8, maxra);
	if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
		maxra = (filesize / size) - lblkno;

	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags);
	if (bp == NULL)
		return (EBUSY);
	origblkno = lblkno;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			return 0;
		} else {
			bp->b_flags &= ~B_RAM;
			BO_RLOCK(bo);
			for (i = 1; i < maxra; i++) {
				/*
				 * Stop if the buffer does not exist or it
				 * is invalid (about to go away?)
				 */
				rbp = gbincore(&vp->v_bufobj, lblkno+i);
				if (rbp == NULL || (rbp->b_flags & B_INVAL))
					break;

				/*
				 * Set another read-ahead mark so we know
				 * to check again. (If we can lock the
				 * buffer without waiting)
				 */
				if ((((i % racluster) == (racluster - 1)) ||
				    (i == (maxra - 1))) &&
				    (0 == BUF_LOCK(rbp,
					LK_EXCLUSIVE | LK_NOWAIT, NULL))) {
					rbp->b_flags |= B_RAM;
					BUF_UNLOCK(rbp);
				}
			}
			BO_RUNLOCK(bo);
			if (i >= maxra) {
				return 0;
			}
			lblkno += i;
		}
		reqbp = bp = NULL;
	/*
	 * If it isn't in the cache, then get a chunk from
	 * disk if sequential, otherwise just get the block.
	 */
	} else {
		off_t firstread = bp->b_offset;
		int nblks;
		long minread;

		KASSERT(bp->b_offset != NOOFFSET,
		    ("cluster_read: no buffer offset"));

		ncontig = 0;

		/*
		 * Adjust totread if needed
		 */
		minread = read_min * size;
		if (minread > totread)
			totread = minread;

		/*
		 * Compute the total number of blocks that we should read
		 * synchronously.
		 */
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		nblks = howmany(totread, size);
		if (nblks > racluster)
			nblks = racluster;

		/*
		 * Now compute the number of contiguous blocks.
		 */
		if (nblks > 1) {
			error = VOP_BMAP(vp, lblkno, NULL,
			    &blkno, &ncontig, NULL);
			/*
			 * If this failed to map just do the original block.
			 */
			if (error || blkno == -1)
				ncontig = 0;
		}

		/*
		 * If we have contiguous data available do a cluster
		 * otherwise just read the requested block.
		 */
		if (ncontig) {
			/* Account for our first block. */
			ncontig = min(ncontig + 1, nblks);
			if (ncontig < nblks)
				nblks = ncontig;
			bp = cluster_rbuild(vp, filesize, lblkno,
			    blkno, size, nblks, gbflags, bp);
			lblkno += (bp->b_bufsize / size);
		} else {
			bp->b_flags |= B_RAM;
			bp->b_iocmd = BIO_READ;
			lblkno += 1;
		}
	}

	/*
	 * handle the synchronous read so that it is available ASAP.
	 */
	if (bp) {
		if ((bp->b_flags & B_CLUSTER) == 0) {
			vfs_busy_pages(bp, 0);
		}
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
			BUF_KERNPROC(bp);
		bp->b_iooffset = dbtob(bp->b_blkno);
		bstrategy(bp);
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(curproc);
			racct_add_buf(curproc, bp, 0);
			PROC_UNLOCK(curproc);
		}
#endif /* RACCT */
		curthread->td_ru.ru_inblock++;
	}

	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 */
	while (lblkno < (origblkno + maxra)) {
		error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL);
		if (error)
			break;
		if (blkno == -1)
			break;

		/*
		 * We could throttle ncontig here by maxra but we might as
		 * well read the data if it is contiguous.  We're throttled
		 * by racluster anyway.
		 */
		if (ncontig) {
			ncontig = min(ncontig + 1, racluster);
			rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
			    size, ncontig, gbflags, NULL);
			lblkno += (rbp->b_bufsize / size);
			if (rbp->b_flags & B_DELWRI) {
				bqrelse(rbp);
				continue;
			}
		} else {
			rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
			lblkno += 1;
			if (rbp->b_flags & B_DELWRI) {
				bqrelse(rbp);
				continue;
			}
			rbp->b_flags |= B_ASYNC | B_RAM;
			rbp->b_iocmd = BIO_READ;
			rbp->b_blkno = blkno;
		}
		if (rbp->b_flags & B_CACHE) {
			rbp->b_flags &= ~B_ASYNC;
			bqrelse(rbp);
			continue;
		}
		if ((rbp->b_flags & B_CLUSTER) == 0) {
			vfs_busy_pages(rbp, 0);
		}
		rbp->b_flags &= ~B_INVAL;
		rbp->b_ioflags &= ~BIO_ERROR;
		if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
			BUF_KERNPROC(rbp);
		rbp->b_iooffset = dbtob(rbp->b_blkno);
		bstrategy(rbp);
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(curproc);
			racct_add_buf(curproc, rbp, 0);
			PROC_UNLOCK(curproc);
		}
#endif /* RACCT */
		curthread->td_ru.ru_inblock++;
	}

	if (reqbp) {
		/*
		 * Like bread, always brelse() the buffer when
		 * returning an error.
		 */
		error = bufwait(reqbp);
		if (error != 0) {
			brelse(reqbp);
			*bpp = NULL;
		}
	}
	return (error);
}
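/*
 * A sketch (an assumption, not taken from the code above) of how a
 * filesystem read routine might choose between plain bread_gb() and
 * cluster_read(): clustering is skipped when the mount disables it,
 * otherwise the file size, requested residual, and the sequential-access
 * hint are passed through.  The names ip, lbn, blkoffset, seqcount, and
 * the surrounding read loop are illustrative only.
 */
if (vp->v_mount->mnt_flag & MNT_NOCLUSTERR)
	error = bread_gb(vp, lbn, size, NOCRED, gbflags, &bp);
else
	error = cluster_read(vp, ip->i_size, lbn, size, NOCRED,
	    blkoffset + uio->uio_resid, seqcount, gbflags, &bp);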
/* Perform I/O on a subdisk */
void
sdio(struct buf *bp)
{
	int s;						/* spl */
	struct sd *sd;
	struct sdbuf *sbp;
	daddr_t endoffset;
	struct drive *drive;

#if VINUMDEBUG
	if (debug & DEBUG_LASTREQS)
		logrq(loginfo_sdio, (union rqinfou) bp, bp);
#endif
	sd = &SD[Sdno(bp->b_dev)];			/* point to the subdisk */
	drive = &DRIVE[sd->driveno];
	if (drive->state != drive_up) {
		if (sd->state >= sd_crashed) {
			if (bp->b_flags & B_READ)	/* reading, */
				set_sd_state(sd->sdno, sd_crashed, setstate_force);
			else
				set_sd_state(sd->sdno, sd_stale, setstate_force);
		}
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bp);
		return;
	}
	/*
	 * We allow access to any kind of subdisk as long as we can expect
	 * to get the I/O performed.
	 */
	if (sd->state < sd_empty) {			/* nothing to talk to, */
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bp);
		return;
	}
	/* Get a buffer */
	sbp = (struct sdbuf *) Malloc(sizeof(struct sdbuf));
	if (sbp == NULL) {
		bp->b_flags |= B_ERROR;
		bp->b_error = ENOMEM;
		biodone(bp);
		return;
	}
	bzero(sbp, sizeof(struct sdbuf));		/* start with nothing */
	sbp->b.b_flags = bp->b_flags | B_CALL;		/* inform us when it's done */
	sbp->b.b_bufsize = bp->b_bufsize;		/* buffer size */
	sbp->b.b_bcount = bp->b_bcount;			/* number of bytes to transfer */
	sbp->b.b_resid = bp->b_resid;			/* and amount waiting */
	sbp->b.b_dev = DRIVE[sd->driveno].dev;		/* device */
	sbp->b.b_data = bp->b_data;			/* data buffer */
	sbp->b.b_blkno = bp->b_blkno + sd->driveoffset;
	sbp->b.b_iodone = sdio_done;			/* come here on completion */
	BUF_LOCKINIT(&sbp->b);				/* get a lock for the buffer */
	BUF_LOCK(&sbp->b, LK_EXCLUSIVE);		/* and lock it */
	sbp->bp = bp;					/* note the address of the original header */
	sbp->sdno = sd->sdno;				/* note for statistics */
	sbp->driveno = sd->driveno;
	endoffset = bp->b_blkno + sbp->b.b_bcount / DEV_BSIZE; /* final sector offset */
	if (endoffset > sd->sectors) {			/* beyond the end */
		sbp->b.b_bcount -= (endoffset - sd->sectors) * DEV_BSIZE; /* trim */
		if (sbp->b.b_bcount <= 0) {		/* nothing to transfer */
			bp->b_resid = bp->b_bcount;	/* nothing transferred */
			biodone(bp);
			Free(sbp);
			return;
		}
	}
#if VINUMDEBUG
	if (debug & DEBUG_ADDRESSES)
		log(LOG_DEBUG,
		    " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
		    sbp->b.b_flags & B_READ ? "Read" : "Write",
		    major(sbp->b.b_dev),
		    minor(sbp->b.b_dev),
		    sbp->sdno,
		    (u_int) (sbp->b.b_blkno - SD[sbp->sdno].driveoffset),
		    (int) sbp->b.b_blkno,
		    sbp->b.b_bcount);
#endif
	s = splbio();
#if VINUMDEBUG
	if (debug & DEBUG_LASTREQS)
		logrq(loginfo_sdiol, (union rqinfou) &sbp->b, &sbp->b);
#endif
	BUF_STRATEGY(&sbp->b, 0);
	splx(s);
}
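/*
 * Sketch of the chained request header filled in by sdio().  The members
 * are inferred from the assignments above; the real Vinum declaration may
 * contain additional fields.
 */
struct sdbuf {
	struct buf	b;		/* new buffer header sent to the underlying drive */
	struct buf	*bp;		/* original user buffer header, completed in sdio_done() */
	int		sdno;		/* subdisk index, kept for statistics */
	int		driveno;	/* drive on which the subdisk resides */
};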
/* Fill in the struct buf part of a request element. */
enum requeststatus
build_rq_buffer(struct rqelement *rqe, struct plex *plex)
{
	struct sd *sd;					/* point to subdisk */
	struct volume *vol;
	struct buf *bp;
	struct buf *ubp;				/* user (high level) buffer header */

	vol = &VOL[rqe->rqg->rq->volplex.volno];
	sd = &SD[rqe->sdno];				/* point to subdisk */
	bp = &rqe->b;
	ubp = rqe->rqg->rq->bp;				/* pointer to user buffer header */

	/* Initialize the buf struct */
	/* copy these flags from user bp */
	bp->b_flags = ubp->b_flags & (B_ORDERED | B_NOCACHE | B_READ | B_ASYNC);
	bp->b_flags |= B_CALL;				/* inform us when it's done */
	BUF_LOCKINIT(bp);				/* get a lock for the buffer */
	BUF_LOCK(bp, LK_EXCLUSIVE);			/* and lock it */
	bp->b_iodone = complete_rqe;			/* by calling us here */
	/*
	 * You'd think that we wouldn't need to even
	 * build the request buffer for a dead subdisk,
	 * but in some cases we need information like
	 * the user buffer address.  Err on the side of
	 * generosity and supply what we can.  That
	 * obviously doesn't include drive information
	 * when the drive is dead.
	 */
	if ((rqe->flags & XFR_BAD_SUBDISK) == 0) {	/* subdisk is accessible, */
		bp->b_dev = DRIVE[rqe->driveno].dev;	/* drive device */
	}
	bp->b_blkno = rqe->sdoffset + sd->driveoffset;	/* start address */
	bp->b_bcount = rqe->buflen << DEV_BSHIFT;	/* number of bytes to transfer */
	bp->b_resid = bp->b_bcount;			/* and it's still all waiting */
	bp->b_bufsize = bp->b_bcount;			/* and buffer size */
	bp->b_rcred = FSCRED;				/* we have the file system credentials */
	bp->b_wcred = FSCRED;				/* we have the file system credentials */

	if (rqe->flags & XFR_MALLOCED) {		/* this operation requires a malloced buffer */
		bp->b_data = Malloc(bp->b_bcount);	/* get a buffer to put it in */
		if (bp->b_data == NULL) {		/* failed */
			abortrequest(rqe->rqg->rq, ENOMEM);
			return REQUEST_ENOMEM;		/* no memory */
		}
	} else
		/*
		 * Point directly to user buffer data.  This means
		 * that we don't need to do anything when we have
		 * finished the transfer
		 */
		bp->b_data = ubp->b_data + rqe->useroffset * DEV_BSIZE;
	/*
	 * On a recovery read, we perform an XOR of
	 * all blocks to the user buffer.  To make
	 * this work, we first clean out the buffer
	 */
	if ((rqe->flags & (XFR_RECOVERY_READ | XFR_BAD_SUBDISK))
	    == (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) {	/* bad subdisk of a recovery read */
		int length = rqe->grouplen << DEV_BSHIFT; /* and count involved */
		char *data = (char *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* destination */

		bzero(data, length);			/* clean it out */
	}
	return 0;
}
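/*
 * For orientation, a sketch of the request element that build_rq_buffer()
 * operates on, restricted to the members it touches.  Field types and the
 * remaining members are assumptions; the real Vinum declaration differs in
 * detail.
 */
struct rqelement_sketch {
	struct buf	b;		/* buffer header built by build_rq_buffer() */
	struct rqgroup	*rqg;		/* back pointer to the request group / request */
	daddr_t		sdoffset;	/* starting offset within the subdisk */
	int		useroffset;	/* offset of this element's data in the user buffer */
	int		groupoffset;	/* offset of parity/group data within the buffer */
	int		grouplen;	/* length of group (parity) data, in sectors */
	int		buflen;		/* total transfer length, in sectors */
	int		flags;		/* XFR_* transfer flags */
	int		sdno;		/* subdisk number */
	int		driveno;	/* drive number */
};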
/*
 * Convert a vnode strategy call into a device strategy call.  Vnode strategy
 * calls are not limited to device DMA limits so we have to deal with the
 * case.
 *
 * spec_strategy(struct vnode *a_vp, struct bio *a_bio)
 */
static int
devfs_spec_strategy(struct vop_strategy_args *ap)
{
	struct bio *bio = ap->a_bio;
	struct buf *bp = bio->bio_buf;
	struct buf *nbp;
	struct vnode *vp;
	struct mount *mp;
	int chunksize;
	int maxiosize;

	if (bp->b_cmd != BUF_CMD_READ && LIST_FIRST(&bp->b_dep) != NULL)
		buf_start(bp);

	/*
	 * Collect statistics on synchronous and asynchronous read
	 * and write counts for disks that have associated filesystems.
	 */
	vp = ap->a_vp;
	KKASSERT(vp->v_rdev != NULL);	/* XXX */
	if (vn_isdisk(vp, NULL) && (mp = vp->v_rdev->si_mountpoint) != NULL) {
		if (bp->b_cmd == BUF_CMD_READ) {
			if (bp->b_flags & BIO_SYNC)
				mp->mnt_stat.f_syncreads++;
			else
				mp->mnt_stat.f_asyncreads++;
		} else {
			if (bp->b_flags & BIO_SYNC)
				mp->mnt_stat.f_syncwrites++;
			else
				mp->mnt_stat.f_asyncwrites++;
		}
	}

	/*
	 * Device iosize limitations only apply to read and write.  Shortcut
	 * the I/O if it fits.
	 */
	if ((maxiosize = vp->v_rdev->si_iosize_max) == 0) {
		devfs_debug(DEVFS_DEBUG_DEBUG,
			    "%s: si_iosize_max not set!\n",
			    dev_dname(vp->v_rdev));
		maxiosize = MAXPHYS;
	}
#if SPEC_CHAIN_DEBUG & 2
	maxiosize = 4096;
#endif
	if (bp->b_bcount <= maxiosize ||
	    (bp->b_cmd != BUF_CMD_READ && bp->b_cmd != BUF_CMD_WRITE)) {
		dev_dstrategy_chain(vp->v_rdev, bio);
		return (0);
	}

	/*
	 * Clone the buffer and set up an I/O chain to chunk up the I/O.
	 */
	nbp = kmalloc(sizeof(*bp), M_DEVBUF, M_INTWAIT|M_ZERO);
	initbufbio(nbp);
	buf_dep_init(nbp);
	BUF_LOCK(nbp, LK_EXCLUSIVE);
	BUF_KERNPROC(nbp);
	nbp->b_vp = vp;
	nbp->b_flags = B_PAGING | (bp->b_flags & B_BNOCLIP);
	nbp->b_data = bp->b_data;
	nbp->b_bio1.bio_done = devfs_spec_strategy_done;
	nbp->b_bio1.bio_offset = bio->bio_offset;
	nbp->b_bio1.bio_caller_info1.ptr = bio;

	/*
	 * Start the first transfer
	 */
	if (vn_isdisk(vp, NULL))
		chunksize = vp->v_rdev->si_bsize_phys;
	else
		chunksize = DEV_BSIZE;
	chunksize = maxiosize / chunksize * chunksize;
#if SPEC_CHAIN_DEBUG & 1
	devfs_debug(DEVFS_DEBUG_DEBUG,
		    "spec_strategy chained I/O chunksize=%d\n",
		    chunksize);
#endif
	nbp->b_cmd = bp->b_cmd;
	nbp->b_bcount = chunksize;
	nbp->b_bufsize = chunksize;	/* used to detect a short I/O */
	nbp->b_bio1.bio_caller_info2.index = chunksize;

#if SPEC_CHAIN_DEBUG & 1
	devfs_debug(DEVFS_DEBUG_DEBUG,
		    "spec_strategy: chain %p offset %d/%d bcount %d\n",
		    bp, 0, bp->b_bcount, nbp->b_bcount);
#endif
	dev_dstrategy(vp->v_rdev, &nbp->b_bio1);

	if (DEVFS_NODE(vp)) {
		nanotime(&DEVFS_NODE(vp)->atime);
		nanotime(&DEVFS_NODE(vp)->mtime);
	}
	return (0);
}
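/*
 * Worked example of the chunk sizing above (illustrative numbers only):
 * with maxiosize = 131072 and si_bsize_phys = 8192,
 *	chunksize = 131072 / 8192 * 8192 = 131072	(already a multiple)
 * and with a hypothetical 3072-byte physical block size,
 *	chunksize = 131072 / 3072 * 3072 = 129024
 * i.e. the integer division rounds the per-chunk transfer down to the
 * largest multiple of the physical block size that still fits within the
 * device's maximum I/O size, so every chunk except possibly the last is
 * physically aligned.
 */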