Example #1
/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
static struct buf *
cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
    daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
{
	struct bufobj *bo;
	struct buf *bp, *tbp;
	daddr_t bn;
	off_t off;
	long tinc, tsize;
	int i, inc, j, toff;

	KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
	    ("cluster_rbuild: size %ld != filesize %jd\n",
	    size, (intmax_t)vp->v_mount->mnt_stat.f_iosize));

	/*
	 * Clip the run so the cluster does not extend past the end of
	 * the file; the multiply in a loop avoids a division.
	 */
	while ((u_quad_t) size * (lbn + run) > filesize) {
		--run;
	}

	if (fbp) {
		tbp = fbp;
		tbp->b_iocmd = BIO_READ; 
	} else {
		tbp = getblk(vp, lbn, size, 0, 0, gbflags);
		if (tbp->b_flags & B_CACHE)
			return tbp;
		tbp->b_flags |= B_ASYNC | B_RAM;
		tbp->b_iocmd = BIO_READ;
	}
	tbp->b_blkno = blkno;
	if ((tbp->b_flags & B_MALLOC) ||
	    (tbp->b_flags & B_VMIO) == 0 || (run <= 1))
		return tbp;

	bp = trypbuf(&cluster_pbuf_freecnt);
	if (bp == NULL)
		return tbp;

	/*
	 * We are synthesizing a buffer out of vm_page_t's, but
	 * if the block size is not page aligned then the starting
	 * address may not be either.  Inherit the b_data offset
	 * from the original buffer.
	 */
	bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
	if ((gbflags & GB_UNMAPPED) != 0) {
		bp->b_flags |= B_UNMAPPED;
		bp->b_data = unmapped_buf;
	} else {
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
	}
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	bp->b_offset = tbp->b_offset;
	KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	inc = btodb(size);
	bo = &vp->v_bufobj;
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			if ((bp->b_npages * PAGE_SIZE) +
			    round_page(size) > vp->v_mount->mnt_iosize_max) {
				break;
			}

			tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT |
			    (gbflags & GB_UNMAPPED));

			/* Don't wait around for locked bufs. */
			if (tbp == NULL)
				break;

			/*
			 * Stop scanning if the buffer is fully valid
			 * (marked B_CACHE), or locked (may be doing a
			 * background write), or if the buffer is not
			 * VMIO backed.  The clustering code can only deal
			 * with VMIO-backed buffers.
			 */
			BO_LOCK(bo);
			if ((tbp->b_vflags & BV_BKGRDINPROG) ||
			    (tbp->b_flags & B_CACHE) ||
			    (tbp->b_flags & B_VMIO) == 0) {
				BO_UNLOCK(bo);
				bqrelse(tbp);
				break;
			}
			BO_UNLOCK(bo);

			/*
			 * The buffer must be completely invalid in order to
			 * take part in the cluster.  If it is partially valid
			 * then we stop.
			 */
			off = tbp->b_offset;
			tsize = size;
			VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
			for (j = 0; tsize > 0; j++) {
				toff = off & PAGE_MASK;
				tinc = tsize;
				if (toff + tinc > PAGE_SIZE)
					tinc = PAGE_SIZE - toff;
				VM_OBJECT_LOCK_ASSERT(tbp->b_pages[j]->object,
				    MA_OWNED);
				if ((tbp->b_pages[j]->valid &
				    vm_page_bits(toff, tinc)) != 0)
					break;
				off += tinc;
				tsize -= tinc;
			}
			VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
			if (tsize > 0) {
				bqrelse(tbp);
				break;
			}

			/*
			 * Set a read-ahead mark as appropriate
			 */
			if ((fbp && (i == 1)) || (i == (run - 1)))
				tbp->b_flags |= B_RAM;

			/*
			 * Set the buffer up for an async read (XXX should
			 * we do this only if we do not wind up brelse()ing?).
			 * Set the block number if it isn't set yet; if it is
			 * already set, make sure it matches the block number
			 * we expect.
			 */
			tbp->b_flags |= B_ASYNC;
			tbp->b_iocmd = BIO_READ;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				brelse(tbp);
				break;
			}
		}
		/*
		 * XXX fbp from caller may not be B_ASYNC, but we are going
		 * to biodone() it in cluster_callback() anyway
		 */
		BUF_KERNPROC(tbp);
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			tbp, b_cluster.cluster_entry);
		VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			vm_page_io_start(m);
			vm_object_pip_add(m->object, 1);
			if ((bp->b_npages == 0) ||
				(bp->b_pages[bp->b_npages-1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
			if (m->valid == VM_PAGE_BITS_ALL)
				tbp->b_pages[j] = bogus_page;
		}
		VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
		/*
		 * Don't inherit tbp->b_bufsize as it may be larger due to
		 * a non-page-aligned size.  Instead just aggregate using
		 * 'size'.
		 */
		if (tbp->b_bcount != size)
			printf("warning: tbp->b_bcount wrong %ld vs %ld\n",
			    tbp->b_bcount, size);
		if (tbp->b_bufsize != size)
			printf("warning: tbp->b_bufsize wrong %ld vs %ld\n",
			    tbp->b_bufsize, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;
	}

	/*
	 * Fully valid pages in the cluster are already good and do not need
	 * to be re-read from disk.  Replace such pages with bogus_page.
	 */
	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
	for (j = 0; j < bp->b_npages; j++) {
		VM_OBJECT_LOCK_ASSERT(bp->b_pages[j]->object, MA_OWNED);
		if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL)
			bp->b_pages[j] = bogus_page;
	}
	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
	if (bp->b_bufsize > bp->b_kvasize)
		panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
		    bp->b_bufsize, bp->b_kvasize);
	bp->b_kvasize = bp->b_bufsize;

	if ((bp->b_flags & B_UNMAPPED) == 0) {
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		    (vm_page_t *)bp->b_pages, bp->b_npages);
	}
	return (bp);
}
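
The run-clipping loop near the top of cluster_rbuild() deserves a note: rather than dividing the file size by the block size, it simply decrements run until the byte just past the last block of the cluster, (lbn + run) * size, no longer lies beyond the end of the file. The standalone sketch below mirrors that arithmetic; the helper name clip_run_to_eof() and the plain C99 integer types are illustrative assumptions, not part of the kernel API.

#include <stdint.h>

/*
 * Sketch of the "avoid a division" loop in cluster_rbuild().
 * Assuming lbn itself is within the file, this is equivalent to
 * clamping run to filesize / size - lbn (integer division), but
 * without performing the division.
 */
static int
clip_run_to_eof(uint64_t filesize, uint64_t lbn, long size, int run)
{
	while ((uint64_t)size * (lbn + run) > filesize)
		--run;
	return (run);
}

For example, with size = 16384, filesize = 100000 and lbn = 4, a requested run of 5 is trimmed to 2: blocks 4 and 5 end at byte 98304, while including block 6 would push the cluster to byte 114688, past the end of the file.
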
/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_GETPAGES.
 *
 * With all the caching that local media devices do these days, there is
 * really very little point in attempting to restrict the I/O size to
 * contiguous blocks on-disk, especially if our caller thinks we need all
 * the specified pages.  Just construct and issue a READ.
 */
int
vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *mpp, int bytecount,
			     int reqpage, int seqaccess)
{
	struct iovec aiov;
	struct uio auio;
	off_t foff;
	int error;
	int count;
	int i;
	int ioflags;

	/*
	 * Do not do anything if the vnode is bad.
	 */
	if (vp->v_mount == NULL)
		return VM_PAGER_BAD;

	/*
	 * Calculate the number of pages.  Since we are paging in whole
	 * pages, adjust bytecount to be an integral multiple of the page
	 * size.  It will be clipped to the file EOF later on.
	 */
	bytecount = round_page(bytecount);
	count = bytecount / PAGE_SIZE;

	/*
	 * We could check mpp[reqpage]->valid here and shortcut the operation,
	 * but doing so breaks read-ahead.  Instead assume that the VM
	 * system has already done at least the check, don't worry about
	 * any races, and issue the VOP_READ to allow read-ahead to function.
	 *
	 * This keeps the pipeline full for I/O-bound, sequentially scanned
	 * mmap()'s.
	 */
	/* don't shortcut */

	/*
	 * Discard pages past the file EOF.  If the requested page is past
	 * the file EOF we just leave its valid bits set to 0, the caller
	 * expects to maintain ownership of the requested page.  If the
	 * entire range is past file EOF discard everything and generate
	 * a pagein error.
	 */
	foff = IDX_TO_OFF(mpp[0]->pindex);
	if (foff >= vp->v_filesize) {
		for (i = 0; i < count; i++) {
			if (i != reqpage)
				vnode_pager_freepage(mpp[i]);
		}
		return VM_PAGER_ERROR;
	}

	if (foff + bytecount > vp->v_filesize) {
		bytecount = vp->v_filesize - foff;
		i = round_page(bytecount) / PAGE_SIZE;
		while (count > i) {
			--count;
			if (count != reqpage)
				vnode_pager_freepage(mpp[count]);
		}
	}

	/*
	 * The size of the transfer is bytecount.  bytecount will be an
	 * integral multiple of the page size unless it has been clipped
	 * to the file EOF.  The transfer cannot exceed the file EOF.
	 *
	 * When dealing with real devices we must round-up to the device
	 * sector size.
	 */
	if (vp->v_type == VBLK || vp->v_type == VCHR) {
		int secmask = vp->v_rdev->si_bsize_phys - 1;
		KASSERT(secmask < PAGE_SIZE, ("vnode_pager_generic_getpages: sector size %d too large", secmask + 1));
		bytecount = (bytecount + secmask) & ~secmask;
	}

	/*
	 * Severe hack to avoid deadlocks with the buffer cache
	 */
	for (i = 0; i < count; ++i) {
		vm_page_t mt = mpp[i];

		vm_page_io_start(mt);
		vm_page_wakeup(mt);
	}

	/*
	 * Issue the I/O with some read-ahead if bytecount > PAGE_SIZE
	 */
	ioflags = IO_VMIO;
	if (seqaccess)
		ioflags |= IO_SEQMAX << IO_SEQSHIFT;

	aiov.iov_base = NULL;
	aiov.iov_len = bytecount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = foff;
	auio.uio_segflg = UIO_NOCOPY;
	auio.uio_rw = UIO_READ;
	auio.uio_resid = bytecount;
	auio.uio_td = NULL;
	mycpu->gd_cnt.v_vnodein++;
	mycpu->gd_cnt.v_vnodepgsin += count;

	error = VOP_READ(vp, &auio, ioflags, proc0.p_ucred);

	/*
	 * Severe hack to avoid deadlocks with the buffer cache
	 */
	for (i = 0; i < count; ++i) {
		vm_page_busy_wait(mpp[i], FALSE, "getpgs");
		vm_page_io_finish(mpp[i]);
	}

	/*
	 * Calculate the actual number of bytes read and clean up the
	 * page list.  
	 */
	bytecount -= auio.uio_resid;

	for (i = 0; i < count; ++i) {
		vm_page_t mt = mpp[i];

		if (i != reqpage) {
			if (error == 0 && mt->valid) {
				if (mt->flags & PG_REFERENCED)
					vm_page_activate(mt);
				else
					vm_page_deactivate(mt);
				vm_page_wakeup(mt);
			} else {
				vnode_pager_freepage(mt);
			}
		} else if (mt->valid == 0) {
			if (error == 0) {
				kprintf("page failed but no I/O error page "
					"%p object %p pindex %d\n",
					mt, mt->object, (int) mt->pindex);
				/* whoops, something happened */
				error = EINVAL;
			}
		} else if (mt->valid != VM_PAGE_BITS_ALL) {
			/*
			 * Zero-extend the requested page if necessary (if
			 * the filesystem is using a small block size).
			 */
			vm_page_zero_invalid(mt, TRUE);
		}
	}
	if (error) {
		kprintf("vnode_pager_getpage: I/O read error\n");
	}
	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
}
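
The size adjustments near the top of vnode_pager_generic_getpages() combine three steps: round the request up to whole pages, clip it at the file EOF, and, for block and character devices, round the clipped length back up to the physical sector size so the driver never sees a partial sector. The standalone sketch below mirrors that arithmetic; the name clip_getpages_request(), the fixed 4 KB PAGE_SIZE, and the parameter types are assumptions for illustration, not the kernel interfaces.

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE	4096
#define round_page(x)	(((x) + PAGE_SIZE - 1) & ~((size_t)PAGE_SIZE - 1))

/*
 * Sketch of the transfer-size clipping in vnode_pager_generic_getpages().
 * The caller is assumed to have already rejected requests starting at or
 * beyond EOF, so foff < filesize holds here.  secsize is the physical
 * sector size of the underlying device, or 1 for anything that is not a
 * raw device.
 */
static size_t
clip_getpages_request(size_t bytecount, uint64_t foff, uint64_t filesize,
    size_t secsize)
{
	/* Page in whole pages; the tail is clipped to EOF just below. */
	bytecount = round_page(bytecount);

	/* Never read past the end of the file. */
	if (foff + bytecount > filesize)
		bytecount = filesize - foff;

	/* Raw devices cannot transfer partial sectors; round back up. */
	if (secsize > 1) {
		size_t secmask = secsize - 1;
		bytecount = (bytecount + secmask) & ~secmask;
	}
	return (bytecount);
}

The real function also recomputes the page count from the clipped length and releases the pages that now fall past EOF; the sketch covers only the byte-length arithmetic.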