/*
 * If there aren't too many streams already, create a new stream.
 * The "blkid" argument is the next block that we expect this stream to access.
 * While we're here, clean up old streams (which haven't been
 * accessed for at least zfetch_min_sec_reap seconds).
 */
static void
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{
	zstream_t *zs;
	zstream_t *zs_next;
	int numstreams = 0;
	uint32_t max_streams;

	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));

	/*
	 * Clean up old streams.
	 */
	for (zs = list_head(&zf->zf_stream);
	    zs != NULL; zs = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs);
		if (((gethrtime() - zs->zs_atime) / NANOSEC) >
		    zfetch_min_sec_reap)
			dmu_zfetch_stream_remove(zf, zs);
		else
			numstreams++;
	}

	/*
	 * The maximum number of streams is normally zfetch_max_streams,
	 * but for small files we lower it such that it's at least possible
	 * for all the streams to be non-overlapping.
	 *
	 * If we are already at the maximum number of streams for this file,
	 * even after removing old streams, then don't create this stream.
	 */
	max_streams = MAX(1, MIN(zfetch_max_streams,
	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
	    zfetch_max_distance));
	if (numstreams >= max_streams) {
		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
		return;
	}

	zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
	zs->zs_blkid = blkid;
	zs->zs_pf_blkid = blkid;
	zs->zs_ipf_blkid = blkid;
	zs->zs_atime = gethrtime();
	mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);

	list_insert_head(&zf->zf_stream, zs);
}
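/*
 * Illustrative sketch, not part of the original source: the max_streams
 * clamp above, evaluated for a hypothetical small file.  The helper name
 * and all constants below are assumptions chosen for the example, standing
 * in for the dnode fields and the zfetch_max_streams/zfetch_max_distance
 * tunables.
 */
static uint32_t
dmu_zfetch_example_max_streams(void)
{
	uint64_t maxblkid = 7;			/* hypothetical 8-block file */
	uint64_t datablksz = 128 * 1024;	/* hypothetical 128 KB blocks */
	uint64_t max_distance = 8 * 1024 * 1024; /* hypothetical 8 MB horizon */
	uint32_t max_streams = 8;		/* hypothetical stream cap */

	/*
	 * 7 * 131072 / 8388608 == 0: the file holds less data than one
	 * prefetch distance, so MIN() yields 0 and MAX() clamps it back
	 * to 1, guaranteeing at least one stream per file.
	 */
	return (MAX(1, MIN(max_streams, maxblkid * datablksz / max_distance)));
}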
/*
 * This is the predictive prefetch entry point.  It associates the dnode
 * access specified by the blkid and nblks arguments with a prefetch stream,
 * predicts further accesses based on that stream's stats, and initiates
 * speculative prefetch.  The fetch_data argument specifies whether actual
 * data blocks should be fetched:
 *   FALSE -- prefetch only indirect blocks for predicted data blocks;
 *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
 */
void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
{
	zstream_t *zs;
	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
	int64_t pf_ahead_blks, max_blks;
	int epbs, max_dist_blks, pf_nblks, ipf_nblks;
	uint64_t end_of_access_blkid = blkid + nblks;

	if (zfs_prefetch_disable)
		return;

	/*
	 * As a fast path for small (single-block) files, ignore access
	 * to the first block.
	 */
	if (blkid == 0)
		return;

	rw_enter(&zf->zf_rwlock, RW_READER);

	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid == zs->zs_blkid) {
			mutex_enter(&zs->zs_lock);
			/*
			 * zs_blkid could have changed before we
			 * acquired zs_lock; re-check it here.
			 */
			if (blkid != zs->zs_blkid) {
				mutex_exit(&zs->zs_lock);
				continue;
			}
			break;
		}
	}

	if (zs == NULL) {
		/*
		 * This access is not part of any existing stream.  Create
		 * a new stream for it.
		 */
		ZFETCHSTAT_BUMP(zfetchstat_misses);
		if (rw_tryupgrade(&zf->zf_rwlock))
			dmu_zfetch_stream_create(zf, end_of_access_blkid);
		rw_exit(&zf->zf_rwlock);
		return;
	}

	/*
	 * This access was to a block that we issued a prefetch for on
	 * behalf of this stream.  Issue further prefetches for this stream.
	 *
	 * Normally, we start prefetching where we stopped
	 * prefetching last (zs_pf_blkid).  But when we get our first
	 * hit on this stream, zs_pf_blkid == zs_blkid, we don't
	 * want to prefetch the block we just accessed.  In this case,
	 * start just after the block we just accessed.
	 */
	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);

	/*
	 * Double our amount of prefetched data, but don't let the
	 * prefetch get further ahead than zfetch_max_distance.
	 */
	if (fetch_data) {
		max_dist_blks =
		    zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
		/*
		 * Previously, we were (zs_pf_blkid - blkid) ahead.  We
		 * want to now be double that, so read that amount again,
		 * plus the amount we are catching up by (i.e. the amount
		 * read just now).
		 */
		pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
		max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
		pf_nblks = MIN(pf_ahead_blks, max_blks);
	} else {
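/*
 * Worked example with hypothetical block numbers, not from the original
 * source, for the pf_start choice above: on the first hit for a stream,
 * zs_pf_blkid == zs_blkid, say both 10.  An access of nblks == 2 gives
 * end_of_access_blkid == 12, so pf_start = MAX(10, 12) == 12 and
 * prefetch begins just past the blocks already read.  In steady state,
 * with zs_pf_blkid == 20, pf_start = MAX(20, 12) == 20, resuming where
 * the previous prefetch stopped.
 */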
/*
 * This is the predictive prefetch entry point.  It associates the dnode
 * access specified by the blkid and nblks arguments with a prefetch stream,
 * predicts further accesses based on that stream's stats, and initiates
 * speculative prefetch.  The fetch_data argument specifies whether actual
 * data blocks should be fetched:
 *   FALSE -- prefetch only indirect blocks for predicted data blocks;
 *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
 */
void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
{
	zstream_t *zs;
	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
	int64_t pf_ahead_blks, max_blks;
	int epbs, max_dist_blks, pf_nblks, ipf_nblks;
	uint64_t end_of_access_blkid = blkid + nblks;
	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;

	if (zfs_prefetch_disable)
		return;

	/*
	 * If we haven't yet loaded the indirect vdevs' mappings, we
	 * can only read from blocks that we carefully ensure are on
	 * concrete vdevs (or previously-loaded indirect vdevs).  So we
	 * can't allow the predictive prefetcher to attempt reads of other
	 * blocks (e.g. of the MOS's dnode object).
	 */
	if (!spa_indirect_vdevs_loaded(spa))
		return;

	/*
	 * As a fast path for small (single-block) files, ignore access
	 * to the first block.
	 */
	if (blkid == 0)
		return;

	rw_enter(&zf->zf_rwlock, RW_READER);

	/*
	 * Find a matching prefetch stream.  Depending on whether the accesses
	 * are block-aligned, the first block of the new access may either
	 * follow the last block of the previous access, or be equal to it.
	 */
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
			mutex_enter(&zs->zs_lock);
			/*
			 * zs_blkid could have changed before we
			 * acquired zs_lock; re-check it here.
			 */
			if (blkid == zs->zs_blkid) {
				break;
			} else if (blkid + 1 == zs->zs_blkid) {
				blkid++;
				nblks--;
				if (nblks == 0) {
					/* Already prefetched this before. */
					mutex_exit(&zs->zs_lock);
					rw_exit(&zf->zf_rwlock);
					return;
				}
				break;
			}
			mutex_exit(&zs->zs_lock);
		}
	}

	if (zs == NULL) {
		/*
		 * This access is not part of any existing stream.  Create
		 * a new stream for it.
		 */
		ZFETCHSTAT_BUMP(zfetchstat_misses);
		if (rw_tryupgrade(&zf->zf_rwlock))
			dmu_zfetch_stream_create(zf, end_of_access_blkid);
		rw_exit(&zf->zf_rwlock);
		return;
	}

	/*
	 * This access was to a block that we issued a prefetch for on
	 * behalf of this stream.  Issue further prefetches for this stream.
	 *
	 * Normally, we start prefetching where we stopped
	 * prefetching last (zs_pf_blkid).  But when we get our first
	 * hit on this stream, zs_pf_blkid == zs_blkid, we don't
	 * want to prefetch the block we just accessed.  In this case,
	 * start just after the block we just accessed.
	 */
	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);

	/*
	 * Double our amount of prefetched data, but don't let the
	 * prefetch get further ahead than zfetch_max_distance.
	 */
	if (fetch_data) {
		max_dist_blks =
		    zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
		/*
		 * Previously, we were (zs_pf_blkid - blkid) ahead.  We
		 * want to now be double that, so read that amount again,
		 * plus the amount we are catching up by (i.e. the amount
		 * read just now).
		 */
		pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
		max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
		pf_nblks = MIN(pf_ahead_blks, max_blks);
	} else {
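/*
 * Worked example with hypothetical numbers, not from the original source,
 * for the distance-doubling math above: with blkid == 100, nblks == 4,
 * and zs_pf_blkid == 116, the stream was 16 blocks ahead.
 * pf_ahead_blks = (116 - 100) + 4 = 20, so prefetch covers blocks 116
 * through 135 and the stream ends up 136 - 104 = 32 blocks ahead of the
 * access: the lead doubles from 16 to 32, unless max_blks (derived from
 * zfetch_max_distance) caps it first.
 */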
/*
 * This is the prefetch entry point.  It calls all of the other dmu_zfetch
 * routines to create, delete, find, or operate upon prefetch streams.
 */
void
dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
{
	zstream_t zst;
	zstream_t *newstream;
	boolean_t fetched;
	int inserted;
	unsigned int blkshft;
	uint64_t blksz;

	if (zfs_prefetch_disable)
		return;

	/* files that aren't ln2 blocksz are only one block -- nothing to do */
	if (!zf->zf_dnode->dn_datablkshift)
		return;

	/* convert offset and size into blockid and nblocks */
	blkshft = zf->zf_dnode->dn_datablkshift;
	blksz = (1 << blkshft);

	bzero(&zst, sizeof (zstream_t));
	zst.zst_offset = offset >> blkshft;
	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
	    P2ALIGN(offset, blksz)) >> blkshft;

	fetched = dmu_zfetch_find(zf, &zst, prefetched);
	if (fetched) {
		ZFETCHSTAT_BUMP(zfetchstat_hits);
	} else {
		ZFETCHSTAT_BUMP(zfetchstat_misses);
		if ((fetched = dmu_zfetch_colinear(zf, &zst))) {
			ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
		} else {
			ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
		}
	}

	if (!fetched) {
		newstream = dmu_zfetch_stream_reclaim(zf);

		/*
		 * we still couldn't find a stream, drop the lock, and allocate
		 * one if possible.  Otherwise, give up and go home.
		 */
		if (newstream) {
			ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes);
		} else {
			uint64_t	maxblocks;
			uint32_t	max_streams;
			uint32_t	cur_streams;

			ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures);
			cur_streams = zf->zf_stream_cnt;
			maxblocks = zf->zf_dnode->dn_maxblkid;
			max_streams = MIN(zfetch_max_streams,
			    (maxblocks / zfetch_block_cap));
			if (max_streams == 0) {
				max_streams++;
			}

			if (cur_streams >= max_streams) {
				return;
			}
			newstream =
			    kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
		}

		newstream->zst_offset = zst.zst_offset;
		newstream->zst_len = zst.zst_len;
		newstream->zst_stride = zst.zst_len;
		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
		newstream->zst_cap = zst.zst_len;
		newstream->zst_direction = ZFETCH_FORWARD;
		newstream->zst_last = ddi_get_lbolt();

		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);

		rw_enter(&zf->zf_rwlock, RW_WRITER);
		inserted = dmu_zfetch_stream_insert(zf, newstream);
		rw_exit(&zf->zf_rwlock);

		if (!inserted) {
			mutex_destroy(&newstream->zst_lock);
			kmem_free(newstream, sizeof (zstream_t));
		}
	}
}
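/*
 * Illustrative sketch, not part of the original source: the offset/size
 * to blockid/nblocks conversion above.  The helper name and values are
 * assumptions; P2ALIGN rounds the start down to a block boundary and
 * P2ROUNDUP rounds the end up, so a read counts every block it touches.
 */
static void
dmu_zfetch_example_block_conversion(void)
{
	uint64_t offset = 1000;		/* hypothetical byte offset */
	uint64_t size = 5000;		/* hypothetical byte count */
	unsigned int blkshft = 12;	/* hypothetical 4 KB blocks */
	uint64_t blksz = (1 << blkshft);
	uint64_t zst_offset, zst_len;

	zst_offset = offset >> blkshft;	/* 1000 >> 12 == block 0 */

	/* P2ROUNDUP(6000, 4096) == 8192, P2ALIGN(1000, 4096) == 0: len 2. */
	zst_len = (P2ROUNDUP(offset + size, blksz) -
	    P2ALIGN(offset, blksz)) >> blkshft;

	ASSERT(zst_offset == 0 && zst_len == 2);
}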
/*
 * given a zfetch and a zstream structure, see if there is an associated zstream
 * for this block read.  If so, it starts a prefetch for the stream it
 * located and returns true, otherwise it returns false
 */
static boolean_t
dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
{
	zstream_t *zs;
	int64_t diff;
	int reset = !prefetched;
	int rc = 0;

	if (zh == NULL)
		return (0);

	/*
	 * XXX: This locking strategy is a bit coarse; however, its impact has
	 * yet to be tested.  If this turns out to be an issue, it can be
	 * modified in a number of different ways.
	 */

	rw_enter(&zf->zf_rwlock, RW_READER);
top:

	for (zs = list_head(&zf->zf_stream); zs;
	    zs = list_next(&zf->zf_stream, zs)) {

		/*
		 * XXX - should this be an assert?
		 */
		if (zs->zst_len == 0) {
			/* bogus stream */
			ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
			continue;
		}

		/*
		 * We hit this case when we are in a strided prefetch stream:
		 * we will read "len" blocks before "striding".
		 */
		if (zh->zst_offset >= zs->zst_offset &&
		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
			if (prefetched) {
				/* already fetched */
				ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
				rc = 1;
				goto out;
			} else {
				ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
			}
		}

		/*
		 * This is the forward sequential read case: we increment
		 * len by one each time we hit here, so we will enter this
		 * case on every read.
		 */
		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {

			reset = !prefetched && zs->zst_len > 1;

			mutex_enter(&zs->zst_lock);

			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}
			zs->zst_len += zh->zst_len;
			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_offset += diff;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : 0;
			}
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		/*
		 * Same as above, but reading backwards through the file.
		 */
		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
			/* backwards sequential access */

			reset = !prefetched && zs->zst_len > 1;

			mutex_enter(&zs->zst_lock);

			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset = zs->zst_offset > zh->zst_len ?
			    zs->zst_offset - zh->zst_len : 0;
			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
			    zs->zst_ph_offset - zh->zst_len : 0;
			zs->zst_len += zh->zst_len;

			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
				    zs->zst_ph_offset - diff : 0;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : zs->zst_len;
			}
			zs->zst_direction = ZFETCH_BACKWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided forward access */

			mutex_enter(&zs->zst_lock);

			if ((zh->zst_offset - zs->zst_offset -
			    zs->zst_stride >= zs->zst_len) ||
			    (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset += zs->zst_stride;
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided reverse access */

			mutex_enter(&zs->zst_lock);

			if ((zh->zst_offset - zs->zst_offset +
			    zs->zst_stride >= zs->zst_len) ||
			    (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
			    zs->zst_offset - zs->zst_stride : 0;
			zs->zst_ph_offset =
			    (zs->zst_ph_offset > (2 * zs->zst_stride)) ?
			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
			zs->zst_direction = ZFETCH_BACKWARD;

			break;
		}
	}

	if (zs) {
		if (reset) {
			zstream_t *remove = zs;

			ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
			rc = 0;
			mutex_exit(&zs->zst_lock);
			rw_exit(&zf->zf_rwlock);
			rw_enter(&zf->zf_rwlock, RW_WRITER);
			/*
			 * Relocate the stream, in case someone removes
			 * it while we were acquiring the WRITER lock.
			 */
			for (zs = list_head(&zf->zf_stream); zs;
			    zs = list_next(&zf->zf_stream, zs)) {
				if (zs == remove) {
					dmu_zfetch_stream_remove(zf, zs);
					mutex_destroy(&zs->zst_lock);
					kmem_free(zs, sizeof (zstream_t));
					break;
				}
			}
		} else {
			ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
			rc = 1;
			dmu_zfetch_dofetch(zf, zs);
			mutex_exit(&zs->zst_lock);
		}
	}
out:
	rw_exit(&zf->zf_rwlock);

	return (rc);
}
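/*
 * Illustrative sketch, not part of the original source: the strided
 * forward test above, with hypothetical stream state.  A workload that
 * reads 4 blocks out of every 16 settles into zst_len == 4 and
 * zst_stride == 16; the next burst matches when it lands within zst_len
 * blocks of zst_offset + zst_stride.
 */
static boolean_t
dmu_zfetch_example_stride_hit(void)
{
	uint64_t zst_offset = 100;	/* hypothetical stream position */
	uint64_t zst_len = 4;		/* hypothetical burst length */
	uint64_t zst_stride = 16;	/* hypothetical stride */
	uint64_t zh_offset = 116;	/* hypothetical next access */

	/* 116 - 100 - 16 == 0, which is < 4, so this is a stride hit. */
	if (zh_offset - zst_offset - zst_stride < zst_len &&
	    zst_len != zst_stride) {
		zst_offset += zst_stride;	/* stream advances to 116 */
		return (B_TRUE);
	}
	return (B_FALSE);
}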