static inline void
run_gen_bench_impl(const char *impl)
{
	int fn, ncols;
	uint64_t ds, iter_cnt, iter, disksize;
	hrtime_t start;
	double elapsed, d_bw;

	/* Benchmark generate functions */
	for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
		for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) {
			/* create suitable raidz_map */
			ncols = rto_opts.rto_dcols + fn + 1;
			zio_bench.io_size = 1ULL << ds;
			rm_bench = vdev_raidz_map_alloc(&zio_bench,
			    BENCH_ASHIFT, ncols, fn + 1);

			/* estimate iteration count */
			iter_cnt = GEN_BENCH_MEMORY;
			iter_cnt /= zio_bench.io_size;

			start = gethrtime();
			for (iter = 0; iter < iter_cnt; iter++)
				vdev_raidz_generate_parity(rm_bench);
			elapsed = NSEC2SEC((double)(gethrtime() - start));

			/* per-disk bandwidth, in MiB/s */
			disksize = (1ULL << ds) / rto_opts.rto_dcols;
			d_bw = (double)iter_cnt * (double)disksize;
			d_bw /= (1024.0 * 1024.0 * elapsed);

			LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n",
			    impl,
			    raidz_gen_name[fn],
			    rto_opts.rto_dcols,
			    (1ULL << ds),
			    d_bw,
			    d_bw * (double)ncols,
			    (unsigned)iter_cnt);

			vdev_raidz_map_free(rm_bench);
		}
	}
}
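/*
 * Worked example of the disk_bw/total_bw figures logged above, written as
 * a standalone program. Every input value here is made up purely for
 * illustration; only the arithmetic mirrors run_gen_bench_impl().
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t dcols = 8;		/* assumed data columns */
	uint64_t io_size = 1ULL << 20;	/* assumed 1 MiB raidz block */
	uint64_t iter_cnt = 512;	/* assumed iteration count */
	double elapsed = 0.25;		/* assumed elapsed seconds */

	/* each data disk contributes io_size / dcols bytes per block */
	uint64_t disksize = io_size / dcols;

	/* per-disk bandwidth in MiB/s, as computed in the benchmark */
	double d_bw = (double)iter_cnt * (double)disksize /
	    (1024.0 * 1024.0 * elapsed);

	/* total_bw scales the per-disk figure by the column count */
	printf("disk_bw = %lf MiB/s, total_bw = %lf MiB/s\n",
	    d_bw, d_bw * (double)(dcols + 1));	/* +1 parity column (P) */
	return (0);
}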
static void
run_rec_bench_impl(const char *impl)
{
	int fn, ncols, nbad;
	uint64_t ds, iter_cnt, iter, disksize;
	hrtime_t start;
	double elapsed, d_bw;
	static const int tgt[7][3] = {
		{1, 2, 3},	/* rec_p:   bad QR & D[0] */
		{0, 2, 3},	/* rec_q:   bad PR & D[0] */
		{0, 1, 3},	/* rec_r:   bad PQ & D[0] */
		{2, 3, 4},	/* rec_pq:  bad R  & D[0][1] */
		{1, 3, 4},	/* rec_pr:  bad Q  & D[0][1] */
		{0, 3, 4},	/* rec_qr:  bad P  & D[0][1] */
		{3, 4, 5}	/* rec_pqr: bad D[0][1][2] */
	};

	for (fn = 0; fn < RAIDZ_REC_NUM; fn++) {
		for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) {
			/* create suitable raidz_map */
			ncols = rto_opts.rto_dcols + PARITY_PQR;
			zio_bench.io_size = 1ULL << ds;

			/*
			 * raidz block is too short to test
			 * the requested method
			 */
			if (zio_bench.io_size / rto_opts.rto_dcols <
			    (1ULL << BENCH_ASHIFT))
				continue;

			rm_bench = vdev_raidz_map_alloc(&zio_bench,
			    BENCH_ASHIFT, ncols, PARITY_PQR);

			/* estimate iteration count */
			iter_cnt = REC_BENCH_MEMORY;
			iter_cnt /= zio_bench.io_size;

			/* calculate how many bad columns there are */
			nbad = MIN(3, raidz_ncols(rm_bench) -
			    raidz_parity(rm_bench));

			start = gethrtime();
			for (iter = 0; iter < iter_cnt; iter++)
				vdev_raidz_reconstruct(rm_bench, tgt[fn],
				    nbad);
			elapsed = NSEC2SEC((double)(gethrtime() - start));

			/* per-disk bandwidth, in MiB/s */
			disksize = (1ULL << ds) / rto_opts.rto_dcols;
			d_bw = (double)iter_cnt * (double)disksize;
			d_bw /= (1024.0 * 1024.0 * elapsed);

			LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n",
			    impl,
			    raidz_rec_name[fn],
			    rto_opts.rto_dcols,
			    (1ULL << ds),
			    d_bw,
			    d_bw * (double)ncols,
			    (unsigned)iter_cnt);

			vdev_raidz_map_free(rm_bench);
		}
	}
}
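/*
 * A plausible top-level driver for the two benchmarks above: iterate over
 * the compiled-in parity-math implementations, activate each one, and run
 * both the generation and reconstruction passes against it. This is a
 * sketch only; the function name run_raidz_benchmarks() is hypothetical,
 * and it assumes the harness exposes a NULL-terminated raidz_impl_names
 * array and vdev_raidz_impl_set() in these shapes.
 */
static void
run_raidz_benchmarks(void)
{
	char **impl_name;

	LOG(D_ALL, "impl, math, dcols, iosize, disk_bw, total_bw, iter\n");

	for (impl_name = (char **)raidz_impl_names; *impl_name != NULL;
	    impl_name++) {
		/* skip implementations not supported on this CPU */
		if (vdev_raidz_impl_set(*impl_name) != 0)
			continue;

		run_gen_bench_impl(*impl_name);
		run_rec_bench_impl(*impl_name);
	}
}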
static void
mmp_thread(void *arg)
{
	spa_t *spa = (spa_t *)arg;
	mmp_thread_t *mmp = &spa->spa_mmp;
	boolean_t last_spa_suspended = spa_suspended(spa);
	boolean_t last_spa_multihost = spa_multihost(spa);
	callb_cpr_t cpr;
	hrtime_t max_fail_ns = zfs_multihost_fail_intervals *
	    MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));

	mmp_thread_enter(mmp, &cpr);

	/*
	 * The mmp_write_done() function calculates mmp_delay based on the
	 * prior value of mmp_delay and the elapsed time since the last write.
	 * For the first mmp write there is no "last write", so we start
	 * with fake but reasonable default non-zero values.
	 */
	mmp->mmp_delay = MSEC2NSEC(MAX(zfs_multihost_interval,
	    MMP_MIN_INTERVAL)) / MAX(vdev_count_leaves(spa), 1);
	mmp->mmp_last_write = gethrtime() - mmp->mmp_delay;

	while (!mmp->mmp_thread_exiting) {
		uint64_t mmp_fail_intervals = zfs_multihost_fail_intervals;
		uint64_t mmp_interval = MSEC2NSEC(
		    MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
		boolean_t suspended = spa_suspended(spa);
		boolean_t multihost = spa_multihost(spa);
		hrtime_t start, next_time;

		start = gethrtime();
		if (multihost) {
			next_time = start + mmp_interval /
			    MAX(vdev_count_leaves(spa), 1);
		} else {
			next_time = start + MSEC2NSEC(MMP_DEFAULT_INTERVAL);
		}

		/*
		 * When MMP goes off => on, or spa goes suspended =>
		 * !suspended, we know no writes occurred recently.  We
		 * update mmp_last_write to give us some time to try.
		 */
		if ((!last_spa_multihost && multihost) ||
		    (last_spa_suspended && !suspended)) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_last_write = gethrtime();
			mutex_exit(&mmp->mmp_io_lock);
		} else if (last_spa_multihost && !multihost) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_delay = 0;
			mutex_exit(&mmp->mmp_io_lock);
		}
		last_spa_multihost = multihost;
		last_spa_suspended = suspended;

		/*
		 * Smooth max_fail_ns when its factors are decreased, because
		 * making (max_fail_ns < mmp_interval) results in the pool
		 * being immediately suspended before writes can occur at the
		 * new higher frequency.
		 */
		if ((mmp_interval * mmp_fail_intervals) < max_fail_ns) {
			max_fail_ns = ((31 * max_fail_ns) +
			    (mmp_interval * mmp_fail_intervals)) / 32;
		} else {
			max_fail_ns = mmp_interval * mmp_fail_intervals;
		}

		/*
		 * Suspend the pool if no MMP write has succeeded in over
		 * mmp_interval * mmp_fail_intervals nanoseconds.
		 */
		if (!suspended && mmp_fail_intervals && multihost &&
		    (start - mmp->mmp_last_write) > max_fail_ns) {
			cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
			    "succeeded in over %llus; suspending pool",
			    spa_name(spa),
			    NSEC2SEC(start - mmp->mmp_last_write));
			zio_suspend(spa, NULL);
		}

		if (multihost)
			mmp_write_uberblock(spa);

		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait_sig(&mmp->mmp_thread_cv,
		    &mmp->mmp_thread_lock, ddi_get_lbolt() +
		    ((next_time - gethrtime()) / (NANOSEC / hz)));
		CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
	}

	/* Outstanding writes are allowed to complete. */
	if (mmp->mmp_zio_root)
		zio_wait(mmp->mmp_zio_root);

	mmp->mmp_zio_root = NULL;
	mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
}
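/*
 * Standalone sketch of the max_fail_ns smoothing used in mmp_thread(),
 * with made-up inputs (all values below are assumptions for illustration).
 * When the configured window (mmp_interval * fail_intervals) shrinks, the
 * thread converges on the new value over roughly 32 iterations rather than
 * adopting it at once, so a stale mmp_last_write does not immediately
 * suspend the pool after a tunable change.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t mmp_interval = 1000000000ULL;	/* assumed 1 s, in ns */
	uint64_t fail_intervals = 5;		/* assumed new setting */
	uint64_t max_fail_ns = 50000000000ULL;	/* assumed old 50 s window */
	uint64_t target = mmp_interval * fail_intervals;
	int i;

	for (i = 0; i < 10; i++) {
		if (target < max_fail_ns) {
			/* 31/32 old + 1/32 new: exponential approach */
			max_fail_ns = ((31 * max_fail_ns) + target) / 32;
		} else {
			max_fail_ns = target;
		}
		printf("iter %2d: max_fail_ns = %llu\n", i,
		    (unsigned long long)max_fail_ns);
	}
	return (0);
}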