/*
 * Allocate and populate the mirror map for a zio, then hang it off the
 * zio as its vdev-specific data (io_vsd / io_vsd_ops).
 *
 * When the zio has no vdev, the map gets one child per DVA of the block
 * pointer and is flagged as a root map.  Otherwise it gets one child per
 * child of the zio's vdev, each addressed at the zio's own offset.
 *
 * Returns the newly allocated map.
 */
static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	mirror_map_t *map = NULL;
	mirror_child_t *child;
	int count, idx;

	if (vd != NULL) {
		/* Regular mirror: mirror_child_t entries shadow vdev_child[]. */
		count = vd->vdev_children;

		map = kmem_zalloc(offsetof(mirror_map_t, mm_child[count]),
		    KM_SLEEP);
		map->mm_children = count;
		map->mm_root = B_FALSE;
		map->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops);

		/*
		 * A replacing/spare vdev always prefers child 0; otherwise
		 * the preferred child is derived from the I/O offset.
		 */
		if (map->mm_replacing) {
			map->mm_preferred = 0;
		} else {
			map->mm_preferred =
			    (zio->io_offset >> vdev_mirror_shift) % count;
		}

		idx = 0;
		while (idx < map->mm_children) {
			child = &map->mm_child[idx];
			child->mc_vd = vd->vdev_child[idx];
			child->mc_offset = zio->io_offset;
			idx++;
		}
	} else {
		/* Root case: the "children" are the DVAs of the block. */
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;
		int picked;

		count = BP_GET_NDVAS(zio->io_bp);

		map = kmem_zalloc(offsetof(mirror_map_t, mm_child[count]),
		    KM_SLEEP);
		map->mm_children = count;
		map->mm_root = B_TRUE;
		map->mm_replacing = B_FALSE;
		map->mm_preferred = spa_get_random(count);

		/*
		 * Walk the DVAs below the randomly picked one.  Whenever a
		 * lower-index DVA sits on the same vdev as the pick, prefer
		 * it instead: it was probably allocated from the primary
		 * metaslab in use at the time and so is more likely to have
		 * locality with single-copy data.
		 */
		picked = map->mm_preferred;
		idx = picked;
		while (--idx >= 0) {
			if (DVA_GET_VDEV(&dva[idx]) ==
			    DVA_GET_VDEV(&dva[picked]))
				map->mm_preferred = idx;
		}

		idx = 0;
		while (idx < map->mm_children) {
			child = &map->mm_child[idx];
			child->mc_vd = vdev_lookup_top(spa,
			    DVA_GET_VDEV(&dva[idx]));
			child->mc_offset = DVA_GET_OFFSET(&dva[idx]);
			idx++;
		}
	}

	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	zio->io_vsd = map;

	return (map);
}
/*
 * Create the mirror map for a zio via vdev_mirror_map_alloc(), fill in
 * the vdev and offset of every child, and attach the map to the zio as
 * its vdev-specific data (io_vsd / io_vsd_ops).
 *
 * Without a vdev on the zio, the children are the DVAs of the block
 * pointer; with one, they are the vdev's own children at the zio offset.
 *
 * Returns the map.
 */
static mirror_map_t *
vdev_mirror_map_init(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	mirror_map_t *map = NULL;
	mirror_child_t *child;
	int idx;

	if (vd != NULL) {
		/* Replacing and spare vdevs are flagged as "replacing". */
		int replacing = (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops);

		map = vdev_mirror_map_alloc(vd->vdev_children, replacing,
		    B_FALSE);

		idx = 0;
		while (idx < map->mm_children) {
			child = &map->mm_child[idx];
			child->mc_vd = vd->vdev_child[idx];
			child->mc_offset = zio->io_offset;
			idx++;
		}
	} else {
		/* Root map: one child per DVA in the block pointer. */
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;

		map = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
		    B_TRUE);

		idx = 0;
		while (idx < map->mm_children) {
			child = &map->mm_child[idx];
			child->mc_vd = vdev_lookup_top(spa,
			    DVA_GET_VDEV(&dva[idx]));
			child->mc_offset = DVA_GET_OFFSET(&dva[idx]);
			idx++;
		}
	}

	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	zio->io_vsd = map;

	return (map);
}
/*
 * Three-way comparator for two dva_t objects passed as opaque pointers.
 * DVAs are ordered by vdev id first and by offset second.
 *
 * Returns -1, 0 or 1 when x1 sorts before, equal to, or after x2.
 */
static int
zil_dva_compare(const void *x1, const void *x2)
{
	const dva_t *lhs = x1;
	const dva_t *rhs = x2;

	/* Primary key: the vdev the DVA lives on. */
	if (DVA_GET_VDEV(lhs) != DVA_GET_VDEV(rhs))
		return (DVA_GET_VDEV(lhs) < DVA_GET_VDEV(rhs) ? -1 : 1);

	/* Secondary key: the offset within that vdev. */
	if (DVA_GET_OFFSET(lhs) != DVA_GET_OFFSET(rhs))
		return (DVA_GET_OFFSET(lhs) < DVA_GET_OFFSET(rhs) ? -1 : 1);

	return (0);
}
/*
 * Fill in the external verifier of a gang block.  The verifier is the
 * <vdev, offset, txg> tuple of the block, which is guaranteed to be
 * unique for the life of the pool; the fourth checksum word is zero.
 *
 * zcp receives the verifier; bp must be a gang block pointer.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
{
	dva_t *identity = BP_IDENTITY(bp);
	uint64_t birth_txg = BP_PHYSICAL_BIRTH(bp);
	uint64_t vdev_id, offset;

	ASSERT(BP_IS_GANG(bp));

	vdev_id = DVA_GET_VDEV(identity);
	offset = DVA_GET_OFFSET(identity);

	ZIO_SET_CHECKSUM(zcp, vdev_id, offset, birth_txg, 0);
}
/*
 * Allocate and populate the mirror map for a zio and attach it to the
 * zio as its vdev-specific data (io_vsd / io_vsd_ops).  Returns the map.
 *
 * With no vdev on the zio, the map has one child per DVA of the block
 * pointer and is flagged as a root map.  Otherwise it has one child per
 * child of the zio's vdev; unless the vdev is a replacing/spare vdev,
 * the preferred child is then chosen among the readable children with
 * the lowest vdev_mirror_pending() value, rotating between equally
 * loaded children every zfs_vdev_mirror_switch_us microseconds.
 *
 * Avoid inlining the function to keep vdev_mirror_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
	mirror_map_t *mm = NULL;
	mirror_child_t *mc;
	vdev_t *vd = zio->io_vd;
	int c, d;

	if (vd == NULL) {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;

		c = BP_GET_NDVAS(zio->io_bp);

		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]),
		    KM_PUSHPAGE);
		mm->mm_children = c;
		mm->mm_replacing = B_FALSE;
		mm->mm_preferred = spa_get_random(c);
		mm->mm_root = B_TRUE;

		/*
		 * Check the other, lower-index DVAs to see if they're on
		 * the same vdev as the child we picked.  If they are, use
		 * them since they are likely to have been allocated from
		 * the primary metaslab in use at the time, and hence are
		 * more likely to have locality with single-copy data.
		 */
		for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
			if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
				mm->mm_preferred = d;
		}

		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			mc->mc_vd = vdev_lookup_top(spa,
			    DVA_GET_VDEV(&dva[c]));
			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
		}
	} else {
		int lowest_pending = INT_MAX;
		int lowest_nr = 1;
		uint64_t period, slot;

		c = vd->vdev_children;

		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]),
		    KM_PUSHPAGE);
		mm->mm_children = c;
		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops);
		mm->mm_preferred = 0;
		mm->mm_root = B_FALSE;

		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];
			mc->mc_vd = vd->vdev_child[c];
			mc->mc_offset = zio->io_offset;

			/* Replacing/spare vdevs always prefer child 0. */
			if (mm->mm_replacing)
				continue;

			/*
			 * Unreadable children are marked as already tried
			 * and skipped, and get the worst possible load so
			 * they are only picked when nothing else exists.
			 */
			if (!vdev_readable(mc->mc_vd)) {
				mc->mc_error = SET_ERROR(ENXIO);
				mc->mc_tried = 1;
				mc->mc_skipped = 1;
				mc->mc_pending = INT_MAX;
				continue;
			}

			/* Track the lowest load and how many children share it. */
			mc->mc_pending = vdev_mirror_pending(mc->mc_vd);
			if (mc->mc_pending < lowest_pending) {
				lowest_pending = mc->mc_pending;
				lowest_nr = 1;
			} else if (mc->mc_pending == lowest_pending) {
				lowest_nr++;
			}
		}

		/*
		 * Pick which of the equally loaded children to use based on
		 * the current time slot.  The slot number is computed in 64
		 * bits: narrowing it to int first could produce a negative
		 * value, in which case no child would ever be selected below
		 * and mm_preferred would silently stay 0.  A zero switch
		 * interval is clamped so that it cannot divide by zero.
		 */
		period = (uint64_t)NSEC_PER_USEC * zfs_vdev_mirror_switch_us;
		if (period == 0)
			period = 1;
		slot = (uint64_t)gethrtime() / period;
		d = (int)(slot % (uint64_t)lowest_nr) + 1;

		/* Select the d-th child (1-based) that carries the lowest load. */
		for (c = 0; c < mm->mm_children; c++) {
			if (mm->mm_child[c].mc_pending == lowest_pending) {
				if (--d == 0) {
					mm->mm_preferred = c;
					break;
				}
			}
		}
	}

	zio->io_vsd = mm;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	return (mm);
}
/*
 * Create the mirror map for a zio via vdev_mirror_map_alloc(), fill in
 * the vdev and offset of every child, and attach the map to the zio as
 * its vdev-specific data (io_vsd / io_vsd_ops).
 *
 * Without a vdev on the zio, the children are the DVAs of the block
 * pointer; with one, they are the vdev's own children at the zio offset.
 *
 * Returns the map.
 */
static mirror_map_t *
vdev_mirror_map_init(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	mirror_map_t *map = NULL;
	mirror_child_t *child;
	int idx;

	if (vd != NULL) {
		boolean_t resilvering = B_FALSE;

		/*
		 * Scrub reads need special handling while a resilver is
		 * running: they must not be sent to the device that is
		 * being resilvered, since it may not hold those blocks yet.
		 *
		 * We count as resilvering only when both of these hold:
		 * 1) this is a replacing vdev (named "replacing-1",
		 *    "spare-1" or similar), and
		 * 2) the pool is being resilvered right now.
		 *
		 * vd->vdev_resilver_txg is no use here because it is not
		 * set on this path.
		 *
		 * Looking at vdev_ops alone is not enough either.  After
		 * "zpool replace pool odev spare_dev" with spare_dev taken
		 * from the spare list, or after a spare has automatically
		 * taken over for a DEGRADED device, the resilver can be
		 * finished while both the original and the spare vdev are
		 * still in the pool.  That is intentional: it supports the
		 * policy of automatically removing the spare once the user
		 * replaces the device that originally failed.
		 *
		 * spa_dsl_pool may be uninitialized while a spa load is in
		 * progress, so it is only consulted once the load state is
		 * SPA_LOAD_NONE; no resilver should run during a load anyway.
		 */
		if (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops) {
			spa_t *spa = vd->vdev_spa;

			resilvering =
			    (spa_load_state(spa) == SPA_LOAD_NONE &&
			    dsl_scan_resilvering(spa->spa_dsl_pool));
		}

		map = vdev_mirror_map_alloc(vd->vdev_children, resilvering,
		    B_FALSE);

		idx = 0;
		while (idx < map->mm_children) {
			child = &map->mm_child[idx];
			child->mc_vd = vd->vdev_child[idx];
			child->mc_offset = zio->io_offset;
			idx++;
		}
	} else {
		/* Root map: one child per DVA in the block pointer. */
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;

		map = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
		    B_TRUE);

		idx = 0;
		while (idx < map->mm_children) {
			child = &map->mm_child[idx];
			child->mc_vd = vdev_lookup_top(spa,
			    DVA_GET_VDEV(&dva[idx]));
			child->mc_offset = DVA_GET_OFFSET(&dva[idx]);
			idx++;
		}
	}

	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	zio->io_vsd = map;

	return (map);
}
/*
 * Allocate and populate the mirror map for a zio, then hang it off the
 * zio as its vdev-specific data (io_vsd / io_vsd_ops).
 *
 * When the zio has no vdev, the map gets one child per DVA of the block
 * pointer and is flagged as a root map.  Otherwise it gets one child per
 * child of the zio's vdev, each addressed at the zio's own offset, and
 * records whether the vdev is currently being resilvered.
 *
 * Returns the newly allocated map.
 */
static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	mirror_map_t *map = NULL;
	mirror_child_t *child;
	int count, idx;

	if (vd != NULL) {
		int is_replacing;

		count = vd->vdev_children;

		map = kmem_zalloc(offsetof(mirror_map_t, mm_child[count]),
		    KM_SLEEP);
		map->mm_children = count;
		map->mm_root = B_FALSE;

		/*
		 * Scrub reads need special handling while a resilver is
		 * running: they must not be sent to the device that is
		 * being resilvered, since it may not hold those blocks yet.
		 *
		 * We count as resilvering only when both of these hold:
		 * 1) this is a replacing vdev (named "replacing-1",
		 *    "spare-1" or similar), and
		 * 2) the pool is being resilvered right now.
		 *
		 * vd->vdev_resilver_txg is no use here because it is not
		 * set on this path.
		 *
		 * Looking at vdev_ops alone is not enough either.  After
		 * "zpool replace pool odev spare_dev" with spare_dev taken
		 * from the spare list, or after a spare has automatically
		 * taken over for a DEGRADED device, the resilver can be
		 * finished while both the original and the spare vdev are
		 * still in the pool.  That is intentional: it supports the
		 * policy of automatically removing the spare once the user
		 * replaces the device that originally failed.
		 */
		is_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops);

		/*
		 * spa_dsl_pool may be uninitialized while a spa load is in
		 * progress, so it is only consulted once the load state is
		 * SPA_LOAD_NONE; no resilver should run during a load anyway.
		 */
		map->mm_resilvering = B_FALSE;
		if (is_replacing) {
			if (spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
			    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool))
				map->mm_resilvering = B_TRUE;
		}

		/*
		 * While resilvering, always prefer child 0; otherwise the
		 * preferred child is derived from the I/O offset.
		 */
		if (map->mm_resilvering) {
			map->mm_preferred = 0;
		} else {
			map->mm_preferred =
			    (zio->io_offset >> vdev_mirror_shift) % count;
		}

		idx = 0;
		while (idx < map->mm_children) {
			child = &map->mm_child[idx];
			child->mc_vd = vd->vdev_child[idx];
			child->mc_offset = zio->io_offset;
			idx++;
		}
	} else {
		/* Root case: the "children" are the DVAs of the block. */
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;
		int picked;

		count = BP_GET_NDVAS(zio->io_bp);

		map = kmem_zalloc(offsetof(mirror_map_t, mm_child[count]),
		    KM_SLEEP);
		map->mm_children = count;
		map->mm_root = B_TRUE;
		map->mm_resilvering = B_FALSE;
		map->mm_preferred = spa_get_random(count);

		/*
		 * Walk the DVAs below the randomly picked one.  Whenever a
		 * lower-index DVA sits on the same vdev as the pick, prefer
		 * it instead: it was probably allocated from the primary
		 * metaslab in use at the time and so is more likely to have
		 * locality with single-copy data.
		 */
		picked = map->mm_preferred;
		idx = picked;
		while (--idx >= 0) {
			if (DVA_GET_VDEV(&dva[idx]) ==
			    DVA_GET_VDEV(&dva[picked]))
				map->mm_preferred = idx;
		}

		idx = 0;
		while (idx < map->mm_children) {
			child = &map->mm_child[idx];
			child->mc_vd = vdev_lookup_top(spa,
			    DVA_GET_VDEV(&dva[idx]));
			child->mc_offset = DVA_GET_OFFSET(&dva[idx]);
			idx++;
		}
	}

	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	zio->io_vsd = map;

	return (map);
}