int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) { int err = 0; dmu_tx_t *tx; nvlist_t *nvarg; if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa)) return (SET_ERROR(EINVAL)); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } VERIFY0(nvlist_dup(nvl, &nvarg, KM_SLEEP)); if (spa_history_zone() != NULL) { fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE, spa_history_zone()); } fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); /* Kick this off asynchronously; errors are ignored. */ dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, nvarg, 0, ZFS_SPACE_CHECK_NONE, tx); dmu_tx_commit(tx); /* spa_history_log_sync will free nvl */ return (err); }
/* * The nvlist will be consumed by this call. */ static void log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, dmu_tx_t *tx, const char *fmt, va_list adx) { char *msg; /* * If this is part of creating a pool, not everything is * initialized yet, so don't bother logging the internal events. * Likewise if the pool is not writeable. */ if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) { fnvlist_free(nvl); return; } msg = kmem_vasprintf(fmt, adx); fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); strfree(msg); fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation); fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); if (dmu_tx_is_syncing(tx)) { spa_history_log_sync(nvl, tx); } else { dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx); } /* spa_history_log_sync() will free nvl */ }
void dsl_sync_task_do_nowait(dsl_pool_t *dp, dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx) { dsl_sync_task_group_t *dstg; if (!spa_writeable(dp->dp_spa)) return; dstg = dsl_sync_task_group_create(dp); dsl_sync_task_create(dstg, checkfunc, syncfunc, arg1, arg2, blocks_modified); dsl_sync_task_group_nowait(dstg, tx); }
static int zvol_first_open(zvol_state_t *zv) { objset_t *os; uint64_t volsize; int error; uint64_t ro; /* lie and say we're read-only */ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os); if (error) return (SET_ERROR(-error)); zv->zv_objset = os; error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) goto out_owned; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_owned; error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); if (error) goto out_owned; set_capacity(zv->zv_disk, volsize >> 9); zv->zv_volsize = volsize; zv->zv_zilog = zil_open(os, zvol_get_data); if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { set_disk_ro(zv->zv_disk, 1); zv->zv_flags |= ZVOL_RDONLY; } else { set_disk_ro(zv->zv_disk, 0); zv->zv_flags &= ~ZVOL_RDONLY; } out_owned: if (error) { dmu_objset_disown(os, zvol_tag); zv->zv_objset = NULL; } return (SET_ERROR(-error)); }
void mmp_thread_start(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; if (spa_writeable(spa)) { mutex_enter(&mmp->mmp_thread_lock); if (!mmp->mmp_thread) { dprintf("mmp_thread_start pool %s\n", spa->spa_name); mmp->mmp_thread = thread_create(NULL, 0, mmp_thread, spa, 0, &p0, TS_RUN, defclsyspri); } mutex_exit(&mmp->mmp_thread_lock); } }
int dsl_sync_task_do(dsl_pool_t *dp, dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, void *arg1, void *arg2, int blocks_modified) { dsl_sync_task_group_t *dstg; int err; ASSERT(spa_writeable(dp->dp_spa)); dstg = dsl_sync_task_group_create(dp); dsl_sync_task_create(dstg, checkfunc, syncfunc, arg1, arg2, blocks_modified); err = dsl_sync_task_group_wait(dstg); dsl_sync_task_group_destroy(dstg); return (err); }
static void traverse_zil(traverse_data_t *td, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zilog_t *zilog; /* * We only want to visit blocks that have been claimed but not yet * replayed; plus, in read-only mode, blocks that are already stable. */ if (claim_txg == 0 && spa_writeable(td->td_spa)) return; zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, claim_txg); zil_free(zilog); }
static void dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zil_scan_arg_t zsa = { dp, zh }; zilog_t *zilog; /* * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). */ if (claim_txg == 0 && spa_writeable(dp->dp_spa)) return; zilog = zil_alloc(dp->dp_meta_objset, zh); (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, claim_txg); zil_free(zilog); }
/* * The nvlist will be consumed by this call. */ static void log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, dmu_tx_t *tx, const char *fmt, va_list adx) { char *msg; va_list adx1; int size; /* * If this is part of creating a pool, not everything is * initialized yet, so don't bother logging the internal events. * Likewise if the pool is not writeable. */ if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) { fnvlist_free(nvl); return; } va_copy(adx1, adx); size = vsnprintf(NULL, 0, fmt, adx1) + 1; msg = kmem_alloc(size, KM_PUSHPAGE); va_end(adx1); va_copy(adx1, adx); (void) vsprintf(msg, fmt, adx1); va_end(adx1); fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); kmem_free(msg, size); fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation); fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); if (dmu_tx_is_syncing(tx)) { spa_history_log_sync(nvl, tx); } else { dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, nvl, 0, tx); } /* spa_history_log_sync() will free nvl */ }
int zfs_sb_setup(zfs_sb_t *zsb, boolean_t mounting) { int error; error = zfs_register_callbacks(zsb); if (error) return (error); /* * Set the objset user_ptr to track its zsb. */ mutex_enter(&zsb->z_os->os_user_ptr_lock); dmu_objset_set_user(zsb->z_os, zsb); mutex_exit(&zsb->z_os->os_user_ptr_lock); zsb->z_log = zil_open(zsb->z_os, zfs_get_data); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { boolean_t readonly; /* * During replay we remove the read only flag to * allow replays to succeed. */ readonly = zfs_is_readonly(zsb); if (readonly != 0) readonly_changed_cb(zsb, B_FALSE); else zfs_unlinked_drain(zsb); /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zsb->z_os))) { if (zil_replay_disable) { zil_destroy(zsb->z_log, B_FALSE); } else { zsb->z_replay = B_TRUE; zil_replay(zsb->z_os, zsb, zfs_replay_vector); zsb->z_replay = B_FALSE; } } /* restore readonly bit */ if (readonly != 0) readonly_changed_cb(zsb, B_TRUE); } return (0); }
int zfs_register_callbacks(zfs_sb_t *zsb) { struct dsl_dataset *ds = NULL; objset_t *os = zsb->z_os; zfs_mntopts_t *zmo = zsb->z_mntopts; int error = 0; ASSERT(zsb); ASSERT(zmo); /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (zfs_is_readonly(zsb) || !spa_writeable(dmu_objset_spa(os))) { zmo->z_do_readonly = B_TRUE; zmo->z_readonly = B_TRUE; } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zsb); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (zmo->z_do_readonly) readonly_changed_cb(zsb, zmo->z_readonly); if (zmo->z_do_setuid) setuid_changed_cb(zsb, zmo->z_setuid); if (zmo->z_do_exec) exec_changed_cb(zsb, zmo->z_exec); if (zmo->z_do_devices) devices_changed_cb(zsb, zmo->z_devices); if (zmo->z_do_xattr) xattr_changed_cb(zsb, zmo->z_xattr); if (zmo->z_do_atime) atime_changed_cb(zsb, zmo->z_atime); if (zmo->z_do_relatime) relatime_changed_cb(zsb, zmo->z_relatime); if (zmo->z_do_nbmand) nbmand_changed_cb(zsb, zmo->z_nbmand); return (0); unregister: /* * We may attempt to unregister some callbacks that are not * registered, but this is OK; it will simply return ENOMSG, * which we will ignore. */ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zsb); return (error); }
static int vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *ashift) { spa_t *spa = vd->vdev_spa; vdev_disk_t *dvd; struct dk_minfo_ext dkmext; int error; dev_t dev; int otyp; boolean_t validate_devid = B_FALSE; ddi_devid_t devid; /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } /* * Reopen the device if it's not currently open. Otherwise, * just update the physical size of the device. */ if (vd->vdev_tsd != NULL) { ASSERT(vd->vdev_reopening); dvd = vd->vdev_tsd; goto skip_open; } dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); /* * When opening a disk device, we want to preserve the user's original * intent. We always want to open the device by the path the user gave * us, even if it is one of multiple paths to the save device. But we * also want to be able to survive disks being removed/recabled. * Therefore the sequence of opening devices is: * * 1. Try opening the device by path. For legacy pools without the * 'whole_disk' property, attempt to fix the path by appending 's0'. * * 2. If the devid of the device matches the stored value, return * success. * * 3. Otherwise, the device may have moved. Try opening the device * by the devid instead. */ if (vd->vdev_devid != NULL) { if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, &dvd->vd_minor) != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } } error = EINVAL; /* presume failure */ if (vd->vdev_path != NULL) { if (vd->vdev_wholedisk == -1ULL) { size_t len = strlen(vd->vdev_path) + 3; char *buf = kmem_alloc(len, KM_SLEEP); ldi_handle_t lh; (void) snprintf(buf, len, "%ss0", vd->vdev_path); if (ldi_open_by_name(buf, spa_mode(spa), kcred, &lh, zfs_li) == 0) { spa_strfree(vd->vdev_path); vd->vdev_path = buf; vd->vdev_wholedisk = 1ULL; (void) ldi_close(lh, spa_mode(spa), kcred); } else { kmem_free(buf, len); } } error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); /* * Compare the devid to the stored value. */ if (error == 0 && vd->vdev_devid != NULL && ldi_get_devid(dvd->vd_lh, &devid) == 0) { if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { error = SET_ERROR(EINVAL); (void) ldi_close(dvd->vd_lh, spa_mode(spa), kcred); dvd->vd_lh = NULL; } ddi_devid_free(devid); } /* * If we succeeded in opening the device, but 'vdev_wholedisk' * is not yet set, then this must be a slice. */ if (error == 0 && vd->vdev_wholedisk == -1ULL) vd->vdev_wholedisk = 0; } /* * If we were unable to open by path, or the devid check fails, open by * devid instead. */ if (error != 0 && vd->vdev_devid != NULL) { error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } /* * If all else fails, then try opening by physical path (if available) * or the logical path (if we failed due to the devid check). While not * as reliable as the devid, this will give us something, and the higher * level vdev validation will prevent us from opening the wrong device. */ if (error) { if (vd->vdev_devid != NULL) validate_devid = B_TRUE; if (vd->vdev_physpath != NULL && (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); /* * Note that we don't support the legacy auto-wholedisk support * as above. This hasn't been used in a very long time and we * don't need to propagate its oddities to this edge condition. */ if (error && vd->vdev_path != NULL) error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } /* * Now that the device has been successfully opened, update the devid * if necessary. */ if (validate_devid && spa_writeable(spa) && ldi_get_devid(dvd->vd_lh, &devid) == 0) { if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { char *vd_devid; vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor); zfs_dbgmsg("vdev %s: update devid from %s, " "to %s", vd->vdev_path, vd->vdev_devid, vd_devid); spa_strfree(vd->vdev_devid); vd->vdev_devid = spa_strdup(vd_devid); ddi_devid_str_free(vd_devid); } ddi_devid_free(devid); } /* * Once a device is opened, verify that the physical device path (if * available) is up to date. */ if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { char *physpath, *minorname; physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); minorname = NULL; if (ddi_dev_pathname(dev, otyp, physpath) == 0 && ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && (vd->vdev_physpath == NULL || strcmp(vd->vdev_physpath, physpath) != 0)) { if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); (void) strlcat(physpath, ":", MAXPATHLEN); (void) strlcat(physpath, minorname, MAXPATHLEN); vd->vdev_physpath = spa_strdup(physpath); } if (minorname) kmem_free(minorname, strlen(minorname) + 1); kmem_free(physpath, MAXPATHLEN); } skip_open: /* * Determine the actual size of the device. */ if (ldi_get_size(dvd->vd_lh, psize) != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (SET_ERROR(EINVAL)); } /* * Determine the device's minimum transfer size. * If the ioctl isn't supported, assume DEV_BSIZE. */ if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)&dkmext, FKIOCTL, kcred, NULL) != 0) dkmext.dki_pbsize = DEV_BSIZE; *ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1; if (vd->vdev_wholedisk == 1) { uint64_t capacity = dkmext.dki_capacity - 1; uint64_t blksz = dkmext.dki_lbsize; int wce = 1; /* * If we own the whole disk, try to enable disk write caching. * We ignore errors because it's OK if we can't do it. */ (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, FKIOCTL, kcred, NULL); *max_psize = *psize + vdev_disk_get_space(vd, capacity, blksz); zfs_dbgmsg("capacity change: vdev %s, psize %llu, " "max_psize %llu", vd->vdev_path, *psize, *max_psize); } else { *max_psize = *psize; } /* * Clear the nowritecache bit, so that on a vdev_reopen() we will * try again. */ vd->vdev_nowritecache = B_FALSE; return (0); }
static int __zvol_create_minor(const char *name, boolean_t ignore_snapdev) { zvol_state_t *zv; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; unsigned minor = 0; int error = 0; ASSERT(MUTEX_HELD(&zvol_state_lock)); zv = zvol_find_by_name(name); if (zv) { error = EEXIST; goto out; } if (ignore_snapdev == B_FALSE) { error = __zvol_snapdev_hidden(name); if (error) goto out; } doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os); if (error) goto out_doi; error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_dmu_objset_disown; error = zvol_find_minor(&minor); if (error) goto out_dmu_objset_disown; zv = zvol_alloc(MKDEV(zvol_major, minor), name); if (zv == NULL) { error = EAGAIN; goto out_dmu_objset_disown; } if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; zv->zv_volblocksize = doi->doi_data_block_size; zv->zv_volsize = volsize; zv->zv_objset = os; set_capacity(zv->zv_disk, zv->zv_volsize >> 9); blk_queue_max_hw_sectors(zv->zv_queue, UINT_MAX); blk_queue_max_segments(zv->zv_queue, UINT16_MAX); blk_queue_max_segment_size(zv->zv_queue, UINT_MAX); blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize); blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize); #ifdef HAVE_BLK_QUEUE_DISCARD blk_queue_max_discard_sectors(zv->zv_queue, (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue); #endif #ifdef HAVE_BLK_QUEUE_NONROT queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue); #endif if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) zil_destroy(dmu_objset_zil(os), B_FALSE); else zil_replay(os, zv, zvol_replay_vector); } zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, zvol_tag); out_doi: kmem_free(doi, sizeof(dmu_object_info_t)); out: if (error == 0) { zvol_insert(zv); add_disk(zv->zv_disk); } return (error); }
static int vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *ashift) { spa_t *spa = vd->vdev_spa; vdev_disk_t *dvd = vd->vdev_tsd; ldi_ev_cookie_t ecookie; vdev_disk_ldi_cb_t *lcb; union { struct dk_minfo_ext ude; struct dk_minfo ud; } dks; struct dk_minfo_ext *dkmext = &dks.ude; struct dk_minfo *dkm = &dks.ud; int error; /* XXX Apple - must leave devid unchanged */ #ifdef illumos dev_t dev; int otyp; boolean_t validate_devid = B_FALSE; ddi_devid_t devid; #endif uint64_t capacity = 0, blksz = 0, pbsize; #ifdef __APPLE__ int isssd; #endif /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } /* * Reopen the device if it's not currently open. Otherwise, * just update the physical size of the device. */ if (dvd != NULL) { if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) { /* * If we are opening a device in its offline notify * context, the LDI handle was just closed. Clean * up the LDI event callbacks and free vd->vdev_tsd. */ vdev_disk_free(vd); } else { ASSERT(vd->vdev_reopening); goto skip_open; } } /* * Create vd->vdev_tsd. */ vdev_disk_alloc(vd); dvd = vd->vdev_tsd; /* * When opening a disk device, we want to preserve the user's original * intent. We always want to open the device by the path the user gave * us, even if it is one of multiple paths to the same device. But we * also want to be able to survive disks being removed/recabled. * Therefore the sequence of opening devices is: * * 1. Try opening the device by path. For legacy pools without the * 'whole_disk' property, attempt to fix the path by appending 's0'. * * 2. If the devid of the device matches the stored value, return * success. * * 3. Otherwise, the device may have moved. Try opening the device * by the devid instead. */ /* * XXX We must not set or modify the devid as this check would prevent * import on Solaris/illumos. */ #ifdef illumos if (vd->vdev_devid != NULL) { if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, &dvd->vd_minor) != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; vdev_dbgmsg(vd, "vdev_disk_open: invalid " "vdev_devid '%s'", vd->vdev_devid); return (SET_ERROR(EINVAL)); } } #endif error = EINVAL; /* presume failure */ if (vd->vdev_path != NULL) { /* * XXX This assumes that if vdev_path refers to a device path /dev/dsk/cNtNdN, * then the whole disk can be found by slice 0 at path /dev/dsk/cNtNdNs0. */ #ifdef illumos if (vd->vdev_wholedisk == -1ULL) { size_t len = strlen(vd->vdev_path) + 3; char *buf = kmem_alloc(len, KM_SLEEP); (void) snprintf(buf, len, "%ss0", vd->vdev_path); error = ldi_open_by_name(buf, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); if (error == 0) { spa_strfree(vd->vdev_path); vd->vdev_path = buf; vd->vdev_wholedisk = 1ULL; } else { kmem_free(buf, len); } } #endif /* * If we have not yet opened the device, try to open it by the * specified path. */ if (error != 0) { error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } /* XXX Apple - must leave devid unchanged */ #ifdef illumos /* * Compare the devid to the stored value. */ if (error == 0 && vd->vdev_devid != NULL && ldi_get_devid(dvd->vd_lh, &devid) == 0) { if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { error = SET_ERROR(EINVAL); (void) ldi_close(dvd->vd_lh, spa_mode(spa), kcred); dvd->vd_lh = NULL; } ddi_devid_free(devid); } #endif /* * If we succeeded in opening the device, but 'vdev_wholedisk' * is not yet set, then this must be a slice. */ if (error == 0 && vd->vdev_wholedisk == -1ULL) vd->vdev_wholedisk = 0; } /* XXX Apple - must leave devid unchanged */ #ifdef illumos /* * If we were unable to open by path, or the devid check fails, open by * devid instead. */ if (error != 0 && vd->vdev_devid != NULL) { error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } #endif /* * If all else fails, then try opening by physical path (if available) * or the logical path (if we failed due to the devid check). While not * as reliable as the devid, this will give us something, and the higher * level vdev validation will prevent us from opening the wrong device. */ if (error) { /* XXX Apple - must leave devid unchanged */ #ifdef illumos if (vd->vdev_devid != NULL) validate_devid = B_TRUE; #endif /* XXX Apple to do - make ddi_ interface for this, using IORegistry path */ #ifdef illumos if (vd->vdev_physpath != NULL && (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); #endif /* * Note that we don't support the legacy auto-wholedisk support * as above. This hasn't been used in a very long time and we * don't need to propagate its oddities to this edge condition. */ if (error && vd->vdev_path != NULL) error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]", error); return (error); } /* * XXX Apple - We must not set or modify the devid. Import on Solaris/illumos * expects a valid devid and fails if it cannot be decoded. */ #ifdef illumos /* * Now that the device has been successfully opened, update the devid * if necessary. */ if (validate_devid && spa_writeable(spa) && ldi_get_devid(dvd->vd_lh, &devid) == 0) { if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { char *vd_devid; vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor); vdev_dbgmsg(vd, "vdev_disk_open: update devid from " "'%s' to '%s'", vd->vdev_devid, vd_devid); spa_strfree(vd->vdev_devid); vd->vdev_devid = spa_strdup(vd_devid); ddi_devid_str_free(vd_devid); } ddi_devid_free(devid); } #endif /* XXX Apple to do, needs IORegistry physpath interface */ #ifdef illumos /* * Once a device is opened, verify that the physical device path (if * available) is up to date. */ if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { char *physpath, *minorname; physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); minorname = NULL; if (ddi_dev_pathname(dev, otyp, physpath) == 0 && ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && (vd->vdev_physpath == NULL || strcmp(vd->vdev_physpath, physpath) != 0)) { if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); (void) strlcat(physpath, ":", MAXPATHLEN); (void) strlcat(physpath, minorname, MAXPATHLEN); vd->vdev_physpath = spa_strdup(physpath); } if (minorname) kmem_free(minorname, strlen(minorname) + 1); kmem_free(physpath, MAXPATHLEN); } #endif /* * Register callbacks for the LDI offline event. */ if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) == LDI_EV_SUCCESS) { lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); list_insert_tail(&dvd->vd_ldi_cbs, lcb); (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id); } /* XXX Apple to do - we could support the degrade event, or just no-op */ #ifdef illumos /* * Register callbacks for the LDI degrade event. */ if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) == LDI_EV_SUCCESS) { lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); list_insert_tail(&dvd->vd_ldi_cbs, lcb); (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id); } #endif #if 0 int len = MAXPATHLEN; if (vn_getpath(devvp, dvd->vd_readlinkname, &len) == 0) { dprintf("ZFS: '%s' resolved name is '%s'\n", vd->vdev_path, dvd->vd_readlinkname); } else { dvd->vd_readlinkname[0] = 0; } #endif skip_open: /* * Determine the actual size of the device. */ if (ldi_get_size(dvd->vd_lh, psize) != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; vdev_dbgmsg(vd, "vdev_disk_open: failed to get size"); return (SET_ERROR(EINVAL)); } *max_psize = *psize; /* * Determine the device's minimum transfer size. * If the ioctl isn't supported, assume DEV_BSIZE. */ if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) { capacity = dkmext->dki_capacity - 1; blksz = dkmext->dki_lbsize; pbsize = dkmext->dki_pbsize; } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) { VDEV_DEBUG( "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n", vd->vdev_path); capacity = dkm->dki_capacity - 1; blksz = dkm->dki_lbsize; pbsize = blksz; } else { VDEV_DEBUG("vdev_disk_open(\"%s\"): " "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n", vd->vdev_path, error); pbsize = DEV_BSIZE; } *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1; /* XXX Now that we opened the device, determine if it is a whole disk. */ #ifdef __APPLE__ /* * XXX Apple to do - provide an ldi_ mechanism * to report whether this is a whole disk or a * partition. * Return 0 (no), 1 (yes), or -1 (error). */ // vd->vdev_wholedisk = ldi_is_wholedisk(vd->vd_lh); #endif if (vd->vdev_wholedisk == 1) { int wce = 1; /* Gets information about the disk if it has GPT partitions */ #ifdef illumos if (error == 0) { /* * If we have the capability to expand, we'd have * found out via success from DKIOCGMEDIAINFO{,EXT}. * Adjust max_psize upward accordingly since we know * we own the whole disk now. */ *max_psize = capacity * blksz; } #endif /* * Since we own the whole disk, try to enable disk write * caching. We ignore errors because it's OK if we can't do it. */ (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, FKIOCTL, kcred, NULL); } /* * Clear the nowritecache bit, so that on a vdev_reopen() we will * try again. */ vd->vdev_nowritecache = B_FALSE; #ifdef __APPLE__ /* Inform the ZIO pipeline that we are non-rotational */ vd->vdev_nonrot = B_FALSE; if (ldi_ioctl(dvd->vd_lh, DKIOCISSOLIDSTATE, (intptr_t)&isssd, FKIOCTL, kcred, NULL) == 0) { vd->vdev_nonrot = (isssd ? B_TRUE : B_FALSE); } #endif //__APPLE__ return (0); }
static int vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *ashift) { spa_t *spa = vd->vdev_spa; vdev_disk_t *dvd; ldi_ev_cookie_t ecookie; vdev_disk_ldi_cb_t *lcb; union { struct dk_minfo_ext ude; struct dk_minfo ud; } dks; struct dk_minfo_ext *dkmext = &dks.ude; struct dk_minfo *dkm = &dks.ud; int error; dev_t dev; int otyp; boolean_t validate_devid = B_FALSE; ddi_devid_t devid; uint64_t capacity = 0, blksz = 0, pbsize; /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } rw_enter(&vd->vdev_tsd_lock, RW_WRITER); dvd = vd->vdev_tsd; /* * Reopen the device if it's not currently open. Otherwise, * just update the physical size of the device. */ if (dvd != NULL) { ASSERT(vd->vdev_reopening); /* * Here vd_lh is protected by vdev_tsd_lock */ ASSERT(dvd->vd_lh != NULL); /* This should not happen, but let's be safe */ if (dvd->vd_lh == NULL) { /* What are we going to do here??? */ rw_exit(&vd->vdev_tsd_lock); return (SET_ERROR(ENXIO)); } goto skip_open; } /* * Create dvd to be used as vd->vdev_tsd. */ vd->vdev_tsd = dvd = vdev_disk_alloc(); /* * When opening a disk device, we want to preserve the user's original * intent. We always want to open the device by the path the user gave * us, even if it is one of multiple paths to the same device. But we * also want to be able to survive disks being removed/recabled. * Therefore the sequence of opening devices is: * * 1. Try opening the device by path. For legacy pools without the * 'whole_disk' property, attempt to fix the path by appending 's0'. * * 2. If the devid of the device matches the stored value, return * success. * * 3. Otherwise, the device may have moved. Try opening the device * by the devid instead. */ if (vd->vdev_devid != NULL) { if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, &dvd->vd_minor) != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; vdev_disk_free_locked(vd); rw_exit(&vd->vdev_tsd_lock); return (SET_ERROR(EINVAL)); } } error = EINVAL; /* presume failure */ if (vd->vdev_path != NULL) { if (vd->vdev_wholedisk == -1ULL) { size_t len = strlen(vd->vdev_path) + 3; char *buf = kmem_alloc(len, KM_SLEEP); (void) snprintf(buf, len, "%ss0", vd->vdev_path); error = ldi_open_by_name(buf, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); if (error == 0) { spa_strfree(vd->vdev_path); vd->vdev_path = buf; vd->vdev_wholedisk = 1ULL; } else { kmem_free(buf, len); } } /* * If we have not yet opened the device, try to open it by the * specified path. */ if (error != 0) { error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } /* * Compare the devid to the stored value. */ if (error == 0 && vd->vdev_devid != NULL && ldi_get_devid(dvd->vd_lh, &devid) == 0) { if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { error = SET_ERROR(EINVAL); (void) ldi_close(dvd->vd_lh, spa_mode(spa), kcred); dvd->vd_lh = NULL; } ddi_devid_free(devid); } /* * If we succeeded in opening the device, but 'vdev_wholedisk' * is not yet set, then this must be a slice. */ if (error == 0 && vd->vdev_wholedisk == -1ULL) vd->vdev_wholedisk = 0; } /* * If we were unable to open by path, or the devid check fails, open by * devid instead. */ if (error != 0 && vd->vdev_devid != NULL) { error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } /* * If all else fails, then try opening by physical path (if available) * or the logical path (if we failed due to the devid check). While not * as reliable as the devid, this will give us something, and the higher * level vdev validation will prevent us from opening the wrong device. */ if (error) { if (vd->vdev_devid != NULL) validate_devid = B_TRUE; if (vd->vdev_physpath != NULL && (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); /* * Note that we don't support the legacy auto-wholedisk support * as above. This hasn't been used in a very long time and we * don't need to propagate its oddities to this edge condition. */ if (error && vd->vdev_path != NULL) error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; vdev_disk_free_locked(vd); rw_exit(&vd->vdev_tsd_lock); return (error); } /* * Now that the device has been successfully opened, update the devid * if necessary. */ if (validate_devid && spa_writeable(spa) && ldi_get_devid(dvd->vd_lh, &devid) == 0) { if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { char *vd_devid; vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor); zfs_dbgmsg("vdev %s: update devid from %s, " "to %s", vd->vdev_path, vd->vdev_devid, vd_devid); spa_strfree(vd->vdev_devid); vd->vdev_devid = spa_strdup(vd_devid); ddi_devid_str_free(vd_devid); } ddi_devid_free(devid); } /* * Once a device is opened, verify that the physical device path (if * available) is up to date. */ if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { char *physpath, *minorname; physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); minorname = NULL; if (ddi_dev_pathname(dev, otyp, physpath) == 0 && ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && (vd->vdev_physpath == NULL || strcmp(vd->vdev_physpath, physpath) != 0)) { if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); (void) strlcat(physpath, ":", MAXPATHLEN); (void) strlcat(physpath, minorname, MAXPATHLEN); vd->vdev_physpath = spa_strdup(physpath); } if (minorname) kmem_free(minorname, strlen(minorname) + 1); kmem_free(physpath, MAXPATHLEN); } /* * Register callbacks for the LDI offline event. */ if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) == LDI_EV_SUCCESS) { lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); list_insert_tail(&dvd->vd_ldi_cbs, lcb); (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id); } /* * Register callbacks for the LDI degrade event. */ if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) == LDI_EV_SUCCESS) { lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); list_insert_tail(&dvd->vd_ldi_cbs, lcb); (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id); } /* Reset TRIM flag, as underlying device support may have changed */ vd->vdev_notrim = B_FALSE; skip_open: ASSERT(dvd != NULL); /* * Determine the actual size of the device. */ if (ldi_get_size(dvd->vd_lh, psize) != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; vdev_disk_free_locked(vd); rw_exit(&vd->vdev_tsd_lock); return (SET_ERROR(EINVAL)); } *max_psize = *psize; /* * Determine the device's minimum transfer size. * If the ioctl isn't supported, assume DEV_BSIZE. */ if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) { capacity = dkmext->dki_capacity - 1; blksz = dkmext->dki_lbsize; pbsize = dkmext->dki_pbsize; } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) { VDEV_DEBUG( "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n", vd->vdev_path); capacity = dkm->dki_capacity - 1; blksz = dkm->dki_lbsize; pbsize = blksz; } else { VDEV_DEBUG("vdev_disk_open(\"%s\"): " "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n", vd->vdev_path, error); pbsize = DEV_BSIZE; } *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1; if (vd->vdev_wholedisk == 1) { int wce = 1; if (error == 0) { /* * If we have the capability to expand, we'd have * found out via success from DKIOCGMEDIAINFO{,EXT}. * Adjust max_psize upward accordingly since we know * we own the whole disk now. */ *max_psize += vdev_disk_get_space(vd, capacity, blksz); zfs_dbgmsg("capacity change: vdev %s, psize %llu, " "max_psize %llu", vd->vdev_path, *psize, *max_psize); } /* * Since we own the whole disk, try to enable disk write * caching. We ignore errors because it's OK if we can't do it. */ (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, FKIOCTL, kcred, NULL); } /* * We are done with vd_lh and vdev_tsd, release the vdev_tsd_lock */ rw_exit(&vd->vdev_tsd_lock); /* * Clear the nowritecache bit, so that on a vdev_reopen() we will * try again. */ vd->vdev_nowritecache = B_FALSE; return (0); }
/* * Synchronize pool configuration to disk. This must be called with the * namespace lock held. Synchronizing the pool cache is typically done after * the configuration has been synced to the MOS. This exposes a window where * the MOS config will have been updated but the cache file has not. If * the system were to crash at that instant then the cached config may not * contain the correct information to open the pool and an explicity import * would be required. */ void spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; char *pool_name; ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (rootdir == NULL || !(spa_mode_global & FWRITE)) return; /* * Iterate over all cachefiles for the pool, past or present. When the * cachefile is changed, the new one is pushed onto this list, allowing * us to update previous cachefiles that no longer contain this pool. */ for (dp = list_head(&target->spa_config_list); dp != NULL; dp = list_next(&target->spa_config_list, dp)) { spa_t *spa = NULL; if (dp->scd_path == NULL) continue; /* * Iterate over all pools, adding any matching pools to 'nvl'. */ nvl = NULL; while ((spa = spa_next(spa)) != NULL) { /* * Skip over our own pool if we're about to remove * ourselves from the spa namespace or any pool that * is readonly. Since we cannot guarantee that a * readonly pool would successfully import upon reboot, * we don't allow them to be written to the cache file. */ if ((spa == target && removing) || !spa_writeable(spa)) continue; mutex_enter(&spa->spa_props_lock); tdp = list_head(&spa->spa_config_list); if (spa->spa_config == NULL || tdp->scd_path == NULL || strcmp(tdp->scd_path, dp->scd_path) != 0) { mutex_exit(&spa->spa_props_lock); continue; } if (nvl == NULL) VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { VERIFY0(nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME, &pool_name)); } else pool_name = spa_name(spa); VERIFY(nvlist_add_nvlist(nvl, pool_name, spa->spa_config) == 0); mutex_exit(&spa->spa_props_lock); } spa_config_write(dp, nvl); nvlist_free(nvl); } /* * Remove any config entries older than the current one. */ dp = list_head(&target->spa_config_list); while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { list_remove(&target->spa_config_list, tdp); if (tdp->scd_path != NULL) spa_strfree(tdp->scd_path); kmem_free(tdp, sizeof (spa_config_dirent_t)); } spa_config_generation++; if (postsysevent) spa_event_notify(target, NULL, FM_EREPORT_ZFS_CONFIG_SYNC); }
/* * Synchronize pool configuration to disk. This must be called with the * namespace lock held. Synchronizing the pool cache is typically done after * the configuration has been synced to the MOS. This exposes a window where * the MOS config will have been updated but the cache file has not. If * the system were to crash at that instant then the cached config may not * contain the correct information to open the pool and an explicity import * would be required. */ void spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; boolean_t ccw_failure; int error; ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (rootdir == NULL || !(spa_mode_global & FWRITE)) return; /* * Iterate over all cachefiles for the pool, past or present. When the * cachefile is changed, the new one is pushed onto this list, allowing * us to update previous cachefiles that no longer contain this pool. */ ccw_failure = B_FALSE; for (dp = list_head(&target->spa_config_list); dp != NULL; dp = list_next(&target->spa_config_list, dp)) { spa_t *spa = NULL; if (dp->scd_path == NULL) continue; /* * Iterate over all pools, adding any matching pools to 'nvl'. */ nvl = NULL; while ((spa = spa_next(spa)) != NULL) { nvlist_t *nvroot = NULL; /* * Skip over our own pool if we're about to remove * ourselves from the spa namespace or any pool that * is readonly. Since we cannot guarantee that a * readonly pool would successfully import upon reboot, * we don't allow them to be written to the cache file. */ if ((spa == target && removing) || (spa_state(spa) == POOL_STATE_ACTIVE && !spa_writeable(spa))) continue; mutex_enter(&spa->spa_props_lock); tdp = list_head(&spa->spa_config_list); if (spa->spa_config == NULL || tdp->scd_path == NULL || strcmp(tdp->scd_path, dp->scd_path) != 0) { mutex_exit(&spa->spa_props_lock); continue; } if (nvl == NULL) VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist(nvl, spa->spa_name, spa->spa_config) == 0); mutex_exit(&spa->spa_props_lock); if (nvlist_lookup_nvlist(nvl, spa->spa_name, &nvroot) == 0) spa_config_clean(nvroot); } error = spa_config_write(dp, nvl); if (error != 0) ccw_failure = B_TRUE; nvlist_free(nvl); } if (ccw_failure) { /* * Keep trying so that configuration data is * written if/when any temporary filesystem * resource issues are resolved. */ if (target->spa_ccw_fail_time == 0) { zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, target, NULL, NULL, 0, 0); } target->spa_ccw_fail_time = gethrtime(); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); } else { /* * Do not rate limit future attempts to update * the config cache. */ target->spa_ccw_fail_time = 0; } /* * Remove any config entries older than the current one. */ dp = list_head(&target->spa_config_list); while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { list_remove(&target->spa_config_list, tdp); if (tdp->scd_path != NULL) spa_strfree(tdp->scd_path); kmem_free(tdp, sizeof (spa_config_dirent_t)); } spa_config_generation++; if (postsysevent) spa_event_notify(target, NULL, ESC_ZFS_CONFIG_SYNC); }
static int zvol_first_open(zvol_state_t *zv) { objset_t *os; uint64_t volsize; int locked = 0; int error; uint64_t ro; /* * In all other cases the spa_namespace_lock is taken before the * bdev->bd_mutex lock. But in this case the Linux __blkdev_get() * function calls fops->open() with the bdev->bd_mutex lock held. * * To avoid a potential lock inversion deadlock we preemptively * try to take the spa_namespace_lock(). Normally it will not * be contended and this is safe because spa_open_common() handles * the case where the caller already holds the spa_namespace_lock. * * When it is contended we risk a lock inversion if we were to * block waiting for the lock. Luckily, the __blkdev_get() * function allows us to return -ERESTARTSYS which will result in * bdev->bd_mutex being dropped, reacquired, and fops->open() being * called again. This process can be repeated safely until both * locks are acquired. */ if (!mutex_owned(&spa_namespace_lock)) { locked = mutex_tryenter(&spa_namespace_lock); if (!locked) return (-ERESTARTSYS); } /* lie and say we're read-only */ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os); if (error) goto out_mutex; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) { dmu_objset_disown(os, zvol_tag); goto out_mutex; } zv->zv_objset = os; error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); if (error) { dmu_objset_disown(os, zvol_tag); goto out_mutex; } set_capacity(zv->zv_disk, volsize >> 9); zv->zv_volsize = volsize; zv->zv_zilog = zil_open(os, zvol_get_data); VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL) == 0); if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { set_disk_ro(zv->zv_disk, 1); zv->zv_flags |= ZVOL_RDONLY; } else { set_disk_ro(zv->zv_disk, 0); zv->zv_flags &= ~ZVOL_RDONLY; } out_mutex: if (locked) mutex_exit(&spa_namespace_lock); return (-error); }
/* * Create a block device minor node and setup the linkage between it * and the specified volume. Once this function returns the block * device is live and ready for use. */ static int zvol_create_minor_impl(const char *name) { zvol_state_t *zv; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; uint64_t len; unsigned minor = 0; int error = 0; mutex_enter(&zvol_state_lock); zv = zvol_find_by_name(name); if (zv) { error = SET_ERROR(EEXIST); goto out; } doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os); if (error) goto out_doi; error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_dmu_objset_disown; error = zvol_find_minor(&minor); if (error) goto out_dmu_objset_disown; zv = zvol_alloc(MKDEV(zvol_major, minor), name); if (zv == NULL) { error = SET_ERROR(EAGAIN); goto out_dmu_objset_disown; } if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; zv->zv_volblocksize = doi->doi_data_block_size; zv->zv_volsize = volsize; zv->zv_objset = os; set_capacity(zv->zv_disk, zv->zv_volsize >> 9); blk_queue_max_hw_sectors(zv->zv_queue, (DMU_MAX_ACCESS / 4) >> 9); blk_queue_max_segments(zv->zv_queue, UINT16_MAX); blk_queue_max_segment_size(zv->zv_queue, UINT_MAX); blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize); blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize); blk_queue_max_discard_sectors(zv->zv_queue, (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue); #ifdef QUEUE_FLAG_NONROT queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue); #endif #ifdef QUEUE_FLAG_ADD_RANDOM queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue); #endif if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) zil_destroy(dmu_objset_zil(os), B_FALSE); else zil_replay(os, zv, zvol_replay_vector); } /* * When udev detects the addition of the device it will immediately * invoke blkid(8) to determine the type of content on the device. * Prefetching the blocks commonly scanned by blkid(8) will speed * up this process. */ len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE); if (len > 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, ZIO_PRIORITY_SYNC_READ); } zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, zvol_tag); out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); out: if (error == 0) { zvol_insert(zv); /* * Drop the lock to prevent deadlock with sys_open() -> * zvol_open(), which first takes bd_disk->bd_mutex and then * takes zvol_state_lock, whereas this code path first takes * zvol_state_lock, and then takes bd_disk->bd_mutex. */ mutex_exit(&zvol_state_lock); add_disk(zv->zv_disk); } else { mutex_exit(&zvol_state_lock); } return (SET_ERROR(error)); }
int zfs_register_callbacks(zfs_sb_t *zsb) { struct dsl_dataset *ds = NULL; objset_t *os = zsb->z_os; boolean_t do_readonly = B_FALSE; int error = 0; if (zfs_is_readonly(zsb) || !spa_writeable(dmu_objset_spa(os))) do_readonly = B_TRUE; /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zsb); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; if (do_readonly) readonly_changed_cb(zsb, B_TRUE); return (0); unregister: /* * We may attempt to unregister some callbacks that are not * registered, but this is OK; it will simply return ENOMSG, * which we will ignore. */ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zsb); (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zsb); return (error); }
/* * Read out the command history. */ int spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) { objset_t *mos = spa->spa_meta_objset; dmu_buf_t *dbp; uint64_t read_len, phys_read_off, phys_eof; uint64_t leftover = 0; spa_history_phys_t *shpp; int err; /* * If the command history doesn't exist (older pool), * that's ok, just return ENOENT. */ if (!spa->spa_history) return (ENOENT); /* * The history is logged asynchronously, so when they request * the first chunk of history, make sure everything has been * synced to disk so that we get it. */ if (*offp == 0 && spa_writeable(spa)) txg_wait_synced(spa_get_dsl(spa), 0); if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) return (err); shpp = dbp->db_data; #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(dbp, &doi); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); } #endif mutex_enter(&spa->spa_history_lock); phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); if (*offp < shpp->sh_pool_create_len) { /* read in just the zpool create history */ phys_read_off = *offp; read_len = MIN(*len, shpp->sh_pool_create_len - phys_read_off); } else { /* * Need to reset passed in offset to BOF if the passed in * offset has since been overwritten. */ *offp = MAX(*offp, shpp->sh_bof); phys_read_off = spa_history_log_to_phys(*offp, shpp); /* * Read up to the minimum of what the user passed down or * the EOF (physical or logical). If we hit physical EOF, * use 'leftover' to read from the physical BOF. */ if (phys_read_off <= phys_eof) { read_len = MIN(*len, phys_eof - phys_read_off); } else { read_len = MIN(*len, shpp->sh_phys_max_off - phys_read_off); if (phys_read_off + *len > shpp->sh_phys_max_off) { leftover = MIN(*len - read_len, phys_eof - shpp->sh_pool_create_len); } } } /* offset for consumer to use next */ *offp += read_len + leftover; /* tell the consumer how much you actually read */ *len = read_len + leftover; if (read_len == 0) { mutex_exit(&spa->spa_history_lock); dmu_buf_rele(dbp, FTAG); return (0); } err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, DMU_READ_PREFETCH); if (leftover && err == 0) { err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, leftover, buf + read_len, DMU_READ_PREFETCH); } mutex_exit(&spa->spa_history_lock); dmu_buf_rele(dbp, FTAG); return (err); }
int zfs_register_callbacks(zfs_sb_t *zsb) { struct dsl_dataset *ds = NULL; objset_t *os = zsb->z_os; int error = 0; if (zfs_is_readonly(zsb) || !spa_writeable(dmu_objset_spa(os))) readonly_changed_cb(zsb, B_TRUE); /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ ds = dmu_objset_ds(os); error = dsl_prop_register(ds, "atime", atime_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, "xattr", xattr_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, "recordsize", blksz_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, "readonly", readonly_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, "devices", devices_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, "setuid", setuid_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, "exec", exec_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, "snapdir", snapdir_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, "aclinherit", acl_inherit_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, "vscan", vscan_changed_cb, zsb); error = error ? error : dsl_prop_register(ds, "nbmand", nbmand_changed_cb, zsb); if (error) goto unregister; return (0); unregister: /* * We may attempt to unregister some callbacks that are not * registered, but this is OK; it will simply return ENOMSG, * which we will ignore. */ (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zsb); (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zsb); (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zsb); (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zsb); (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zsb); (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zsb); (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zsb); (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zsb); (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zsb); (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zsb); (void) dsl_prop_unregister(ds, "nbmand", nbmand_changed_cb, zsb); return (error); }