static void
dsl_dataset_user_release_onexit(void *arg)
{
	zfs_hold_cleanup_arg_t *ca = arg;
	spa_t *spa;
	int error;

	error = spa_open(ca->zhca_spaname, &spa, FTAG);
	if (error != 0) {
		zfs_dbgmsg("couldn't release holds on pool=%s "
		    "because pool is no longer loaded",
		    ca->zhca_spaname);
		return;
	}
	if (spa_load_guid(spa) != ca->zhca_spa_load_guid) {
		zfs_dbgmsg("couldn't release holds on pool=%s "
		    "because pool is no longer loaded (guid doesn't match)",
		    ca->zhca_spaname);
		spa_close(spa, FTAG);
		return;
	}

	(void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds);
	fnvlist_free(ca->zhca_holds);
	kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
	spa_close(spa, FTAG);
}
int
dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
{
	int err;
	dsl_scan_t *scn;
	spa_t *spa = dp->dp_spa;
	uint64_t f;

	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
	scn->scn_dp = dp;

	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    "scrub_func", sizeof (uint64_t), 1, &f);
	if (err == 0) {
		/*
		 * There was an old-style scrub in progress.  Restart a
		 * new-style scrub from the beginning.
		 */
		scn->scn_restart_txg = txg;
		zfs_dbgmsg("old-style scrub was in progress; "
		    "restarting new-style scrub in txg %llu",
		    scn->scn_restart_txg);

		/*
		 * Load the queue obj from the old location so that it
		 * can be freed by dsl_scan_done().
		 */
		(void) zap_lookup(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT,
		    "scrub_queue", sizeof (uint64_t), 1,
		    &scn->scn_phys.scn_queue_obj);
	} else {
		err = zap_lookup(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
		    &scn->scn_phys);
		if (err == ENOENT)
			return (0);
		else if (err)
			return (err);

		if (scn->scn_phys.scn_state == DSS_SCANNING &&
		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
			/*
			 * A new-type scrub was in progress on an old
			 * pool, and the pool was accessed by old
			 * software.  Restart from the beginning, since
			 * the old software may have changed the pool in
			 * the meantime.
			 */
			scn->scn_restart_txg = txg;
			zfs_dbgmsg("new-style scrub was modified "
			    "by old software; restarting in txg %llu",
			    scn->scn_restart_txg);
		}
	}

	spa_scan_stat_init(spa);
	return (0);
}
/*
 * Convert the logical range into a physical range and add it to our
 * avl tree.
 */
void
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
{
	vdev_t *vd = arg;
	range_seg_t logical_rs, physical_rs;

	logical_rs.rs_start = start;
	logical_rs.rs_end = start + size;

	ASSERT(vd->vdev_ops->vdev_op_leaf);
	vdev_xlate(vd, &logical_rs, &physical_rs);

	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_start == physical_rs.rs_start);
	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_end == physical_rs.rs_end);

	/* Only add segments that we have not visited yet */
	if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
		return;

	/* Pick up where we left off mid-range. */
	if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
		    "(%llu, %llu)", vd->vdev_path,
		    (u_longlong_t)physical_rs.rs_start,
		    (u_longlong_t)physical_rs.rs_end,
		    (u_longlong_t)vd->vdev_initialize_last_offset,
		    (u_longlong_t)physical_rs.rs_end);
		ASSERT3U(physical_rs.rs_end, >,
		    vd->vdev_initialize_last_offset);
		physical_rs.rs_start = vd->vdev_initialize_last_offset;
	}

	/*
	 * Add the (possibly clipped) physical range to the tree.  With
	 * raidz it is possible that the translated range is empty on
	 * this leaf vdev, in which case there is nothing to add.
	 */
	if (physical_rs.rs_end > physical_rs.rs_start) {
		range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
		    physical_rs.rs_end - physical_rs.rs_start);
	} else {
		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
	}
}
static uint64_t
vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz)
{
	ASSERT(vd->vdev_wholedisk);

	vdev_disk_t *dvd = vd->vdev_tsd;
	dk_efi_t dk_ioc;
	efi_gpt_t *efi;
	uint64_t avail_space = 0;
	int efisize = EFI_LABEL_SIZE * 2;

	dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP);
	dk_ioc.dki_lba = 1;
	dk_ioc.dki_length = efisize;
	dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data;
	efi = dk_ioc.dki_data;

	if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc,
	    FKIOCTL, kcred, NULL) == 0) {
		uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA);

		zfs_dbgmsg("vdev %s, capacity %llu, altern lba %llu",
		    vd->vdev_path, capacity, efi_altern_lba);
		if (capacity > efi_altern_lba)
			avail_space = (capacity - efi_altern_lba) * blksz;
	}
	kmem_free(dk_ioc.dki_data, efisize);
	return (avail_space);
}
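/*
 * Worked example of the space calculation above (illustrative numbers,
 * not taken from the code): with a 512-byte logical block size, a
 * capacity of 1,000,000 usable LBAs, and the backup GPT header currently
 * at LBA 976,000, the expandable space would be
 * (1000000 - 976000) * 512 = 12,288,000 bytes (~11.7 MiB).
 */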
/*
 * This sync task completes (finishes) a condense, deleting the old
 * mapping and replacing it with the new one.
 */
static void
spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	objset_t *mos = spa->spa_meta_objset;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
	uint64_t new_count =
	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}
	ASSERT(vic->vic_mapping_object != 0);
	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);

	/*
	 * Reset vdev_indirect_mapping to refer to the new object.
	 */
	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	vd->vdev_indirect_mapping = sci->sci_new_mapping;
	rw_exit(&vd->vdev_indirect_rwlock);

	sci->sci_new_mapping = NULL;
	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
	vic->vic_mapping_object = scip->scip_next_mapping_object;
	scip->scip_next_mapping_object = 0;

	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	scip->scip_prev_obsolete_sm_object = 0;

	scip->scip_vdev = 0;

	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, tx));
	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
	spa->spa_condensing_indirect = NULL;

	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
	    "new mapping object %llu has %llu entries "
	    "(was %llu entries)",
	    vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
	    new_count, old_count);

	vdev_config_dirty(spa->spa_root_vdev);
}
static void
spa_condense_indirect_generate_new_mapping(vdev_t *vd,
    uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t mapi = start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_num_entries =
	    vdev_indirect_mapping_num_entries(old_mapping);

	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);

	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
	    (u_longlong_t)vd->vdev_id,
	    (u_longlong_t)mapi);

	while (mapi < old_num_entries) {

		if (zthr_iscancelled(zthr)) {
			zfs_dbgmsg("pausing condense of vdev %llu "
			    "at index %llu", (u_longlong_t)vd->vdev_id,
			    (u_longlong_t)mapi);
			break;
		}

		vdev_indirect_mapping_entry_phys_t *entry =
		    &old_mapping->vim_entries[mapi];
		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
		if (obsolete_counts[mapi] < entry_size) {
			spa_condense_indirect_commit_entry(spa, entry,
			    obsolete_counts[mapi]);

			/*
			 * This delay may be requested for testing, debugging,
			 * or performance reasons.
			 */
			delay(zfs_condense_indirect_commit_entry_delay_ticks);
		}

		mapi++;
	}
}
/*
 * Sync task to begin the condensing process.
 */
void
spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;

	ASSERT0(scip->scip_next_mapping_object);
	ASSERT0(scip->scip_prev_obsolete_sm_object);
	ASSERT0(scip->scip_vdev);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
	ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));

	uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
	ASSERT(obsolete_sm_obj != 0);

	scip->scip_vdev = vd->vdev_id;
	scip->scip_next_mapping_object =
	    vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);

	scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;

	/*
	 * We don't need to allocate a new space map object, since
	 * vdev_indirect_sync_obsolete will allocate one when needed.
	 */
	space_map_close(vd->vdev_obsolete_sm);
	vd->vdev_obsolete_sm = NULL;
	VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));

	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (*scip) / sizeof (uint64_t), scip, tx));

	ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
	spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);

	zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
	    "posm=%llu nm=%llu",
	    vd->vdev_id, dmu_tx_get_txg(tx),
	    (u_longlong_t)scip->scip_prev_obsolete_sm_object,
	    (u_longlong_t)scip->scip_next_mapping_object);

	zthr_wakeup(spa->spa_condense_zthr);
}
static int
zcp_debug(lua_State *state)
{
	const char *dbgstring;
	zcp_run_info_t *ri = zcp_run_info(state);
	zcp_lib_info_t *libinfo = &zcp_debug_info;

	zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);

	dbgstring = lua_tostring(state, 1);

	zfs_dbgmsg("txg %lld ZCP: %s", ri->zri_tx->tx_txg, dbgstring);

	return (0);
}
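/*
 * For context (a hedged sketch, not part of the source above): zcp_debug()
 * backs the zfs.debug() function exposed to ZFS channel programs.  A Lua
 * channel program containing
 *
 *	zfs.debug("hello from a channel program")	-- Lua, not C
 *
 * run via `zfs program <pool> <script.lua>` would appear in the zfs_dbgmsg
 * log as "txg <txg> ZCP: hello from a channel program".
 */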
void
range_tree_stat_verify(range_tree_t *rt)
{
	range_seg_t *rs;
	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
	int i;

	for (rs = avl_first(&rt->rt_root); rs != NULL;
	    rs = AVL_NEXT(&rt->rt_root, rs)) {
		uint64_t size = rs->rs_end - rs->rs_start;
		int idx = highbit64(size) - 1;

		hist[idx]++;
		ASSERT3U(hist[idx], !=, 0);
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
		if (hist[i] != rt->rt_histogram[i]) {
			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
			    i, hist, hist[i], rt->rt_histogram[i]);
		}
		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
	}
}
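/*
 * Worked example of the bucket math above: a segment covering
 * [0x1000, 0x7000) has size 0x6000 (24 KiB).  highbit64(0x6000) is 15,
 * so the segment is counted in hist[14], the bucket for all sizes in
 * [2^14, 2^15), i.e. [16 KiB, 32 KiB).
 */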
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	spa_t *spa = vd->vdev_spa;
	vdev_disk_t *dvd;
	struct dk_minfo_ext dkmext;
	int error;
	dev_t dev;
	int otyp;
	boolean_t validate_devid = B_FALSE;
	ddi_devid_t devid;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if (vd->vdev_tsd != NULL) {
		ASSERT(vd->vdev_reopening);
		dvd = vd->vdev_tsd;
		goto skip_open;
	}

	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

	/*
	 * When opening a disk device, we want to preserve the user's original
	 * intent. We always want to open the device by the path the user gave
	 * us, even if it is one of multiple paths to the same device. But we
	 * also want to be able to survive disks being removed/recabled.
	 * Therefore the sequence of opening devices is:
	 *
	 * 1. Try opening the device by path. For legacy pools without the
	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
	 *
	 * 2. If the devid of the device matches the stored value, return
	 *    success.
	 *
	 * 3. Otherwise, the device may have moved. Try opening the device
	 *    by the devid instead.
	 */
	if (vd->vdev_devid != NULL) {
		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
		    &dvd->vd_minor) != 0) {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (SET_ERROR(EINVAL));
		}
	}

	error = EINVAL;		/* presume failure */

	if (vd->vdev_path != NULL) {

		if (vd->vdev_wholedisk == -1ULL) {
			size_t len = strlen(vd->vdev_path) + 3;
			char *buf = kmem_alloc(len, KM_SLEEP);
			ldi_handle_t lh;

			(void) snprintf(buf, len, "%ss0", vd->vdev_path);

			if (ldi_open_by_name(buf, spa_mode(spa), kcred,
			    &lh, zfs_li) == 0) {
				spa_strfree(vd->vdev_path);
				vd->vdev_path = buf;
				vd->vdev_wholedisk = 1ULL;
				(void) ldi_close(lh, spa_mode(spa), kcred);
			} else {
				kmem_free(buf, len);
			}
		}

		error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred,
		    &dvd->vd_lh, zfs_li);

		/*
		 * Compare the devid to the stored value.
		 */
		if (error == 0 && vd->vdev_devid != NULL &&
		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
				error = SET_ERROR(EINVAL);
				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
				    kcred);
				dvd->vd_lh = NULL;
			}
			ddi_devid_free(devid);
		}

		/*
		 * If we succeeded in opening the device, but 'vdev_wholedisk'
		 * is not yet set, then this must be a slice.
		 */
		if (error == 0 && vd->vdev_wholedisk == -1ULL)
			vd->vdev_wholedisk = 0;
	}

	/*
	 * If we were unable to open by path, or the devid check fails, open by
	 * devid instead.
	 */
	if (error != 0 && vd->vdev_devid != NULL) {
		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
	}

	/*
	 * If all else fails, then try opening by physical path (if available)
	 * or the logical path (if we failed due to the devid check). While not
	 * as reliable as the devid, this will give us something, and the
	 * higher level vdev validation will prevent us from opening the wrong
	 * device.
	 */
	if (error) {
		if (vd->vdev_devid != NULL)
			validate_devid = B_TRUE;

		if (vd->vdev_physpath != NULL &&
		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);

		/*
		 * Note that we don't support the legacy auto-wholedisk support
		 * as above. This hasn't been used in a very long time and we
		 * don't need to propagate its oddities to this edge condition.
		 */
		if (error && vd->vdev_path != NULL)
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
	}

	if (error) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}

	/*
	 * Now that the device has been successfully opened, update the devid
	 * if necessary.
	 */
	if (validate_devid && spa_writeable(spa) &&
	    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
		if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
			char *vd_devid;

			vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor);
			zfs_dbgmsg("vdev %s: update devid from %s, "
			    "to %s", vd->vdev_path, vd->vdev_devid, vd_devid);
			spa_strfree(vd->vdev_devid);
			vd->vdev_devid = spa_strdup(vd_devid);
			ddi_devid_str_free(vd_devid);
		}
		ddi_devid_free(devid);
	}

	/*
	 * Once a device is opened, verify that the physical device path (if
	 * available) is up to date.
	 */
	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
		char *physpath, *minorname;

		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		minorname = NULL;
		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
		    (vd->vdev_physpath == NULL ||
		    strcmp(vd->vdev_physpath, physpath) != 0)) {
			if (vd->vdev_physpath)
				spa_strfree(vd->vdev_physpath);
			(void) strlcat(physpath, ":", MAXPATHLEN);
			(void) strlcat(physpath, minorname, MAXPATHLEN);
			vd->vdev_physpath = spa_strdup(physpath);
		}
		if (minorname)
			kmem_free(minorname, strlen(minorname) + 1);
		kmem_free(physpath, MAXPATHLEN);
	}

skip_open:
	/*
	 * Determine the actual size of the device.
	 */
	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Determine the device's minimum transfer size.
	 * If the ioctl isn't supported, assume DEV_BSIZE.
	 */
	if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)&dkmext,
	    FKIOCTL, kcred, NULL) != 0)
		dkmext.dki_pbsize = DEV_BSIZE;

	*ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1;

	if (vd->vdev_wholedisk == 1) {
		uint64_t capacity = dkmext.dki_capacity - 1;
		uint64_t blksz = dkmext.dki_lbsize;
		int wce = 1;

		/*
		 * If we own the whole disk, try to enable disk write caching.
		 * We ignore errors because it's OK if we can't do it.
		 */
		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
		    FKIOCTL, kcred, NULL);

		*max_psize = *psize + vdev_disk_get_space(vd, capacity, blksz);
		zfs_dbgmsg("capacity change: vdev %s, psize %llu, "
		    "max_psize %llu", vd->vdev_path, *psize, *max_psize);
	} else {
		*max_psize = *psize;
	}

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	return (0);
}
/*ARGSUSED*/
static void
spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	history_arg_t *hap = arg2;
	const char *history_str = hap->ha_history_str;
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *dbp;
	spa_history_phys_t *shpp;
	size_t reclen;
	uint64_t le_len;
	nvlist_t *nvrecord;
	char *record_packed = NULL;
	int ret;

	/*
	 * If we have an older pool that doesn't have a command
	 * history object, create it now.
	 */
	mutex_enter(&spa->spa_history_lock);
	if (!spa->spa_history)
		spa_history_create_obj(spa, tx);
	mutex_exit(&spa->spa_history_lock);

	/*
	 * Get the offset of where we need to write via the bonus buffer.
	 * Update the offset when the write completes.
	 */
	VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
	shpp = dbp->db_data;

	dmu_buf_will_dirty(dbp, tx);

#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(dbp, &doi);
		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
	}
#endif

	VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME,
	    gethrestime_sec()) == 0);
	VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, hap->ha_uid) == 0);
	if (hap->ha_zone != NULL)
		VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE,
		    hap->ha_zone) == 0);
#ifdef _KERNEL
	VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_HOST,
	    utsname.nodename) == 0);
#endif
	if (hap->ha_log_type == LOG_CMD_POOL_CREATE ||
	    hap->ha_log_type == LOG_CMD_NORMAL) {
		VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD,
		    history_str) == 0);
		zfs_dbgmsg("command: %s", history_str);
	} else {
		VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT,
		    hap->ha_event) == 0);
		VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TXG,
		    tx->tx_txg) == 0);
		VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR,
		    history_str) == 0);
		zfs_dbgmsg("internal %s pool:%s txg:%llu %s",
		    zfs_history_event_names[hap->ha_event], spa_name(spa),
		    (longlong_t)tx->tx_txg, history_str);
	}

	VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0);
	record_packed = kmem_alloc(reclen, KM_SLEEP);

	VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen,
	    NV_ENCODE_XDR, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_history_lock);
	if (hap->ha_log_type == LOG_CMD_POOL_CREATE)
		VERIFY(shpp->sh_eof == shpp->sh_pool_create_len);

	/* write out the packed length as little endian */
	le_len = LE_64((uint64_t)reclen);
	ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
	if (!ret)
		ret = spa_history_write(spa, record_packed, reclen, shpp, tx);

	if (!ret && hap->ha_log_type == LOG_CMD_POOL_CREATE) {
		shpp->sh_pool_create_len += sizeof (le_len) + reclen;
		shpp->sh_bof = shpp->sh_pool_create_len;
	}

	mutex_exit(&spa->spa_history_lock);
	nvlist_free(nvrecord);
	kmem_free(record_packed, reclen);
	dmu_buf_rele(dbp, FTAG);

	strfree(hap->ha_history_str);
	if (hap->ha_zone != NULL)
		strfree(hap->ha_zone);
	kmem_free(hap, sizeof (history_arg_t));
}
void
dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
{
	/*
	 * There should not be anything wrong with having kstats for
	 * snapshots. Since we are not sure how useful they would be
	 * though nor how much their memory overhead would matter in
	 * a filesystem with many snapshots, we skip them for now.
	 */
	if (dmu_objset_is_snapshot(objset))
		return;

	/*
	 * At the time of this writing, KSTAT_STRLEN is 255 in Linux,
	 * and the spa_name can theoretically be up to 256 characters.
	 * In reality though the spa_name can be 240 characters max
	 * [see origin directory name check in pool_namecheck()]. Thus,
	 * the naming scheme for the module name below should not cause
	 * any truncations. In the event that a truncation does happen
	 * though, due to some future change, we silently skip creating
	 * the kstat and log the event.
	 */
	char kstat_module_name[KSTAT_STRLEN];
	int n = snprintf(kstat_module_name, sizeof (kstat_module_name),
	    "zfs/%s", spa_name(dmu_objset_spa(objset)));
	if (n < 0) {
		zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
		    " snprintf() for kstat module name returned %d",
		    (unsigned long long)dmu_objset_id(objset), n);
		return;
	} else if (n >= KSTAT_STRLEN) {
		zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
		    "kstat module name length (%d) exceeds limit (%d)",
		    (unsigned long long)dmu_objset_id(objset),
		    n, KSTAT_STRLEN);
		return;
	}

	char kstat_name[KSTAT_STRLEN];
	n = snprintf(kstat_name, sizeof (kstat_name), "objset-0x%llx",
	    (unsigned long long)dmu_objset_id(objset));
	if (n < 0) {
		zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
		    " snprintf() for kstat name returned %d",
		    (unsigned long long)dmu_objset_id(objset), n);
		return;
	}
	ASSERT3U(n, <, KSTAT_STRLEN);

	kstat_t *kstat = kstat_create(kstat_module_name, 0, kstat_name,
	    "dataset", KSTAT_TYPE_NAMED,
	    sizeof (empty_dataset_kstats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (kstat == NULL)
		return;

	dataset_kstat_values_t *dk_kstats =
	    kmem_alloc(sizeof (empty_dataset_kstats), KM_SLEEP);
	bcopy(&empty_dataset_kstats, dk_kstats,
	    sizeof (empty_dataset_kstats));

	char *ds_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dsl_dataset_name(objset->os_dsl_dataset, ds_name);
	KSTAT_NAMED_STR_PTR(&dk_kstats->dkv_ds_name) = ds_name;
	KSTAT_NAMED_STR_BUFLEN(&dk_kstats->dkv_ds_name) =
	    ZFS_MAX_DATASET_NAME_LEN;

	kstat->ks_data = dk_kstats;
	kstat->ks_update = dataset_kstats_update;
	kstat->ks_private = dk;

	kstat_install(kstat);
	dk->dk_kstats = kstat;

	aggsum_init(&dk->dk_aggsums.das_writes, 0);
	aggsum_init(&dk->dk_aggsums.das_nwritten, 0);
	aggsum_init(&dk->dk_aggsums.das_reads, 0);
	aggsum_init(&dk->dk_aggsums.das_nread, 0);
	aggsum_init(&dk->dk_aggsums.das_nunlinks, 0);
	aggsum_init(&dk->dk_aggsums.das_nunlinked, 0);
}
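/*
 * Example of the resulting naming scheme (illustrative values): for a pool
 * named "tank" and an objset with id 0x36, the kstat is created with module
 * "zfs/tank" and name "objset-0x36".  On Linux it would typically be
 * readable under /proc/spl/kstat/zfs/tank/objset-0x36.
 */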
/*ARGSUSED*/
static void
spa_history_log_sync(void *arg, dmu_tx_t *tx)
{
	nvlist_t *nvl = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *dbp;
	spa_history_phys_t *shpp;
	size_t reclen;
	uint64_t le_len;
	char *record_packed = NULL;
	int ret;

	/*
	 * If we have an older pool that doesn't have a command
	 * history object, create it now.
	 */
	mutex_enter(&spa->spa_history_lock);
	if (!spa->spa_history)
		spa_history_create_obj(spa, tx);
	mutex_exit(&spa->spa_history_lock);

	/*
	 * Get the offset of where we need to write via the bonus buffer.
	 * Update the offset when the write completes.
	 */
	VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
	shpp = dbp->db_data;

	dmu_buf_will_dirty(dbp, tx);

#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(dbp, &doi);
		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
	}
#endif

	fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec());
#ifdef _KERNEL
	fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename);
#endif
	if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
		zfs_dbgmsg("command: %s",
		    fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD));
	} else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) {
		if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) {
			zfs_dbgmsg("txg %lld %s %s (id %llu) %s",
			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME),
			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
		} else {
			zfs_dbgmsg("txg %lld %s %s",
			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
		}
	} else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) {
		zfs_dbgmsg("ioctl %s",
		    fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL));
	}

	record_packed = fnvlist_pack(nvl, &reclen);

	mutex_enter(&spa->spa_history_lock);

	/* write out the packed length as little endian */
	le_len = LE_64((uint64_t)reclen);
	ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
	if (!ret)
		ret = spa_history_write(spa, record_packed, reclen, shpp, tx);

	/* The first command is the create, which we keep forever */
	if (ret == 0 && shpp->sh_pool_create_len == 0 &&
	    nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
		shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof;
	}

	mutex_exit(&spa->spa_history_lock);
	fnvlist_pack_free(record_packed, reclen);
	dmu_buf_rele(dbp, FTAG);
	fnvlist_free(nvl);
}
boolean_t
vdev_indirect_should_condense(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;

	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));

	if (!zfs_condense_indirect_vdevs_enable)
		return (B_FALSE);

	/*
	 * We can only condense one indirect vdev at a time.
	 */
	if (spa->spa_condensing_indirect != NULL)
		return (B_FALSE);

	if (spa_shutting_down(spa))
		return (B_FALSE);

	/*
	 * The mapping object size must not change while we are
	 * condensing, so we can only condense indirect vdevs
	 * (not vdevs that are still in the middle of being removed).
	 */
	if (vd->vdev_ops != &vdev_indirect_ops)
		return (B_FALSE);

	/*
	 * If nothing new has been marked obsolete, there is no
	 * point in condensing.
	 */
	if (vd->vdev_obsolete_sm == NULL) {
		ASSERT0(vdev_obsolete_sm_object(vd));
		return (B_FALSE);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);

	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
	uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
	uint64_t mapping_size = vdev_indirect_mapping_size(vim);
	uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);

	ASSERT3U(bytes_obsolete, <=, bytes_mapped);

	/*
	 * If a high percentage of the bytes that are mapped have become
	 * obsolete, condense (unless the mapping is already small enough).
	 * This has a good chance of reducing the amount of memory used
	 * by the mapping.
	 */
	if (bytes_obsolete * 100 / bytes_mapped >=
	    zfs_indirect_condense_obsolete_pct &&
	    mapping_size > zfs_condense_min_mapping_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete "
		    "spacemap covers %d%% of %lluMB mapping",
		    (u_longlong_t)vd->vdev_id,
		    (int)(bytes_obsolete * 100 / bytes_mapped),
		    (u_longlong_t)bytes_mapped / 1024 / 1024);
		return (B_TRUE);
	}

	/*
	 * If the obsolete space map takes up too much space on disk,
	 * condense in order to free up this disk space.
	 */
	if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete sm "
		    "length %lluMB >= max size %lluMB",
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
		    (u_longlong_t)zfs_condense_max_obsolete_bytes
		    / 1024 / 1024);
		return (B_TRUE);
	}

	return (B_FALSE);
}
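/*
 * Worked example of the first threshold above (the tunable value here is
 * an illustrative assumption, not taken from this excerpt): with
 * zfs_indirect_condense_obsolete_pct = 25 and bytes_mapped = 400 MiB,
 * condensing is considered once bytes_obsolete reaches 100 MiB
 * (100 * 100 / 400 = 25%), provided mapping_size also exceeds
 * zfs_condense_min_mapping_bytes.
 */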
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	spa_t *spa = vd->vdev_spa;
	vdev_disk_t *dvd = vd->vdev_tsd;
	vnode_t *devvp = NULLVP;
	vfs_context_t context = NULL;
	uint64_t blkcnt;
	uint32_t blksize;
	int fmode = 0;
	int error = 0;
	int isssd;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if (dvd != NULL) {
		if (dvd->vd_offline) {
			/*
			 * If we are opening a device in its offline notify
			 * context, the LDI handle was just closed. Clean
			 * up the LDI event callbacks and free vd->vdev_tsd.
			 */
			vdev_disk_free(vd);
		} else {
			ASSERT(vd->vdev_reopening);
			devvp = dvd->vd_devvp;
			goto skip_open;
		}
	}

	/*
	 * Create vd->vdev_tsd.
	 */
	vdev_disk_alloc(vd);
	dvd = vd->vdev_tsd;

	/*
	 * When opening a disk device, we want to preserve the user's original
	 * intent. We always want to open the device by the path the user gave
	 * us, even if it is one of multiple paths to the same device. But we
	 * also want to be able to survive disks being removed/recabled.
	 * Therefore the sequence of opening devices is:
	 *
	 * 1. Try opening the device by path. For legacy pools without the
	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
	 *
	 * 2. If the devid of the device matches the stored value, return
	 *    success.
	 *
	 * 3. Otherwise, the device may have moved. Try opening the device
	 *    by the devid instead.
	 */
	/* ### APPLE TODO ### */
#ifdef illumos
	if (vd->vdev_devid != NULL) {
		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
		    &dvd->vd_minor) != 0) {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (SET_ERROR(EINVAL));
		}
	}
#endif

	error = EINVAL;		/* presume failure */

	if (vd->vdev_path != NULL) {

		context = vfs_context_create(spl_vfs_context_kernel());

		/* Obtain an opened/referenced vnode for the device. */
		if ((error = vnode_open(vd->vdev_path, spa_mode(spa), 0, 0,
		    &devvp, context))) {
			goto out;
		}
		if (!vnode_isblk(devvp)) {
			error = ENOTBLK;
			goto out;
		}

		/*
		 * ### APPLE TODO ###
		 * vnode_authorize devvp for KAUTH_VNODE_READ_DATA and
		 * KAUTH_VNODE_WRITE_DATA
		 */

		/*
		 * Disallow opening of a device that is currently in use.
		 * Flush out any old buffers remaining from a previous use.
		 */
		if ((error = vfs_mountedon(devvp))) {
			goto out;
		}
		if (VNOP_FSYNC(devvp, MNT_WAIT, context) != 0) {
			error = ENOTBLK;
			goto out;
		}
		if ((error = buf_invalidateblks(devvp, BUF_WRITE_DATA,
		    0, 0))) {
			goto out;
		}
	} else {
		goto out;
	}

	int len = MAXPATHLEN;
	if (vn_getpath(devvp, dvd->vd_readlinkname, &len) == 0) {
		dprintf("ZFS: '%s' resolved name is '%s'\n",
		    vd->vdev_path, dvd->vd_readlinkname);
	} else {
		dvd->vd_readlinkname[0] = 0;
	}

skip_open:
	/*
	 * Determine the actual size of the device.
	 */
	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0,
	    context) != 0 ||
	    VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0,
	    context) != 0) {
		error = EINVAL;
		goto out;
	}

	*psize = blkcnt * (uint64_t)blksize;
	*max_psize = *psize;

	dvd->vd_ashift = highbit(blksize) - 1;
	dprintf("vdev_disk: Device %p ashift set to %d\n", devvp,
	    dvd->vd_ashift);

	*ashift = highbit(MAX(blksize, SPA_MINBLOCKSIZE)) - 1;

	/*
	 * ### APPLE TODO ###
	 */
#ifdef illumos
	if (vd->vdev_wholedisk == 1) {
		int wce = 1;

		if (error == 0) {
			/*
			 * If we have the capability to expand, we'd have
			 * found out via success from DKIOCGMEDIAINFO{,EXT}.
			 * Adjust max_psize upward accordingly since we know
			 * we own the whole disk now.
			 */
			*max_psize += vdev_disk_get_space(vd, capacity, blksz);
			zfs_dbgmsg("capacity change: vdev %s, psize %llu, "
			    "max_psize %llu", vd->vdev_path, *psize,
			    *max_psize);
		}

		/*
		 * Since we own the whole disk, try to enable disk write
		 * caching. We ignore errors because it's OK if we can't do it.
		 */
		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
		    FKIOCTL, kcred, NULL);
	}
#endif

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	vd->vdev_nonrot = B_FALSE;
	if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd,
	    0, context) == 0) {
		if (isssd)
			vd->vdev_nonrot = B_TRUE;
	}
	dprintf("ZFS: vdev_disk(%s) isSSD %d\n", vd->vdev_path ?
	    vd->vdev_path : "", isssd);

	dvd->vd_devvp = devvp;
out:
	if (error) {
		if (devvp) {
			vnode_close(devvp, fmode, context);
			dvd->vd_devvp = NULL;
		}
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
	}
	if (context)
		(void) vfs_context_rele(context);

	if (error)
		printf("ZFS: vdev_disk_open('%s') failed error %d\n",
		    vd->vdev_path ? vd->vdev_path : "", error);
	return (error);
}
/*ARGSUSED*/
static void
spa_history_log_sync(void *arg, dmu_tx_t *tx)
{
	nvlist_t *nvl = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *dbp;
	spa_history_phys_t *shpp;
	size_t reclen;
	uint64_t le_len;
	char *record_packed = NULL;
	int ret;

	/*
	 * If we have an older pool that doesn't have a command
	 * history object, create it now.
	 */
	mutex_enter(&spa->spa_history_lock);
	if (!spa->spa_history)
		spa_history_create_obj(spa, tx);
	mutex_exit(&spa->spa_history_lock);

	/*
	 * Get the offset of where we need to write via the bonus buffer.
	 * Update the offset when the write completes.
	 */
	VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
	shpp = dbp->db_data;

	dmu_buf_will_dirty(dbp, tx);

#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(dbp, &doi);
		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
	}
#endif

	fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec());
	fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname()->nodename);

	if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
		zfs_dbgmsg("command: %s",
		    fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD));
	} else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) {
		if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) {
			zfs_dbgmsg("txg %lld %s %s (id %llu) %s",
			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME),
			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
		} else {
			zfs_dbgmsg("txg %lld %s %s",
			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
		}

		/*
		 * The history sysevent is posted only for internal history
		 * messages to show what has happened, not how it happened. For
		 * example, the following command:
		 *
		 * # zfs destroy -r tank/foo
		 *
		 * will result in one sysevent posted per dataset that is
		 * destroyed as a result of the command - which could be more
		 * than one event in total.  By contrast, if the sysevent was
		 * posted as a result of the ZPOOL_HIST_CMD key being present
		 * it would result in only one sysevent being posted with the
		 * full command line arguments, requiring the consumer to know
		 * how to parse and understand zfs(1M) command invocations.
		 */
		spa_history_log_notify(spa, nvl);
	} else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) {
		zfs_dbgmsg("ioctl %s",
		    fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL));
	}

	VERIFY3U(nvlist_pack(nvl, &record_packed, &reclen, NV_ENCODE_NATIVE,
	    KM_SLEEP), ==, 0);

	mutex_enter(&spa->spa_history_lock);

	/* write out the packed length as little endian */
	le_len = LE_64((uint64_t)reclen);
	ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
	if (!ret)
		ret = spa_history_write(spa, record_packed, reclen, shpp, tx);

	/* The first command is the create, which we keep forever */
	if (ret == 0 && shpp->sh_pool_create_len == 0 &&
	    nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
		shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof;
	}

	mutex_exit(&spa->spa_history_lock);
	fnvlist_pack_free(record_packed, reclen);
	dmu_buf_rele(dbp, FTAG);
	fnvlist_free(nvl);
}
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	spa_t *spa = vd->vdev_spa;
	vdev_disk_t *dvd;
	ldi_ev_cookie_t ecookie;
	vdev_disk_ldi_cb_t *lcb;
	union {
		struct dk_minfo_ext ude;
		struct dk_minfo ud;
	} dks;
	struct dk_minfo_ext *dkmext = &dks.ude;
	struct dk_minfo *dkm = &dks.ud;
	int error;
	dev_t dev;
	int otyp;
	boolean_t validate_devid = B_FALSE;
	ddi_devid_t devid;
	uint64_t capacity = 0, blksz = 0, pbsize;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	rw_enter(&vd->vdev_tsd_lock, RW_WRITER);
	dvd = vd->vdev_tsd;

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if (dvd != NULL) {
		ASSERT(vd->vdev_reopening);
		/*
		 * Here vd_lh is protected by vdev_tsd_lock
		 */
		ASSERT(dvd->vd_lh != NULL);

		/* This should not happen, but let's be safe */
		if (dvd->vd_lh == NULL) {
			/* What are we going to do here??? */
			rw_exit(&vd->vdev_tsd_lock);
			return (SET_ERROR(ENXIO));
		}
		goto skip_open;
	}

	/*
	 * Create dvd to be used as vd->vdev_tsd.
	 */
	vd->vdev_tsd = dvd = vdev_disk_alloc();

	/*
	 * When opening a disk device, we want to preserve the user's original
	 * intent. We always want to open the device by the path the user gave
	 * us, even if it is one of multiple paths to the same device. But we
	 * also want to be able to survive disks being removed/recabled.
	 * Therefore the sequence of opening devices is:
	 *
	 * 1. Try opening the device by path. For legacy pools without the
	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
	 *
	 * 2. If the devid of the device matches the stored value, return
	 *    success.
	 *
	 * 3. Otherwise, the device may have moved. Try opening the device
	 *    by the devid instead.
	 */
	if (vd->vdev_devid != NULL) {
		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
		    &dvd->vd_minor) != 0) {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			vdev_disk_free_locked(vd);
			rw_exit(&vd->vdev_tsd_lock);
			return (SET_ERROR(EINVAL));
		}
	}

	error = EINVAL;		/* presume failure */

	if (vd->vdev_path != NULL) {

		if (vd->vdev_wholedisk == -1ULL) {
			size_t len = strlen(vd->vdev_path) + 3;
			char *buf = kmem_alloc(len, KM_SLEEP);

			(void) snprintf(buf, len, "%ss0", vd->vdev_path);

			error = ldi_open_by_name(buf, spa_mode(spa), kcred,
			    &dvd->vd_lh, zfs_li);
			if (error == 0) {
				spa_strfree(vd->vdev_path);
				vd->vdev_path = buf;
				vd->vdev_wholedisk = 1ULL;
			} else {
				kmem_free(buf, len);
			}
		}

		/*
		 * If we have not yet opened the device, try to open it by the
		 * specified path.
		 */
		if (error != 0) {
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
		}

		/*
		 * Compare the devid to the stored value.
		 */
		if (error == 0 && vd->vdev_devid != NULL &&
		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
				error = SET_ERROR(EINVAL);
				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
				    kcred);
				dvd->vd_lh = NULL;
			}
			ddi_devid_free(devid);
		}

		/*
		 * If we succeeded in opening the device, but 'vdev_wholedisk'
		 * is not yet set, then this must be a slice.
		 */
		if (error == 0 && vd->vdev_wholedisk == -1ULL)
			vd->vdev_wholedisk = 0;
	}

	/*
	 * If we were unable to open by path, or the devid check fails, open by
	 * devid instead.
	 */
	if (error != 0 && vd->vdev_devid != NULL) {
		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
	}

	/*
	 * If all else fails, then try opening by physical path (if available)
	 * or the logical path (if we failed due to the devid check). While not
	 * as reliable as the devid, this will give us something, and the
	 * higher level vdev validation will prevent us from opening the wrong
	 * device.
	 */
	if (error) {
		if (vd->vdev_devid != NULL)
			validate_devid = B_TRUE;

		if (vd->vdev_physpath != NULL &&
		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV)
			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);

		/*
		 * Note that we don't support the legacy auto-wholedisk support
		 * as above. This hasn't been used in a very long time and we
		 * don't need to propagate its oddities to this edge condition.
		 */
		if (error && vd->vdev_path != NULL)
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
	}

	if (error) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		vdev_disk_free_locked(vd);
		rw_exit(&vd->vdev_tsd_lock);
		return (error);
	}

	/*
	 * Now that the device has been successfully opened, update the devid
	 * if necessary.
	 */
	if (validate_devid && spa_writeable(spa) &&
	    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
		if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
			char *vd_devid;

			vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor);
			zfs_dbgmsg("vdev %s: update devid from %s, "
			    "to %s", vd->vdev_path, vd->vdev_devid, vd_devid);
			spa_strfree(vd->vdev_devid);
			vd->vdev_devid = spa_strdup(vd_devid);
			ddi_devid_str_free(vd_devid);
		}
		ddi_devid_free(devid);
	}

	/*
	 * Once a device is opened, verify that the physical device path (if
	 * available) is up to date.
	 */
	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
		char *physpath, *minorname;

		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		minorname = NULL;
		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
		    (vd->vdev_physpath == NULL ||
		    strcmp(vd->vdev_physpath, physpath) != 0)) {
			if (vd->vdev_physpath)
				spa_strfree(vd->vdev_physpath);
			(void) strlcat(physpath, ":", MAXPATHLEN);
			(void) strlcat(physpath, minorname, MAXPATHLEN);
			vd->vdev_physpath = spa_strdup(physpath);
		}
		if (minorname)
			kmem_free(minorname, strlen(minorname) + 1);
		kmem_free(physpath, MAXPATHLEN);
	}

	/*
	 * Register callbacks for the LDI offline event.
	 */
	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
	    LDI_EV_SUCCESS) {
		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
		    &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
	}

	/*
	 * Register callbacks for the LDI degrade event.
	 */
	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
	    LDI_EV_SUCCESS) {
		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
		    &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
	}

	/* Reset TRIM flag, as underlying device support may have changed */
	vd->vdev_notrim = B_FALSE;

skip_open:
	ASSERT(dvd != NULL);

	/*
	 * Determine the actual size of the device.
	 */
	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		vdev_disk_free_locked(vd);
		rw_exit(&vd->vdev_tsd_lock);
		return (SET_ERROR(EINVAL));
	}

	*max_psize = *psize;

	/*
	 * Determine the device's minimum transfer size.
	 * If the ioctl isn't supported, assume DEV_BSIZE.
	 */
	if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
	    (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
		capacity = dkmext->dki_capacity - 1;
		blksz = dkmext->dki_lbsize;
		pbsize = dkmext->dki_pbsize;
	} else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
	    (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
		VDEV_DEBUG(
		    "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
		    vd->vdev_path);
		capacity = dkm->dki_capacity - 1;
		blksz = dkm->dki_lbsize;
		pbsize = blksz;
	} else {
		VDEV_DEBUG("vdev_disk_open(\"%s\"): "
		    "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
		    vd->vdev_path, error);
		pbsize = DEV_BSIZE;
	}

	*ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;

	if (vd->vdev_wholedisk == 1) {
		int wce = 1;

		if (error == 0) {
			/*
			 * If we have the capability to expand, we'd have
			 * found out via success from DKIOCGMEDIAINFO{,EXT}.
			 * Adjust max_psize upward accordingly since we know
			 * we own the whole disk now.
			 */
			*max_psize += vdev_disk_get_space(vd, capacity, blksz);
			zfs_dbgmsg("capacity change: vdev %s, psize %llu, "
			    "max_psize %llu", vd->vdev_path, *psize,
			    *max_psize);
		}

		/*
		 * Since we own the whole disk, try to enable disk write
		 * caching. We ignore errors because it's OK if we can't do it.
		 */
		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
		    FKIOCTL, kcred, NULL);
	}

	/*
	 * We are done with vd_lh and vdev_tsd, release the vdev_tsd_lock
	 */
	rw_exit(&vd->vdev_tsd_lock);

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	return (0);
}
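/*
 * Worked example of the ashift math above: for a disk reporting a physical
 * block size of 4096 bytes, highbit64(MAX(4096, SPA_MINBLOCKSIZE)) - 1 =
 * highbit64(4096) - 1 = 13 - 1 = 12, i.e. ashift 12 (4 KiB allocations).
 * A 512-byte-sector disk yields highbit64(512) - 1 = 9.
 */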