/*
 * Pass-through message handler for the write-side stream
 *
 * Requires Lock (( M: Mandatory, P: Prohibited, A: Allowed ))
 *  -. uinst_t->lock   : M [RW_READER or RW_WRITER]
 *  -. uinst_t->u_lock : P
 *  -. uinst_t->l_lock : P
 *  -. uinst_t->c_lock : P
 */
int
oplmsu_wcmn_through_hndl(queue_t *q, mblk_t *mp, int pri_flag, krw_t rw)
{
	queue_t	*usr_queue = NULL, *dst_queue = NULL;
	ctrl_t	*ctrl;

	ASSERT(RW_LOCK_HELD(&oplmsu_uinst->lock));

	mutex_enter(&oplmsu_uinst->c_lock);
	if ((ctrl = oplmsu_uinst->user_ctrl) != NULL) {
		usr_queue = ctrl->queue;
		mutex_exit(&oplmsu_uinst->c_lock);
	} else {
		mutex_exit(&oplmsu_uinst->c_lock);
		if (mp->b_datap->db_type == M_IOCTL) {
			rw_exit(&oplmsu_uinst->lock);
			oplmsu_iocack(q, mp, ENODEV);
			rw_enter(&oplmsu_uinst->lock, rw);
		} else {
			freemsg(mp);
		}
		return (SUCCESS);
	}

	if (oplmsu_uinst->lower_queue != NULL) {
		dst_queue = WR(oplmsu_uinst->lower_queue);
	} else {
		cmn_err(CE_WARN, "!oplmsu: through-lwq: "
		    "Active path doesn't exist");

		if (mp->b_datap->db_type == M_IOCTL) {
			rw_exit(&oplmsu_uinst->lock);
			oplmsu_iocack(q, mp, ENODEV);
			rw_enter(&oplmsu_uinst->lock, rw);
		} else {
			freemsg(mp);
		}
		return (SUCCESS);
	}

	if ((usr_queue == WR(q)) || (usr_queue == RD(q))) {
		if (pri_flag == MSU_HIGH) {
			putq(dst_queue, mp);
		} else {
			if (canput(dst_queue)) {
				putq(dst_queue, mp);
			} else {
				oplmsu_wcmn_norm_putbq(WR(q), mp, dst_queue);
				return (FAILURE);
			}
		}
	} else {
		cmn_err(CE_WARN, "oplmsu: through-lwq: "
		    "Inappropriate message for this node");

		if (mp->b_datap->db_type == M_IOCTL) {
			rw_exit(&oplmsu_uinst->lock);
			oplmsu_iocack(q, mp, ENODEV);
			rw_enter(&oplmsu_uinst->lock, rw);
		} else {
			freemsg(mp);
		}
	}
	return (SUCCESS);
}
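/*
 * A minimal user-space sketch of the convention used above, where the
 * caller passes the mode (the krw_t rw argument) in which it already
 * holds uinst_t->lock so the helper can drop the lock around a blocking
 * step (the oplmsu_iocack() call) and retake it in the same mode before
 * returning.  POSIX rwlocks stand in for the kernel lock; every name
 * below is illustrative and not part of the driver.
 */
#include <pthread.h>

typedef enum { LK_READER, LK_WRITER } lk_mode_t;

static pthread_rwlock_t inst_lock = PTHREAD_RWLOCK_INITIALIZER;

static void
lk_enter(lk_mode_t mode)
{
	if (mode == LK_WRITER)
		pthread_rwlock_wrlock(&inst_lock);
	else
		pthread_rwlock_rdlock(&inst_lock);
}

/* Called with inst_lock held in 'mode'; returns with it held the same way. */
void
helper_blocking_step(lk_mode_t mode)
{
	pthread_rwlock_unlock(&inst_lock);	/* cannot block while holding it */
	/* ... perform the blocking work (the iocack analogue) here ... */
	lk_enter(mode);				/* restore the caller's lock mode */
}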
void
dmu_objset_evict(objset_t *os)
{
	int t;
	dsl_dataset_t *ds = os->os_dsl_dataset;

	for (t = 0; t < TXG_SIZE; t++)
		ASSERT(!dmu_objset_is_dirty(os, t));

	if (ds) {
		if (!dsl_dataset_is_snapshot(ds)) {
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
			    checksum_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
			    compression_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_COPIES),
			    copies_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_DEDUP),
			    dedup_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
			    logbias_changed_cb, os));
			VERIFY0(dsl_prop_unregister(ds,
			    zfs_prop_to_name(ZFS_PROP_SYNC),
			    sync_changed_cb, os));
		}
		VERIFY0(dsl_prop_unregister(ds,
		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
		    primary_cache_changed_cb, os));
		VERIFY0(dsl_prop_unregister(ds,
		    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
		    secondary_cache_changed_cb, os));
	}

	if (os->os_sa)
		sa_tear_down(os);

	dmu_objset_evict_dbufs(os);
	dnode_special_close(&os->os_meta_dnode);

	if (DMU_USERUSED_DNODE(os)) {
		dnode_special_close(&os->os_userused_dnode);
		dnode_special_close(&os->os_groupused_dnode);
	}
	zil_free(os->os_zil);

	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);

	VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));

	/*
	 * This is a barrier to prevent the objset from going away in
	 * dnode_move() until we can safely ensure that the objset is still in
	 * use. We consider the objset valid before the barrier and invalid
	 * after the barrier.
	 */
	rw_enter(&os_lock, RW_READER);
	rw_exit(&os_lock);

	mutex_destroy(&os->os_lock);
	mutex_destroy(&os->os_obj_lock);
	mutex_destroy(&os->os_user_ptr_lock);
	kmem_free(os, sizeof (objset_t));
}
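/*
 * A minimal user-space sketch of the "empty lock/unlock as a barrier"
 * idiom at the end of dmu_objset_evict() above, with a POSIX rwlock in
 * place of os_lock and an atomic flag in place of the objset validity
 * check.  The object holding the flag is assumed to stay allocated;
 * only its payload is torn down.  All names are illustrative, not the
 * ZFS API.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

static pthread_rwlock_t move_lock = PTHREAD_RWLOCK_INITIALIZER;

struct object {
	atomic_int	 valid;		/* cleared before teardown */
	char		*payload;	/* freed by teardown */
};

/* User side: only touches the payload while holding move_lock. */
void
try_use(struct object *obj)
{
	pthread_rwlock_wrlock(&move_lock);
	if (atomic_load(&obj->valid))
		obj->payload[0] = 'x';	/* teardown cannot pass the barrier yet */
	pthread_rwlock_unlock(&move_lock);
}

/*
 * Teardown side: invalidate first, then do an empty enter/exit.  Any
 * user that observed valid != 0 is still inside the lock, so acquiring
 * and releasing it here waits that user out before the free().
 */
void
teardown(struct object *obj)
{
	atomic_store(&obj->valid, 0);
	pthread_rwlock_rdlock(&move_lock);	/* barrier: nothing done inside */
	pthread_rwlock_unlock(&move_lock);
	free(obj->payload);
	obj->payload = NULL;
}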
static void
devvt_cleandir(struct vnode *dvp, struct cred *cred)
{
	struct sdev_node *sdvp = VTOSDEV(dvp);
	struct sdev_node *dv, *next = NULL;
	int min, cnt;
	char found = 0;

	mutex_enter(&vc_lock);
	cnt = VC_INSTANCES_COUNT;
	mutex_exit(&vc_lock);

	/* We have to fool warlock this way, otherwise it will complain */
#ifndef __lock_lint
	if (rw_tryupgrade(&sdvp->sdev_contents) == 0) {
		rw_exit(&sdvp->sdev_contents);
		rw_enter(&sdvp->sdev_contents, RW_WRITER);
	}
#else
	rw_enter(&sdvp->sdev_contents, RW_WRITER);
#endif

	/* 1. prune invalid nodes and rebuild stale symlinks */
	devvt_prunedir(sdvp);

	/* 2. create missing nodes */
	for (min = 0; min < cnt; min++) {
		char nm[16];

		if (vt_minor_valid(min) == B_FALSE)
			continue;

		(void) snprintf(nm, sizeof (nm), "%d", min);
		found = 0;
		for (dv = SDEV_FIRST_ENTRY(sdvp); dv; dv = next) {
			next = SDEV_NEXT_ENTRY(sdvp, dv);

			/* validate only ready nodes */
			if (dv->sdev_state != SDEV_READY)
				continue;
			if (strcmp(nm, dv->sdev_name) == 0) {
				found = 1;
				break;
			}
		}
		if (!found) {
			devvt_create_snode(sdvp, nm, cred, SDEV_VATTR);
		}
	}

	/* 3. create active link node and console user link node */
	found = 0;
	for (dv = SDEV_FIRST_ENTRY(sdvp); dv; dv = next) {
		next = SDEV_NEXT_ENTRY(sdvp, dv);

		/* validate only ready nodes */
		if (dv->sdev_state != SDEV_READY)
			continue;
		if (strcmp(dv->sdev_name, DEVVT_ACTIVE_NAME) == 0)
			found |= 0x01;
		if (strcmp(dv->sdev_name, DEVVT_CONSUSER_NAME) == 0)
			found |= 0x02;
		if ((found & 0x01) && (found & 0x02))
			break;
	}
	if (!(found & 0x01))
		devvt_create_snode(sdvp, DEVVT_ACTIVE_NAME, cred, SDEV_VLINK);
	if (!(found & 0x02))
		devvt_create_snode(sdvp, DEVVT_CONSUSER_NAME, cred, SDEV_VLINK);

#ifndef __lock_lint
	rw_downgrade(&sdvp->sdev_contents);
#else
	rw_exit(&sdvp->sdev_contents);
#endif
}
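/*
 * A sketch of the upgrade-or-reacquire idiom above, using POSIX rwlocks
 * (which have no rw_tryupgrade(), so only the fallback path is shown:
 * drop the read lock, take the write lock, and let the caller detect
 * whether other writers slipped in).  devvt_cleandir() does not need
 * the generation check because it rescans the directory from scratch
 * once it holds the write lock; the names below are illustrative.
 */
#include <pthread.h>

static pthread_rwlock_t contents_lock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long contents_gen;	/* bumped by every writer */

/*
 * Called with contents_lock held as reader; returns holding it as
 * writer.  Returns 1 if no other writer ran while the lock was
 * dropped, 0 if the caller must revalidate what it read earlier.
 */
int
upgrade_to_writer(void)
{
	unsigned long gen = contents_gen;

	pthread_rwlock_unlock(&contents_lock);	/* drop the read lock */
	pthread_rwlock_wrlock(&contents_lock);	/* reacquire exclusively */
	contents_gen++;				/* we are now the writer */
	return (gen + 1 == contents_gen);
}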
void
memlist_read_lock(void)
{
	rw_enter(&memlists_lock, RW_READER);
}
/* ARGSUSED */ int mac_register(mac_register_t *mregp, mac_handle_t *mhp) { mac_impl_t *mip; mactype_t *mtype; int err = EINVAL; struct devnames *dnp = NULL; uint_t instance; boolean_t style1_created = B_FALSE; boolean_t style2_created = B_FALSE; char *driver; minor_t minor = 0; /* A successful call to mac_init_ops() sets the DN_GLDV3_DRIVER flag. */ if (!GLDV3_DRV(ddi_driver_major(mregp->m_dip))) return (EINVAL); /* Find the required MAC-Type plugin. */ if ((mtype = mactype_getplugin(mregp->m_type_ident)) == NULL) return (EINVAL); /* Create a mac_impl_t to represent this MAC. */ mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP); /* * The mac is not ready for open yet. */ mip->mi_state_flags |= MIS_DISABLED; /* * When a mac is registered, the m_instance field can be set to: * * 0: Get the mac's instance number from m_dip. * This is usually used for physical device dips. * * [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number. * For example, when an aggregation is created with the key option, * "key" will be used as the instance number. * * -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1]. * This is often used when a MAC of a virtual link is registered * (e.g., aggregation when "key" is not specified, or vnic). * * Note that the instance number is used to derive the mi_minor field * of mac_impl_t, which will then be used to derive the name of kstats * and the devfs nodes. The first 2 cases are needed to preserve * backward compatibility. */ switch (mregp->m_instance) { case 0: instance = ddi_get_instance(mregp->m_dip); break; case ((uint_t)-1): minor = mac_minor_hold(B_TRUE); if (minor == 0) { err = ENOSPC; goto fail; } instance = minor - 1; break; default: instance = mregp->m_instance; if (instance >= MAC_MAX_MINOR) { err = EINVAL; goto fail; } break; } mip->mi_minor = (minor_t)(instance + 1); mip->mi_dip = mregp->m_dip; mip->mi_clients_list = NULL; mip->mi_nclients = 0; /* Set the default IEEE Port VLAN Identifier */ mip->mi_pvid = 1; /* Default bridge link learning protection values */ mip->mi_llimit = 1000; mip->mi_ldecay = 200; driver = (char *)ddi_driver_name(mip->mi_dip); /* Construct the MAC name as <drvname><instance> */ (void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d", driver, instance); mip->mi_driver = mregp->m_driver; mip->mi_type = mtype; mip->mi_margin = mregp->m_margin; mip->mi_info.mi_media = mtype->mt_type; mip->mi_info.mi_nativemedia = mtype->mt_nativetype; if (mregp->m_max_sdu <= mregp->m_min_sdu) goto fail; if (mregp->m_multicast_sdu == 0) mregp->m_multicast_sdu = mregp->m_max_sdu; if (mregp->m_multicast_sdu < mregp->m_min_sdu || mregp->m_multicast_sdu > mregp->m_max_sdu) goto fail; mip->mi_sdu_min = mregp->m_min_sdu; mip->mi_sdu_max = mregp->m_max_sdu; mip->mi_sdu_multicast = mregp->m_multicast_sdu; mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length; /* * If the media supports a broadcast address, cache a pointer to it * in the mac_info_t so that upper layers can use it. */ mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr; mip->mi_v12n_level = mregp->m_v12n; /* * Copy the unicast source address into the mac_info_t, but only if * the MAC-Type defines a non-zero address length. We need to * handle MAC-Types that have an address length of 0 * (point-to-point protocol MACs for example). 
*/ if (mip->mi_type->mt_addr_length > 0) { if (mregp->m_src_addr == NULL) goto fail; mip->mi_info.mi_unicst_addr = kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP); bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length); /* * Copy the fixed 'factory' MAC address from the immutable * info. This is taken to be the MAC address currently in * use. */ bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr, mip->mi_type->mt_addr_length); /* * At this point, we should set up the classification * rules etc but we delay it till mac_open() so that * the resource discovery has taken place and we * know someone wants to use the device. Otherwise * memory gets allocated for Rx ring structures even * during probe. */ /* Copy the destination address if one is provided. */ if (mregp->m_dst_addr != NULL) { bcopy(mregp->m_dst_addr, mip->mi_dstaddr, mip->mi_type->mt_addr_length); mip->mi_dstaddr_set = B_TRUE; } } else if (mregp->m_src_addr != NULL) { goto fail; } /* * The format of the m_pdata is specific to the plugin. It is * passed in as an argument to all of the plugin callbacks. The * driver can update this information by calling * mac_pdata_update(). */ if (mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY) { /* * Verify if the supplied plugin data is valid. Note that * even if the caller passed in a NULL pointer as plugin data, * we still need to verify if that's valid as the plugin may * require plugin data to function. */ if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata, mregp->m_pdata_size)) { goto fail; } if (mregp->m_pdata != NULL) { mip->mi_pdata = kmem_alloc(mregp->m_pdata_size, KM_SLEEP); bcopy(mregp->m_pdata, mip->mi_pdata, mregp->m_pdata_size); mip->mi_pdata_size = mregp->m_pdata_size; } } else if (mregp->m_pdata != NULL) { /* * The caller supplied non-NULL plugin data, but the plugin * does not recognize plugin data. */ err = EINVAL; goto fail; } /* * Register the private properties. */ mac_register_priv_prop(mip, mregp->m_priv_props); /* * Stash the driver callbacks into the mac_impl_t, but first sanity * check to make sure all mandatory callbacks are set. */ if (mregp->m_callbacks->mc_getstat == NULL || mregp->m_callbacks->mc_start == NULL || mregp->m_callbacks->mc_stop == NULL || mregp->m_callbacks->mc_setpromisc == NULL || mregp->m_callbacks->mc_multicst == NULL) { goto fail; } mip->mi_callbacks = mregp->m_callbacks; if (mac_capab_get((mac_handle_t)mip, MAC_CAPAB_LEGACY, &mip->mi_capab_legacy)) { mip->mi_state_flags |= MIS_LEGACY; mip->mi_phy_dev = mip->mi_capab_legacy.ml_dev; } else { mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip), mip->mi_minor); } /* * Allocate a notification thread. thread_create blocks for memory * if needed, it never fails. */ mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread, mip, 0, &p0, TS_RUN, minclsyspri); /* * Initialize the capabilities */ bzero(&mip->mi_rx_rings_cap, sizeof (mac_capab_rings_t)); bzero(&mip->mi_tx_rings_cap, sizeof (mac_capab_rings_t)); if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, NULL)) mip->mi_state_flags |= MIS_IS_VNIC; if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL)) mip->mi_state_flags |= MIS_IS_AGGR; mac_addr_factory_init(mip); /* * Enforce the virtrualization level registered. */ if (mip->mi_v12n_level & MAC_VIRT_LEVEL1) { if (mac_init_rings(mip, MAC_RING_TYPE_RX) != 0 || mac_init_rings(mip, MAC_RING_TYPE_TX) != 0) goto fail; /* * The driver needs to register at least rx rings for this * virtualization level. 
*/ if (mip->mi_rx_groups == NULL) goto fail; } /* * The driver must set mc_unicst entry point to NULL when it advertises * CAP_RINGS for rx groups. */ if (mip->mi_rx_groups != NULL) { if (mregp->m_callbacks->mc_unicst != NULL) goto fail; } else { if (mregp->m_callbacks->mc_unicst == NULL) goto fail; } /* * Initialize MAC addresses. Must be called after mac_init_rings(). */ mac_init_macaddr(mip); mip->mi_share_capab.ms_snum = 0; if (mip->mi_v12n_level & MAC_VIRT_HIO) { (void) mac_capab_get((mac_handle_t)mip, MAC_CAPAB_SHARES, &mip->mi_share_capab); } /* * Initialize the kstats for this device. */ mac_driver_stat_create(mip); /* Zero out any properties. */ bzero(&mip->mi_resource_props, sizeof (mac_resource_props_t)); if (mip->mi_minor <= MAC_MAX_MINOR) { /* Create a style-2 DLPI device */ if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0, DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS) goto fail; style2_created = B_TRUE; /* Create a style-1 DLPI device */ if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR, mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS) goto fail; style1_created = B_TRUE; } mac_flow_l2tab_create(mip, &mip->mi_flow_tab); rw_enter(&i_mac_impl_lock, RW_WRITER); if (mod_hash_insert(i_mac_impl_hash, (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) { rw_exit(&i_mac_impl_lock); err = EEXIST; goto fail; } DTRACE_PROBE2(mac__register, struct devnames *, dnp, (mac_impl_t *), mip); /* * Mark the MAC to be ready for open. */ mip->mi_state_flags &= ~MIS_DISABLED; rw_exit(&i_mac_impl_lock); atomic_inc_32(&i_mac_impl_count); cmn_err(CE_NOTE, "!%s registered", mip->mi_name); *mhp = (mac_handle_t)mip; return (0); fail: if (style1_created) ddi_remove_minor_node(mip->mi_dip, mip->mi_name); if (style2_created) ddi_remove_minor_node(mip->mi_dip, driver); mac_addr_factory_fini(mip); /* Clean up registered MAC addresses */ mac_fini_macaddr(mip); /* Clean up registered rings */ mac_free_rings(mip, MAC_RING_TYPE_RX); mac_free_rings(mip, MAC_RING_TYPE_TX); /* Clean up notification thread */ if (mip->mi_notify_thread != NULL) i_mac_notify_exit(mip); if (mip->mi_info.mi_unicst_addr != NULL) { kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length); mip->mi_info.mi_unicst_addr = NULL; } mac_driver_stat_delete(mip); if (mip->mi_type != NULL) { atomic_dec_32(&mip->mi_type->mt_ref); mip->mi_type = NULL; } if (mip->mi_pdata != NULL) { kmem_free(mip->mi_pdata, mip->mi_pdata_size); mip->mi_pdata = NULL; mip->mi_pdata_size = 0; } if (minor != 0) { ASSERT(minor > MAC_MAX_MINOR); mac_minor_rele(minor); } mip->mi_state_flags = 0; mac_unregister_priv_prop(mip); /* * Clear the state before destroying the mac_impl_t */ mip->mi_state_flags = 0; kmem_cache_free(i_mac_impl_cachep, mip); return (err); }
/*ARGSUSED*/ static kmem_cbrc_t zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) { znode_t *ozp = buf, *nzp = newbuf; zfsvfs_t *zfsvfs; vnode_t *vp; /* * The znode is on the file system's list of known znodes if the vfs * pointer is valid. We set the low bit of the vfs pointer when freeing * the znode to invalidate it, and the memory patterns written by kmem * (baddcafe and deadbeef) set at least one of the two low bits. A newly * created znode sets the vfs pointer last of all to indicate that the * znode is known and in a valid state to be moved by this function. */ zfsvfs = ozp->z_zfsvfs; if (!POINTER_IS_VALID(zfsvfs)) { ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); return (KMEM_CBRC_DONT_KNOW); } /* * Close a small window in which it's possible that the filesystem could * be unmounted and freed, and zfsvfs, though valid in the previous * statement, could point to unrelated memory by the time we try to * prevent the filesystem from being unmounted. */ rw_enter(&zfsvfs_lock, RW_WRITER); if (zfsvfs != ozp->z_zfsvfs) { rw_exit(&zfsvfs_lock); ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1); return (KMEM_CBRC_DONT_KNOW); } /* * If the znode is still valid, then so is the file system. We know that * no valid file system can be freed while we hold zfsvfs_lock, so we * can safely ensure that the filesystem is not and will not be * unmounted. The next statement is equivalent to ZFS_ENTER(). */ rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); if (zfsvfs->z_unmounted) { ZFS_EXIT(zfsvfs); rw_exit(&zfsvfs_lock); ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); return (KMEM_CBRC_DONT_KNOW); } rw_exit(&zfsvfs_lock); mutex_enter(&zfsvfs->z_znodes_lock); /* * Recheck the vfs pointer in case the znode was removed just before * acquiring the lock. */ if (zfsvfs != ozp->z_zfsvfs) { mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2); return (KMEM_CBRC_DONT_KNOW); } /* * At this point we know that as long as we hold z_znodes_lock, the * znode cannot be freed and fields within the znode can be safely * accessed. Now, prevent a race with zfs_zget(). */ if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); return (KMEM_CBRC_LATER); } vp = ZTOV(ozp); if (mutex_tryenter(&vp->v_lock) == 0) { ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); return (KMEM_CBRC_LATER); } /* Only move znodes that are referenced _only_ by the DNLC. */ if (vp->v_count != 1 || !vn_in_dnlc(vp)) { mutex_exit(&vp->v_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); return (KMEM_CBRC_LATER); } /* * The znode is known and in a valid state to move. We're holding the * locks needed to execute the critical section. */ zfs_znode_move_impl(ozp, nzp); mutex_exit(&vp->v_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); list_link_replace(&ozp->z_link_node, &nzp->z_link_node); mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); return (KMEM_CBRC_YES); }
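/*
 * A self-contained re-creation of the pointer-validity trick described
 * in the comment above: the back-pointer of a freed object is marked by
 * setting its low bit, and the kmem poison patterns (0xdeadbeef,
 * 0xbaddcafe) also have non-zero low bits, so a single alignment test
 * separates live from stale.  The real POINTER_IS_VALID() and
 * POINTER_INVALIDATE() macros live in the ZFS headers; the versions and
 * structures below are illustrative.
 */
#include <stdint.h>

#define	PTR_IS_VALID(p)		(!((uintptr_t)(p) & 0x3))
#define	PTR_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))

struct parent;				/* only referenced through a pointer */

struct child {
	struct parent *back;		/* meaningful only while child is live */
};

void
child_retire(struct child *c)
{
	/* mark the back-pointer before the child goes back to the cache */
	PTR_INVALIDATE(&c->back);
}

int
child_is_live(const struct child *c)
{
	/* real pointers are at least 4-byte aligned; a set low bit means stale */
	return (PTR_IS_VALID(c->back));
}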
/* * Lock a directory entry. A dirlock on <dzp, name> protects that name * in dzp's directory zap object. As long as you hold a dirlock, you can * assume two things: (1) dzp cannot be reaped, and (2) no other thread * can change the zap entry for (i.e. link or unlink) this name. * * Input arguments: * dzp - znode for directory * name - name of entry to lock * flag - ZNEW: if the entry already exists, fail with EEXIST. * ZEXISTS: if the entry does not exist, fail with ENOENT. * ZSHARED: allow concurrent access with other ZSHARED callers. * ZXATTR: we want dzp's xattr directory * ZCILOOK: On a mixed sensitivity file system, * this lookup should be case-insensitive. * ZCIEXACT: On a purely case-insensitive file system, * this lookup should be case-sensitive. * ZRENAMING: we are locking for renaming, force narrow locks * ZHAVELOCK: Don't grab the z_name_lock for this call. The * current thread already holds it. * * Output arguments: * zpp - pointer to the znode for the entry (NULL if there isn't one) * dlpp - pointer to the dirlock for this entry (NULL on error) * direntflags - (case-insensitive lookup only) * flags if multiple case-sensitive matches exist in directory * realpnp - (case-insensitive lookup only) * actual name matched within the directory * * Return value: 0 on success or errno on failure. * * NOTE: Always checks for, and rejects, '.' and '..'. * NOTE: For case-insensitive file systems we take wide locks (see below), * but return znode pointers to a single match. */ int zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, int flag, int *direntflags, pathname_t *realpnp) { zfs_sb_t *zsb = ZTOZSB(dzp); zfs_dirlock_t *dl; boolean_t update; boolean_t exact; uint64_t zoid; #ifdef HAVE_DNLC vnode_t *vp = NULL; #endif /* HAVE_DNLC */ int error = 0; int cmpflags; *zpp = NULL; *dlpp = NULL; /* * Verify that we are not trying to lock '.', '..', or '.zfs' */ if ((name[0] == '.' && (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) || (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)) return (SET_ERROR(EEXIST)); /* * Case sensitivity and normalization preferences are set when * the file system is created. These are stored in the * zsb->z_case and zsb->z_norm fields. These choices * affect what vnodes can be cached in the DNLC, how we * perform zap lookups, and the "width" of our dirlocks. * * A normal dirlock locks a single name. Note that with * normalization a name can be composed multiple ways, but * when normalized, these names all compare equal. A wide * dirlock locks multiple names. We need these when the file * system is supporting mixed-mode access. It is sometimes * necessary to lock all case permutations of file name at * once so that simultaneous case-insensitive/case-sensitive * behaves as rationally as possible. */ /* * Decide if exact matches should be requested when performing * a zap lookup on file systems supporting case-insensitive * access. */ exact = ((zsb->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) || ((zsb->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK)); /* * Only look in or update the DNLC if we are looking for the * name on a file system that does not require normalization * or case folding. We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name. * * Maybe can add TO-UPPERed version of name to dnlc in ci-only * case for performance improvement? 
*/ update = !zsb->z_norm || ((zsb->z_case == ZFS_CASE_MIXED) && !(zsb->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); /* * ZRENAMING indicates we are in a situation where we should * take narrow locks regardless of the file system's * preferences for normalizing and case folding. This will * prevent us deadlocking trying to grab the same wide lock * twice if the two names happen to be case-insensitive * matches. */ if (flag & ZRENAMING) cmpflags = 0; else cmpflags = zsb->z_norm; /* * Wait until there are no locks on this name. * * Don't grab the the lock if it is already held. However, cannot * have both ZSHARED and ZHAVELOCK together. */ ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); if (!(flag & ZHAVELOCK)) rw_enter(&dzp->z_name_lock, RW_READER); mutex_enter(&dzp->z_lock); for (;;) { if (dzp->z_unlinked) { mutex_exit(&dzp->z_lock); if (!(flag & ZHAVELOCK)) rw_exit(&dzp->z_name_lock); return (SET_ERROR(ENOENT)); } for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, U8_UNICODE_LATEST, &error) == 0) || error != 0) break; } if (error != 0) { mutex_exit(&dzp->z_lock); if (!(flag & ZHAVELOCK)) rw_exit(&dzp->z_name_lock); return (SET_ERROR(ENOENT)); } if (dl == NULL) { /* * Allocate a new dirlock and add it to the list. */ dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); dl->dl_name = name; dl->dl_sharecnt = 0; dl->dl_namelock = 0; dl->dl_namesize = 0; dl->dl_dzp = dzp; dl->dl_next = dzp->z_dirlocks; dzp->z_dirlocks = dl; break; } if ((flag & ZSHARED) && dl->dl_sharecnt != 0) break; cv_wait(&dl->dl_cv, &dzp->z_lock); } /* * If the z_name_lock was NOT held for this dirlock record it. */ if (flag & ZHAVELOCK) dl->dl_namelock = 1; if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { /* * We're the second shared reference to dl. Make a copy of * dl_name in case the first thread goes away before we do. * Note that we initialize the new name before storing its * pointer into dl_name, because the first thread may load * dl->dl_name at any time. He'll either see the old value, * which is his, or the new shared copy; either is OK. */ dl->dl_namesize = strlen(dl->dl_name) + 1; name = kmem_alloc(dl->dl_namesize, KM_SLEEP); bcopy(dl->dl_name, name, dl->dl_namesize); dl->dl_name = name; } mutex_exit(&dzp->z_lock); /* * We have a dirlock on the name. (Note that it is the dirlock, * not the dzp's z_lock, that protects the name in the zap object.) * See if there's an object by this name; if so, put a hold on it. */ if (flag & ZXATTR) { error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zsb), &zoid, sizeof (zoid)); if (error == 0) error = (zoid == 0 ? 
SET_ERROR(ENOENT) : 0); } else { #ifdef HAVE_DNLC if (update) vp = dnlc_lookup(ZTOI(dzp), name); if (vp == DNLC_NO_VNODE) { iput(vp); error = SET_ERROR(ENOENT); } else if (vp) { if (flag & ZNEW) { zfs_dirent_unlock(dl); iput(vp); return (SET_ERROR(EEXIST)); } *dlpp = dl; *zpp = VTOZ(vp); return (0); } else { error = zfs_match_find(zsb, dzp, name, exact, update, direntflags, realpnp, &zoid); } #else error = zfs_match_find(zsb, dzp, name, exact, update, direntflags, realpnp, &zoid); #endif /* HAVE_DNLC */ } if (error) { if (error != ENOENT || (flag & ZEXISTS)) { zfs_dirent_unlock(dl); return (error); } } else { if (flag & ZNEW) { zfs_dirent_unlock(dl); return (SET_ERROR(EEXIST)); } error = zfs_zget(zsb, zoid, zpp); if (error) { zfs_dirent_unlock(dl); return (error); } #ifdef HAVE_DNLC if (!(flag & ZXATTR) && update) dnlc_update(ZTOI(dzp), name, ZTOI(*zpp)); #endif /* HAVE_DNLC */ } *dlpp = dl; return (0); }
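/*
 * A distilled user-space sketch of the dirlock mechanism used by
 * zfs_dirent_lock() above: one mutex protects a list of names that are
 * currently locked, and a thread that finds its name on the list sleeps
 * until the holder drops it.  Simplified: no shared locks, no
 * normalization, and a single condition variable per directory where
 * the real code keeps one per dirlock.  Names and types are
 * illustrative, not the ZFS structures.
 */
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct namelock {
	char		*nl_name;
	struct namelock	*nl_next;
};

static pthread_mutex_t	dir_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t	dir_cv = PTHREAD_COND_INITIALIZER;
static struct namelock	*dir_locks;	/* names currently held */

static struct namelock *
namelock_find(const char *name)
{
	struct namelock *nl;

	for (nl = dir_locks; nl != NULL; nl = nl->nl_next)
		if (strcmp(nl->nl_name, name) == 0)
			return (nl);
	return (NULL);
}

struct namelock *
name_lock(const char *name)
{
	struct namelock *nl;

	pthread_mutex_lock(&dir_mutex);
	while (namelock_find(name) != NULL)	/* name busy: sleep, then rescan */
		pthread_cond_wait(&dir_cv, &dir_mutex);
	nl = malloc(sizeof (*nl));
	nl->nl_name = strdup(name);
	nl->nl_next = dir_locks;
	dir_locks = nl;
	pthread_mutex_unlock(&dir_mutex);
	return (nl);
}

void
name_unlock(struct namelock *nl)
{
	struct namelock **nlp;

	pthread_mutex_lock(&dir_mutex);
	for (nlp = &dir_locks; *nlp != nl; nlp = &(*nlp)->nl_next)
		;
	*nlp = nl->nl_next;
	pthread_cond_broadcast(&dir_cv);	/* wake waiters to rescan */
	pthread_mutex_unlock(&dir_mutex);
	free(nl->nl_name);
	free(nl);
}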
int ud_dirrename(struct ud_inode *sdp, struct ud_inode *sip, struct ud_inode *tdp, struct ud_inode *tip, char *namep, uint8_t *buf, struct slot *slotp, struct cred *cr) { int32_t error = 0, doingdirectory; struct file_id *fid; ud_printf("ud_dirrename\n"); ASSERT(sdp->i_udf != NULL); ASSERT(MUTEX_HELD(&sdp->i_udf->udf_rename_lck)); ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); ASSERT(buf); ASSERT(slotp->ep); fid = slotp->ep; /* * Short circuit rename of something to itself. */ if (sip->i_icb_lbano == tip->i_icb_lbano) { return (ESAME); /* special KLUDGE error code */ } /* * Everything is protected under the vfs_rename_lock so the ordering * of i_contents locks doesn't matter here. */ rw_enter(&sip->i_contents, RW_READER); rw_enter(&tip->i_contents, RW_READER); /* * Check that everything is on the same filesystem. */ if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) || (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) { error = EXDEV; /* XXX archaic */ goto out; } /* * Must have write permission to rewrite target entry. */ if ((error = ud_iaccess(tdp, IWRITE, cr)) != 0 || (error = ud_sticky_remove_access(tdp, tip, cr)) != 0) goto out; /* * Ensure source and target are compatible (both directories * or both not directories). If target is a directory it must * be empty and have no links to it; in addition it must not * be a mount point, and both the source and target must be * writable. */ doingdirectory = (sip->i_type == VDIR); if (tip->i_type == VDIR) { if (!doingdirectory) { error = EISDIR; goto out; } /* * vn_vfswlock will prevent mounts from using the directory * until we are done. */ if (vn_vfswlock(ITOV(tip))) { error = EBUSY; goto out; } if (vn_mountedvfs(ITOV(tip)) != NULL) { vn_vfsunlock(ITOV(tip)); error = EBUSY; goto out; } if (!ud_dirempty(tip, tdp->i_uniqid, cr) || tip->i_nlink > 2) { vn_vfsunlock(ITOV(tip)); error = EEXIST; /* SIGH should be ENOTEMPTY */ goto out; } } else if (doingdirectory) { error = ENOTDIR; goto out; } /* * Rewrite the inode pointer for target name entry * from the target inode (ip) to the source inode (sip). * This prevents the target entry from disappearing * during a crash. Mark the directory inode to reflect the changes. */ dnlc_remove(ITOV(tdp), namep); fid->fid_icb.lad_ext_prn = SWAP_16(sip->i_icb_prn); fid->fid_icb.lad_ext_loc = SWAP_32(sip->i_icb_block); dnlc_enter(ITOV(tdp), namep, ITOV(sip)); ud_make_tag(tdp->i_udf, &fid->fid_tag, UD_FILE_ID_DESC, SWAP_32(fid->fid_tag.tag_loc), FID_LEN(fid)); error = ud_write_fid(tdp, slotp, buf); if (error) { if (doingdirectory) { vn_vfsunlock(ITOV(tip)); } goto out; } /* * Upgrade to write lock on tip */ rw_exit(&tip->i_contents); rw_enter(&tip->i_contents, RW_WRITER); mutex_enter(&tdp->i_tlock); tdp->i_flag |= IUPD|ICHG; mutex_exit(&tdp->i_tlock); /* * Decrement the link count of the target inode. * Fix the ".." entry in sip to point to dp. * This is done after the new entry is on the disk. */ tip->i_nlink--; mutex_enter(&tip->i_tlock); tip->i_flag |= ICHG; mutex_exit(&tip->i_tlock); if (doingdirectory) { /* * The entry for tip no longer exists so I can unlock the * vfslock. */ vn_vfsunlock(ITOV(tip)); /* * Decrement target link count once more if it was a directory. */ if (tip->i_nlink != 0) { cmn_err(CE_WARN, "ud_direnter: target directory link count != 0"); rw_exit(&tip->i_contents); rw_exit(&sip->i_contents); return (EINVAL); } /* * Renaming a directory with the parent different * requires that ".." be rewritten. The window is * still there for ".." 
to be inconsistent, but this * is unavoidable, and a lot shorter than when it was * done in a user process. We decrement the link * count in the new parent as appropriate to reflect * the just-removed target. If the parent is the * same, this is appropriate since the original * directory is going away. If the new parent is * different, dirfixdotdot() will bump the link count * back. */ tdp->i_nlink--; mutex_enter(&tdp->i_tlock); tdp->i_flag |= ICHG; mutex_exit(&tdp->i_tlock); ITIMES_NOLOCK(tdp); if (sdp != tdp) { rw_exit(&tip->i_contents); rw_exit(&sip->i_contents); error = ud_dirfixdotdot(sip, sdp, tdp); return (error); } } out: rw_exit(&tip->i_contents); rw_exit(&sip->i_contents); return (error); }
/* * Fix the FID_PARENT entry of the child directory so that it points * to the new parent directory instead of the old one. Routine * assumes that dp is a directory and that all the inodes are on * the same file system. */ int ud_dirfixdotdot(struct ud_inode *dp, struct ud_inode *opdp, struct ud_inode *npdp) { int32_t err = 0; struct fbuf *fbp; struct file_id *fid; uint32_t loc, dummy, tbno; ud_printf("ud_dirfixdotdot\n"); ASSERT(opdp->i_type == VDIR); ASSERT(npdp->i_type == VDIR); ASSERT(RW_WRITE_HELD(&npdp->i_rwlock)); err = fbread(ITOV(dp), (offset_t)0, dp->i_udf->udf_lbsize, S_WRITE, &fbp); if (err || dp->i_nlink == 0 || dp->i_size < sizeof (struct file_id)) { goto bad; } if ((err = ud_ip_off2bno(dp, 0, &tbno)) != 0) { goto bad; } fid = (struct file_id *)fbp->fb_addr; if ((ud_verify_tag_and_desc(&fid->fid_tag, UD_FILE_ID_DESC, tbno, 1, dp->i_udf->udf_lbsize) != 0) || ((fid->fid_flags & (FID_DIR | FID_PARENT)) != (FID_DIR | FID_PARENT))) { err = ENOTDIR; goto bad; } loc = ud_xlate_to_daddr(dp->i_udf, SWAP_16(fid->fid_icb.lad_ext_prn), SWAP_32(fid->fid_icb.lad_ext_loc), 1, &dummy); ASSERT(dummy == 1); if (loc == npdp->i_icb_lbano) { goto bad; } /* * Increment the link count in the new parent inode and force it out. */ if (npdp->i_nlink == MAXLINK) { err = EMLINK; goto bad; } npdp->i_nlink++; mutex_enter(&npdp->i_tlock); npdp->i_flag |= ICHG; mutex_exit(&npdp->i_tlock); ud_iupdat(npdp, 1); /* * Rewrite the child FID_PARENT entry and force it out. */ dnlc_remove(ITOV(dp), ".."); fid->fid_icb.lad_ext_loc = SWAP_32(npdp->i_icb_block); fid->fid_icb.lad_ext_prn = SWAP_16(npdp->i_icb_prn); ud_make_tag(npdp->i_udf, &fid->fid_tag, UD_FILE_ID_DESC, tbno, FID_LEN(fid)); dnlc_enter(ITOV(dp), "..", ITOV(npdp)); err = ud_fbwrite(fbp, dp); fbp = NULL; if (err != 0) { goto bad; } /* * Decrement the link count of the old parent inode and force * it out. If opdp is NULL, then this is a new directory link; * it has no parent, so we need not do anything. */ if (opdp != NULL) { rw_enter(&opdp->i_contents, RW_WRITER); if (opdp->i_nlink != 0) { opdp->i_nlink--; mutex_enter(&opdp->i_tlock); opdp->i_flag |= ICHG; mutex_exit(&opdp->i_tlock); ud_iupdat(opdp, 1); } rw_exit(&opdp->i_contents); } return (0); bad: if (fbp) { fbrelse(fbp, S_OTHER); } return (err); }
int ud_dirlook(struct ud_inode *dip, char *namep, struct ud_inode **ipp, struct cred *cr, int32_t skipdnlc) { struct udf_vfs *udf_vfsp; int32_t error = 0, namelen, adhoc_search; u_offset_t offset, adhoc_offset, dirsize, end; struct vnode *dvp, *vp; struct fbuf *fbp; struct file_id *fid; uint8_t *fname, dummy[3]; int32_t id_len, doingchk; uint32_t old_loc; uint16_t old_prn; uint8_t *dname; uint8_t *buf = NULL; ud_printf("ud_dirlook\n"); udf_vfsp = dip->i_udf; restart: doingchk = 0; old_prn = 0xFFFF; old_loc = 0; dvp = ITOV(dip); /* * Check accessibility of directory. */ if (dip->i_type != VDIR) { return (ENOTDIR); } if (error = ud_iaccess(dip, IEXEC, cr)) { return (error); } /* * Null component name is synonym for directory being searched. */ if (*namep == '\0') { VN_HOLD(dvp); *ipp = dip; return (0); } namelen = strlen(namep); if ((namelen == 1) && (namep[0] == '.') && (namep[1] == '\0')) { /* Current directory */ VN_HOLD(dvp); *ipp = dip; dnlc_enter(dvp, namep, ITOV(*ipp)); return (0); } if ((!skipdnlc) && (vp = dnlc_lookup(dvp, namep))) { /* vp is already held from dnlc_lookup */ *ipp = VTOI(vp); return (0); } dname = kmem_zalloc(1024, KM_SLEEP); buf = kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP); /* * Read lock the inode we are searching. You will notice that we * didn't hold the read lock while searching the dnlc. This means * that the entry could now be in the dnlc. This doesn't cause any * problems because dnlc_enter won't add an entry if it is already * there. */ rw_enter(&dip->i_rwlock, RW_READER); /* * Take care to look at dip->i_diroff only once, as it * may be changing due to other threads/cpus. */ recheck: offset = dip->i_diroff; end = dirsize = dip->i_size; if (offset > dirsize) { offset = 0; } adhoc_offset = offset; adhoc_search = (offset == 0) ? 1 : 2; fbp = NULL; while (adhoc_search--) { while (offset < end) { error = ud_get_next_fid(dip, &fbp, offset, &fid, &fname, buf); if (error != 0) { break; } if ((fid->fid_flags & FID_DELETED) == 0) { if (fid->fid_flags & FID_PARENT) { id_len = 2; fname = dummy; dummy[0] = '.'; dummy[1] = '.'; dummy[2] = '\0'; } else { if ((error = ud_uncompress( fid->fid_idlen, &id_len, fname, dname)) != 0) { break; } fname = (uint8_t *)dname; fname[id_len] = '\0'; } if ((namelen == id_len) && (strncmp(namep, (caddr_t)fname, namelen) == 0)) { uint32_t loc; uint16_t prn; loc = SWAP_32(fid->fid_icb.lad_ext_loc); prn = SWAP_16(fid->fid_icb.lad_ext_prn); dip->i_diroff = offset + FID_LEN(fid); if (doingchk) { if ((loc == old_loc) && (prn == old_prn)) { goto checkok; } else { if (fbp != NULL) { fbrelse(fbp, S_READ); fbp = NULL; } VN_RELE(ITOV(*ipp)); rw_exit(&dip->i_rwlock); goto restart; } /* NOTREACHED */ } if (namelen == 2 && fname[0] == '.' 
&& fname[1] == '.') { struct timespec32 omtime; omtime = dip->i_mtime; rw_exit(&dip->i_rwlock); error = ud_iget(dip->i_vfs, prn, loc, ipp, NULL, cr); rw_enter(&dip->i_rwlock, RW_READER); if (error) { goto done; } if ((omtime.tv_sec != dip->i_mtime.tv_sec) || (omtime.tv_nsec != dip->i_mtime.tv_nsec)) { doingchk = 1; old_prn = prn; old_loc = loc; dip->i_diroff = 0; if (fbp != NULL) { fbrelse(fbp, S_READ); fbp = NULL; } goto recheck; } } else { error = ud_iget(dip->i_vfs, prn, loc, ipp, NULL, cr); } checkok: if (error == 0) { dnlc_enter(dvp, namep, ITOV(*ipp)); } goto done; } } offset += FID_LEN(fid); } if (fbp != NULL) { fbrelse(fbp, S_READ); fbp = NULL; } end = adhoc_offset; offset = 0; } error = ENOENT; done: kmem_free(buf, udf_vfsp->udf_lbsize); kmem_free(dname, 1024); if (fbp != NULL) { fbrelse(fbp, S_READ); } rw_exit(&dip->i_rwlock); return (error); }
/* ARGSUSED2 */ int ud_dirmakedirect(struct ud_inode *ip, struct ud_inode *dp, struct cred *cr) { int32_t err; uint32_t blkno, size, parent_len, tbno; struct fbuf *fbp; struct file_id *fid; struct icb_ext *iext; ud_printf("ud_dirmakedirect\n"); ASSERT(RW_WRITE_HELD(&ip->i_contents)); ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); parent_len = sizeof (struct file_id); if ((ip->i_desc_type != ICB_FLAG_ONE_AD) || (parent_len > ip->i_max_emb)) { ASSERT(ip->i_ext); /* * Allocate space for the directory we're creating. */ if ((err = ud_alloc_space(ip->i_vfs, ip->i_icb_prn, 0, 1, &blkno, &size, 0, 0)) != 0) { return (err); } /* * init with the size of * directory with just the * parent */ ip->i_size = sizeof (struct file_id); ip->i_flag |= IUPD|ICHG|IATTCHG; iext = ip->i_ext; iext->ib_prn = ip->i_icb_prn; iext->ib_block = blkno; iext->ib_count = ip->i_size; iext->ib_offset = 0; ip->i_ext_used = 1; } else { ip->i_size = sizeof (struct file_id); ip->i_flag |= IUPD|ICHG|IATTCHG; } ITIMES_NOLOCK(ip); /* * Update the dp link count and write out the change. * This reflects the ".." entry we'll soon write. */ if (dp->i_nlink == MAXLINK) { return (EMLINK); } dp->i_nlink++; dp->i_flag |= ICHG; ud_iupdat(dp, 1); /* * Initialize directory with ".." * Since the parent directory is locked, we don't have to * worry about anything changing when we drop the write * lock on (ip). */ rw_exit(&ip->i_contents); if ((err = fbread(ITOV(ip), (offset_t)0, ip->i_udf->udf_lbsize, S_WRITE, &fbp)) != 0) { rw_enter(&ip->i_contents, RW_WRITER); return (err); } bzero(fbp->fb_addr, ip->i_udf->udf_lbsize); fid = (struct file_id *)fbp->fb_addr; fid->fid_ver = SWAP_16(1); fid->fid_flags = FID_DIR | FID_PARENT; fid->fid_icb.lad_ext_len = SWAP_32(dp->i_udf->udf_lbsize); fid->fid_icb.lad_ext_loc = SWAP_32(dp->i_icb_block); fid->fid_icb.lad_ext_prn = SWAP_16(dp->i_icb_prn); /* * fid_idlen, fid_iulen and fid_spec are zero * due to bzero above */ if ((err = ud_ip_off2bno(ip, 0, &tbno)) == 0) { ud_make_tag(ip->i_udf, &fid->fid_tag, UD_FILE_ID_DESC, tbno, FID_LEN(fid)); } err = ud_fbwrite(fbp, ip); rw_enter(&ip->i_contents, RW_WRITER); return (err); }
int ud_dircheckpath(int32_t blkno, struct ud_inode *target, struct cred *cr) { int32_t err = 0; struct vfs *vfsp; struct udf_vfs *udf_vfsp; struct fbuf *fbp; struct file_id *fid; struct ud_inode *ip, *tip; uint16_t prn; uint32_t lbno, dummy, tbno; daddr_t parent_icb_loc; ud_printf("ud_dircheckpath\n"); udf_vfsp = target->i_udf; ip = target; ASSERT(udf_vfsp != NULL); ASSERT(MUTEX_HELD(&target->i_udf->udf_rename_lck)); ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); if (ip->i_icb_lbano == blkno) { err = EINVAL; goto out; } if (ip->i_icb_lbano == udf_vfsp->udf_root_blkno) { goto out; } /* * Search back through the directory tree, using the PARENT entries * Fail any attempt to move a directory into an ancestor directory. */ for (;;) { if ((err = fbread(ITOV(ip), 0, udf_vfsp->udf_lbsize, S_READ, &fbp)) != 0) { break; } if ((err = ud_ip_off2bno(ip, 0, &tbno)) != 0) { break; } fid = (struct file_id *)fbp->fb_addr; /* IS this a valid file_identifier */ if (ud_verify_tag_and_desc(&fid->fid_tag, UD_FILE_ID_DESC, tbno, 1, udf_vfsp->udf_lbsize) != 0) { break; } if ((fid->fid_flags & FID_DELETED) != 0) { break; } if ((fid->fid_flags & FID_PARENT) == 0) { /* * This cannot happen unless * something is grossly wrong * First entry has to be parent */ break; } prn = SWAP_16(fid->fid_icb.lad_ext_prn); lbno = SWAP_32(fid->fid_icb.lad_ext_loc); parent_icb_loc = ud_xlate_to_daddr(udf_vfsp, prn, lbno, 1, &dummy); ASSERT(dummy == 1); if (parent_icb_loc == blkno) { err = EINVAL; break; } vfsp = ip->i_vfs; udf_vfsp = ip->i_udf; if (parent_icb_loc == udf_vfsp->udf_root_blkno) { break; } if (fbp != NULL) { fbrelse(fbp, S_OTHER); fbp = NULL; } if (ip != target) { rw_exit(&ip->i_rwlock); VN_RELE(ITOV(ip)); } /* * Race to get the inode. */ if (err = ud_iget(vfsp, prn, lbno, &tip, NULL, cr)) { ip = NULL; break; } ip = tip; rw_enter(&ip->i_rwlock, RW_READER); } if (fbp) { fbrelse(fbp, S_OTHER); } out: if (ip) { if (ip != target) { rw_exit(&ip->i_rwlock); VN_RELE(ITOV(ip)); } } return (err); }
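/*
 * The loop above walks the on-disk ".." chain to reject a rename that
 * would move a directory underneath one of its own descendants.  The
 * same check, distilled to an in-memory parent-pointer walk (names are
 * illustrative):
 */
#include <errno.h>
#include <stddef.h>

struct node {
	struct node *parent;	/* NULL at the root */
};

int
check_not_own_descendant(const struct node *dir, const struct node *newparent)
{
	const struct node *n;

	if (dir == newparent)
		return (EINVAL);		/* moving a directory into itself */
	for (n = newparent; n != NULL; n = n->parent) {
		if (n == dir)
			return (EINVAL);	/* dir is an ancestor of newparent */
	}
	return (0);
}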
/* * do a simple estimate of the space needed to hold the statefile * taking compression into account, but be fairly conservative * so we have a better chance of completing; when dump fails, * the retry cost is fairly high. * * Do disk blocks allocation for the state file if no space has * been allocated yet. Since the state file will not be removed, * allocation should only be done once. */ static int cpr_statefile_ok(vnode_t *vp, int alloc_retry) { extern size_t cpr_bitmap_size; struct inode *ip = VTOI(vp); const int UCOMP_RATE = 20; /* comp. ratio*10 for user pages */ u_longlong_t size, isize, ksize, raw_data; char *str, *est_fmt; size_t space; int error; /* * number of pages short for swapping. */ STAT->cs_nosw_pages = k_anoninfo.ani_mem_resv; if (STAT->cs_nosw_pages < 0) STAT->cs_nosw_pages = 0; str = "cpr_statefile_ok:"; CPR_DEBUG(CPR_DEBUG9, "Phys swap: max=%lu resv=%lu\n", k_anoninfo.ani_max, k_anoninfo.ani_phys_resv); CPR_DEBUG(CPR_DEBUG9, "Mem swap: max=%ld resv=%lu\n", MAX(availrmem - swapfs_minfree, 0), k_anoninfo.ani_mem_resv); CPR_DEBUG(CPR_DEBUG9, "Total available swap: %ld\n", CURRENT_TOTAL_AVAILABLE_SWAP); /* * try increasing filesize by 15% */ if (alloc_retry) { /* * block device doesn't get any bigger */ if (vp->v_type == VBLK) { if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) prom_printf( "Retry statefile on special file\n"); return (ENOMEM); } else { rw_enter(&ip->i_contents, RW_READER); size = (ip->i_size * SIZE_RATE) / INTEGRAL; rw_exit(&ip->i_contents); } if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) prom_printf("Retry statefile size = %lld\n", size); } else { u_longlong_t cpd_size; pgcnt_t npages, nback; int ndvram; ndvram = 0; (void) callb_execute_class(CB_CL_CPR_FB, (int)(uintptr_t)&ndvram); if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) prom_printf("ndvram size = %d\n", ndvram); /* * estimate 1 cpd_t for every (CPR_MAXCONTIG / 2) pages */ npages = cpr_count_kpages(REGULAR_BITMAP, cpr_nobit); cpd_size = sizeof (cpd_t) * (npages / (CPR_MAXCONTIG / 2)); raw_data = cpd_size + cpr_bitmap_size; ksize = ndvram + mmu_ptob(npages); est_fmt = "%s estimated size with " "%scompression %lld, ksize %lld\n"; nback = mmu_ptob(STAT->cs_nosw_pages); if (CPR->c_flags & C_COMPRESSING) { size = ((ksize * COMPRESS_PERCENT) / INTEGRAL) + raw_data + ((nback * 10) / UCOMP_RATE); CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "", size, ksize); } else { size = ksize + raw_data + nback; CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "no ", size, ksize); } } /* * All this is much simpler for a block device */ if (vp->v_type == VBLK) { space = cpr_get_devsize(vp->v_rdev); if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) prom_printf("statefile dev size %lu\n", space); /* * Export the estimated filesize info, this value will be * compared before dumping out the statefile in the case of * no compression. */ STAT->cs_est_statefsz = size; if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) prom_printf("%s Estimated statefile size %llu, " "space %lu\n", str, size, space); if (size > space) { cpr_err(CE_CONT, "Statefile partition too small."); return (ENOMEM); } return (0); } else { if (CPR->c_alloc_cnt++ > C_MAX_ALLOC_RETRY) { cpr_err(CE_CONT, "Statefile allocation retry failed\n"); return (ENOMEM); } /* * Estimate space needed for the state file. 
* * State file size in bytes: * kernel size + non-cache pte seg + * bitmap size + cpr state file headers size * (round up to fs->fs_bsize) */ size = blkroundup(ip->i_fs, size); /* * Export the estimated filesize info, this value will be * compared before dumping out the statefile in the case of * no compression. */ STAT->cs_est_statefsz = size; error = cpr_grow_statefile(vp, size); if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) { rw_enter(&ip->i_contents, RW_READER); isize = ip->i_size; rw_exit(&ip->i_contents); prom_printf("%s Estimated statefile size %lld, " "i_size %lld\n", str, size, isize); } return (error); } }
/* * Restart queuing for high priority message of read stream * when flow control failed * * Requires Lock (( M: Mandatory, P: Prohibited, A: Allowed )) * -. uinst_t->lock : P * -. uinst_t->u_lock : P * -. uinst_t->l_lock : P * -. uinst_t->c_lock : P */ void oplmsu_rcmn_high_qenable(queue_t *q) { mblk_t *mp; struct iocblk *iocp = NULL; lpath_t *lpath; int rval; rw_enter(&oplmsu_uinst->lock, RW_READER); for (;;) { /* Handle high priority message */ mutex_enter(&oplmsu_uinst->l_lock); lpath = (lpath_t *)q->q_ptr; if ((mp = lpath->first_lpri_hi) == NULL) { mutex_exit(&oplmsu_uinst->l_lock); break; } if (mp->b_next == NULL) { lpath->first_lpri_hi = NULL; lpath->last_lpri_hi = NULL; } else { lpath->first_lpri_hi = mp->b_next; mp->b_next->b_prev = NULL; mp->b_next = NULL; } mp->b_prev = NULL; mutex_exit(&oplmsu_uinst->l_lock); rval = SUCCESS; switch (mp->b_datap->db_type) { case M_IOCACK : /* FALLTHRU */ case M_IOCNAK : iocp = (struct iocblk *)mp->b_rptr; switch (iocp->ioc_cmd) { case TCSETS : /* FALLTHRU */ case TCSETSW : /* FALLTHRU */ case TCSETSF : /* FALLTHRU */ case TIOCMSET : /* FALLTHRU */ case TIOCSPPS : /* FALLTHRU */ case TIOCSWINSZ : /* FALLTHRU */ case TIOCSSOFTCAR : rw_exit(&oplmsu_uinst->lock); rval = oplmsu_lrioctl_termios(q, mp); rw_enter(&oplmsu_uinst->lock, RW_WRITER); break; default : rval = oplmsu_rcmn_through_hndl( q, mp, MSU_HIGH); if (rval == FAILURE) { rw_exit(&oplmsu_uinst->lock); return; } } break; case M_ERROR : rw_exit(&oplmsu_uinst->lock); rval = oplmsu_lrmsg_error(q, mp); rw_enter(&oplmsu_uinst->lock, RW_WRITER); break; case M_FLUSH : oplmsu_rcmn_flush_hndl(q, mp); break; default : rval = oplmsu_rcmn_through_hndl(q, mp, MSU_HIGH); if (rval == FAILURE) { rw_exit(&oplmsu_uinst->lock); return; } } if (rval == FAILURE) { break; } } rw_exit(&oplmsu_uinst->lock); qenable(q); /* Enable lower read queue */ }
/*ARGSUSED*/ static int rds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) { rds_t *rds; int ret; if (is_system_labeled()) { /* * RDS socket is not supported on labeled systems */ return (ESOCKTNOSUPPORT); } /* Open the transport driver if IB HW is present */ rw_enter(&rds_transport_lock, RW_READER); if (rds_transport_handle == NULL) { rw_exit(&rds_transport_lock); ret = rds_open_transport_driver(); rw_enter(&rds_transport_lock, RW_READER); if (ret != 0) { /* Transport driver failed to load */ rw_exit(&rds_transport_lock); return (ret); } } rw_exit(&rds_transport_lock); if (sflag == MODOPEN) { return (EINVAL); } /* Reopen not supported */ if (q->q_ptr != NULL) { dprint(2, ("%s: Reopen is not supported: %p", LABEL, q->q_ptr)); return (0); } rds = rds_create(q, credp); if (rds == NULL) { dprint(2, ("%s: rds_create failed", LABEL)); return (0); } q->q_ptr = WR(q)->q_ptr = rds; rds->rds_state = TS_UNBND; rds->rds_family = AF_INET_OFFLOAD; q->q_hiwat = rds_recv_hiwat; q->q_lowat = rds_recv_lowat; qprocson(q); WR(q)->q_hiwat = rds_xmit_hiwat; WR(q)->q_lowat = rds_xmit_lowat; /* Set the Stream head watermarks */ (void) proto_set_rx_hiwat(q, NULL, rds_recv_hiwat); (void) proto_set_rx_lowat(q, NULL, rds_recv_lowat); return (0); }
int ud_direnter( struct ud_inode *tdp, char *namep, enum de_op op, struct ud_inode *sdp, struct ud_inode *sip, struct vattr *vap, struct ud_inode **ipp, struct cred *cr, caller_context_t *ctp) { struct udf_vfs *udf_vfsp; struct ud_inode *tip; struct slot slot; int32_t namlen, err; char *s; uint8_t *buf = NULL; ud_printf("ud_direnter\n"); udf_vfsp = tdp->i_udf; /* don't allow '/' characters in pathname component */ for (s = namep, namlen = 0; *s; s++, namlen++) { if (*s == '/') { return (EACCES); } } if (namlen == 0) { cmn_err(CE_WARN, "name length == 0 in ud_direnter"); return (EINVAL); } ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); /* * If name is "." or ".." then if this is a create look it up * and return EEXIST. Rename or link TO "." or ".." is forbidden. */ if (namep[0] == '.' && (namlen == 1 || (namlen == 2 && namep[1] == '.'))) { if (op == DE_RENAME) { return (EINVAL); /* *SIGH* should be ENOTEMPTY */ } if (ipp) { /* * ud_dirlook will acquire the i_rwlock */ rw_exit(&tdp->i_rwlock); if (err = ud_dirlook(tdp, namep, ipp, cr, 0)) { rw_enter(&tdp->i_rwlock, RW_WRITER); return (err); } rw_enter(&tdp->i_rwlock, RW_WRITER); } return (EEXIST); } tip = NULL; slot.status = NONE; slot.offset = 0; slot.size = 0; slot.fbp = NULL; slot.ep = NULL; slot.endoff = 0; /* * For link and rename lock the source entry and check the link count * to see if it has been removed while it was unlocked. If not, we * increment the link count and force the inode to disk to make sure * that it is there before any directory entry that points to it. */ if (op == DE_LINK || op == DE_RENAME) { rw_enter(&sip->i_contents, RW_WRITER); if (sip->i_nlink == 0) { rw_exit(&sip->i_contents); return (ENOENT); } if (sip->i_nlink == MAXLINK) { rw_exit(&sip->i_contents); return (EMLINK); } sip->i_nlink++; mutex_enter(&sip->i_tlock); sip->i_flag |= ICHG; mutex_exit(&sip->i_tlock); ud_iupdat(sip, 1); rw_exit(&sip->i_contents); } /* * If target directory has not been removed, then we can consider * allowing file to be created. */ if (tdp->i_nlink == 0) { err = ENOENT; goto out2; } /* * Check accessibility of directory. */ if (tdp->i_type != VDIR) { err = ENOTDIR; goto out2; } /* * Execute access is required to search the directory. */ if (err = ud_iaccess(tdp, IEXEC, cr)) { goto out2; } /* * If this is a rename of a directory and the parent is * different (".." must be changed), then the source * directory must not be in the directory hierarchy * above the target, as this would orphan everything * below the source directory. Also the user must have * write permission in the source so as to be able to * change "..". */ if (op == DE_RENAME) { if (sip == tdp) { err = EINVAL; goto out2; } rw_enter(&sip->i_contents, RW_READER); if ((sip->i_type == VDIR) && (sdp != tdp)) { uint32_t blkno; if ((err = ud_iaccess(sip, IWRITE, cr))) { rw_exit(&sip->i_contents); goto out2; } blkno = sip->i_icb_lbano; rw_exit(&sip->i_contents); if ((err = ud_dircheckpath(blkno, tdp, cr))) { goto out2; } } else { rw_exit(&sip->i_contents); } } /* * Search for the entry. Return VN_HELD tip if found. 
*/ buf = kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP); rw_enter(&tdp->i_contents, RW_WRITER); if (err = ud_dircheckforname(tdp, namep, namlen, &slot, &tip, buf, cr)) { goto out; } if (tip) { switch (op) { case DE_CREATE : case DE_MKDIR : if (ipp) { *ipp = tip; err = EEXIST; } else { VN_RELE(ITOV(tip)); } break; case DE_RENAME : err = ud_dirrename(sdp, sip, tdp, tip, namep, buf, &slot, cr); /* * We used to VN_RELE() here, but this * was moved down so that we could send * a vnevent after the locks were dropped. */ break; case DE_LINK : /* * Can't link to an existing file. */ VN_RELE(ITOV(tip)); err = EEXIST; break; } } else { /* * The entry does not exist. Check write permission in * directory to see if entry can be created. */ if (err = ud_iaccess(tdp, IWRITE, cr)) { goto out; } if ((op == DE_CREATE) || (op == DE_MKDIR)) { /* * Make new inode and directory entry as required. */ if (err = ud_dirmakeinode(tdp, &sip, vap, op, cr)) goto out; } if (err = ud_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp, cr)) { if ((op == DE_CREATE) || (op == DE_MKDIR)) { /* * Unmake the inode we just made. */ rw_enter(&sip->i_contents, RW_WRITER); if (sip->i_type == VDIR) { tdp->i_nlink--; } sip->i_nlink = 0; mutex_enter(&sip->i_tlock); sip->i_flag |= ICHG; mutex_exit(&sip->i_tlock); rw_exit(&sip->i_contents); VN_RELE(ITOV(sip)); sip = NULL; } } else if (ipp) { *ipp = sip; } else if ((op == DE_CREATE) || (op == DE_MKDIR)) { VN_RELE(ITOV(sip)); } } out: if (buf != NULL) { kmem_free(buf, udf_vfsp->udf_lbsize); } if (slot.fbp) { fbrelse(slot.fbp, S_OTHER); } rw_exit(&tdp->i_contents); if (op == DE_RENAME) { /* * If it's all good, send events after locks are dropped * but before vnodes are released. */ if (err == 0) { if (tip) { vnevent_rename_dest(ITOV(tip), ITOV(tdp), namep, ctp); } if (sdp != tdp) { vnevent_rename_dest_dir(ITOV(tdp), ctp); } } /* * The following VN_RELE() was moved from the * DE_RENAME case above */ if (tip) { VN_RELE(ITOV(tip)); } } out2: if (err && ((op == DE_LINK) || (op == DE_RENAME))) { /* * Undo bumped link count. */ rw_enter(&sip->i_contents, RW_WRITER); sip->i_nlink--; rw_exit(&sip->i_contents); mutex_enter(&sip->i_tlock); sip->i_flag |= ICHG; mutex_exit(&sip->i_tlock); } return (err); }
void dmu_objset_evict(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; int t; for (t = 0; t < TXG_SIZE; t++) ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { if (!dsl_dataset_is_snapshot(ds)) { VERIFY(0 == dsl_prop_unregister(ds, "checksum", checksum_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "compression", compression_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "copies", copies_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "dedup", dedup_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "logbias", logbias_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "sync", sync_changed_cb, os)); } VERIFY(0 == dsl_prop_unregister(ds, "primarycache", primary_cache_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", secondary_cache_changed_cb, os)); } if (os->os_sa) sa_tear_down(os); /* * We should need only a single pass over the dnode list, since * nothing can be added to the list at this point. */ (void) dmu_objset_evict_dbufs(os); dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { dnode_special_close(&os->os_userused_dnode); dnode_special_close(&os->os_groupused_dnode); } zil_free(os->os_zil); ASSERT3P(list_head(&os->os_dnodes), ==, NULL); VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); /* * This is a barrier to prevent the objset from going away in * dnode_move() until we can safely ensure that the objset is still in * use. We consider the objset valid before the barrier and invalid * after the barrier. */ rw_enter(&os_lock, RW_READER); rw_exit(&os_lock); mutex_destroy(&os->os_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); kmem_free(os, sizeof (objset_t)); }
/* * Locking i_contents in this * function seems to be really weird */ int ud_dirremove( struct ud_inode *dp, char *namep, struct ud_inode *oip, struct vnode *cdir, enum dr_op op, struct cred *cr, caller_context_t *ctp) { struct udf_vfs *udf_vfsp; int32_t namelen, err = 0; struct slot slot; struct ud_inode *ip; mode_t mode; struct file_id *fid; uint8_t *buf = NULL; uint32_t tbno; ud_printf("ud_dirremove\n"); ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); udf_vfsp = dp->i_udf; namelen = (int)strlen(namep); if (namelen == 0) { cmn_err(CE_WARN, "name length == 0 in ud_dirremove"); return (EINVAL); } /* * return err when removing . and .. */ if (namep[0] == '.') { if (namelen == 1) { return (EINVAL); } else if (namelen == 2 && namep[1] == '.') { return (EEXIST); /* SIGH should be ENOTEMPTY */ } } ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); /* * Check accessibility of directory. */ if (dp->i_type != VDIR) { return (ENOTDIR); } ip = NULL; slot.status = FOUND; /* don't need to look for empty slot */ slot.offset = 0; slot.size = 0; slot.fbp = NULL; slot.ep = NULL; slot.endoff = 0; /* * Execute access is required to search the directory. * Access for write is interpreted as allowing * deletion of files in the directory. */ if (err = ud_iaccess(dp, IEXEC|IWRITE, cr)) { return (err); } buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP); rw_enter(&dp->i_contents, RW_WRITER); if (err = ud_dircheckforname(dp, namep, namelen, &slot, &ip, buf, cr)) { goto out_novfs; } if (ip == NULL) { err = ENOENT; goto out_novfs; } if (oip && oip != ip) { err = ENOENT; goto out_novfs; } if ((mode = ip->i_type) == VDIR) { /* * vn_vfswlock() prevents races between mount and rmdir. */ if (vn_vfswlock(ITOV(ip))) { err = EBUSY; goto out_novfs; } if (vn_mountedvfs(ITOV(ip)) != NULL && op != DR_RENAME) { err = EBUSY; goto out; } /* * If we are removing a directory, get a lock on it. * If the directory is empty, it will stay empty until * we can remove it. */ rw_enter(&ip->i_rwlock, RW_READER); } /* We must be holding i_contents */ rw_enter(&ip->i_contents, RW_READER); if (err = ud_sticky_remove_access(dp, ip, cr)) { rw_exit(&ip->i_contents); if (mode == VDIR) { rw_exit(&ip->i_rwlock); } goto out; } if (op == DR_RMDIR) { /* * For rmdir(2), some special checks are required. * (a) Don't remove any alias of the parent (e.g. "."). * (b) Don't remove the current directory. * (c) Make sure the entry is (still) a directory. * (d) Make sure the directory is empty. */ if (dp == ip || ITOV(ip) == cdir) { err = EINVAL; } else if (ip->i_type != VDIR) { err = ENOTDIR; } else if ((ip->i_nlink != 1) || (!ud_dirempty(ip, dp->i_uniqid, cr))) { /* * Directories do not have an * entry for "." so only one link * will be there */ err = EEXIST; /* SIGH should be ENOTEMPTY */ } if (err) { rw_exit(&ip->i_contents); if (mode == VDIR) { rw_exit(&ip->i_rwlock); } goto out; } } else if (op == DR_REMOVE) { /* * unlink(2) requires a different check: allow only * privileged processes to unlink a directory. */ struct vnode *vp = ITOV(ip); if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, vp->v_vfsp)) { err = EPERM; rw_exit(&ip->i_contents); rw_exit(&ip->i_rwlock); goto out; } } rw_exit(&ip->i_contents); /* * Remove the cache'd entry, if any. */ dnlc_remove(ITOV(dp), namep); /* * We can collapse all the directory * entries that are deleted into one big entry * but the better way is to * defer it till next directory entry * creation. 
where we can do this * in a more efficient way */ fid = slot.ep; /* * If this is the last entry * just truncate the file instead * of marking it deleted */ if ((slot.offset + FID_LEN(fid)) == dp->i_size) { fbrelse(slot.fbp, S_OTHER); if ((err = ud_itrunc(dp, slot.offset, 0, cr)) != 0) { goto out; } } else { fid->fid_flags |= FID_DELETED; if ((err = ud_ip_off2bno(dp, slot.offset, &tbno)) != 0) { goto out; } ud_make_tag(dp->i_udf, &fid->fid_tag, UD_FILE_ID_DESC, tbno, FID_LEN(fid)); err = ud_write_fid(dp, &slot, buf); } slot.fbp = NULL; /* * If we were removing a directory, it is 'gone' now so we can * unlock it. */ if (mode == VDIR) { rw_exit(&ip->i_rwlock); } mutex_enter(&dp->i_tlock); dp->i_flag |= IUPD|ICHG; mutex_exit(&dp->i_tlock); mutex_enter(&ip->i_tlock); ip->i_flag |= ICHG; mutex_exit(&ip->i_tlock); if (err != 0) { goto out; } rw_enter(&ip->i_contents, RW_WRITER); /* * Now dispose of the inode. */ if (ip->i_nlink > 0) { if ((op == DR_RMDIR) && (ip->i_type == VDIR)) { /* * Decrement by 1 because there is no "." * Clear the inode, but there may be other hard * links so don't free the inode. * Decrement the dp linkcount because we're * trashing the ".." entry. */ ip->i_nlink --; dp->i_nlink--; dnlc_remove(ITOV(ip), "."); dnlc_remove(ITOV(ip), ".."); /* * (void) ud_itrunc(ip, 0, 0, cr); */ } else { ip->i_nlink--; } } ITIMES_NOLOCK(dp); ITIMES_NOLOCK(ip); rw_exit(&ip->i_contents); out: if (mode == VDIR) { vn_vfsunlock(ITOV(ip)); } out_novfs: ASSERT(RW_WRITE_HELD(&dp->i_contents)); if (slot.fbp != NULL) { fbrelse(slot.fbp, S_OTHER); } rw_exit(&dp->i_contents); if (ip) { /* * If no errors, send any events after locks are dropped, * but before the VN_RELE(). */ if (err == 0) { if (op == DR_REMOVE) { vnevent_remove(ITOV(ip), ITOV(dp), namep, ctp); } else if (op == DR_RMDIR) { vnevent_rmdir(ITOV(ip), ITOV(dp), namep, ctp); } } VN_RELE(ITOV(ip)); } kmem_free(buf, udf_vfsp->udf_lbsize); return (err); }
/* * Notify registered targets except 'self' about register value change */ static void s1394_cmp_notify_reg_change(s1394_hal_t *hal, t1394_cmp_reg_t reg, s1394_target_t *self) { s1394_target_t *target; s1394_fa_target_t *fat; uint_t saved_gen; int num_retries = 0; void (*cb)(opaque_t, t1394_cmp_reg_t); opaque_t arg; TNF_PROBE_0_DEBUG(s1394_cmp_notify_reg_change_enter, S1394_TNF_SL_CMP_STACK, ""); rw_enter(&hal->target_list_rwlock, RW_READER); start: target = hal->hal_fa[S1394_FA_TYPE_CMP].fal_head; for (; target; target = fat->fat_next) { fat = &target->target_fa[S1394_FA_TYPE_CMP]; /* * even if the target list changes when the lock is dropped, * comparing with self is safe because the target should * not unregister until all CMP operations are completed */ if (target == self) { continue; } cb = fat->fat_u.cmp.cm_evts.cmp_reg_change; if (cb == NULL) { continue; } arg = fat->fat_u.cmp.cm_evts.cmp_arg; saved_gen = s1394_fa_list_gen(hal, S1394_FA_TYPE_CMP); rw_exit(&hal->target_list_rwlock); cb(arg, reg); rw_enter(&hal->target_list_rwlock, RW_READER); /* * List could change while we dropped the lock. In such * case, start all over again, because missing a register * change can have more serious consequences for a * target than receiving same notification more than once */ if (saved_gen != s1394_fa_list_gen(hal, S1394_FA_TYPE_CMP)) { TNF_PROBE_2(s1394_cmp_notify_reg_change_error, S1394_TNF_SL_CMP_ERROR, "", tnf_string, msg, "list gen changed", tnf_opaque, num_retries, num_retries); if (++num_retries <= s1394_cmp_notify_retry_cnt) { goto start; } else { break; } } } rw_exit(&hal->target_list_rwlock); TNF_PROBE_0_DEBUG(s1394_cmp_notify_reg_change_exit, S1394_TNF_SL_CMP_STACK, ""); }
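/*
 * Minimal sketch (hypothetical list and generation counter, not part of the
 * s1394 code) of the generation-count idiom used above: the reader lock is
 * dropped around an arbitrary callback, and a generation number that is
 * bumped on every list change tells us whether the walk must be restarted.
 * The real code additionally caps the number of restarts. The lock is
 * assumed to have been initialised with rw_init(), and the generation
 * counter to be bumped under the writer lock wherever the list changes.
 */
typedef struct fake_node {
    struct fake_node *fn_next;
    void (*fn_cb)(void *);
    void *fn_arg;
} fake_node_t;

static krwlock_t   fake_list_lock;
static fake_node_t *fake_list_head;
static uint_t      fake_list_gen;

static void
fake_notify_all(void)
{
    fake_node_t *np;
    uint_t saved_gen;

    rw_enter(&fake_list_lock, RW_READER);
restart:
    for (np = fake_list_head; np != NULL; np = np->fn_next) {
        if (np->fn_cb == NULL)
            continue;
        saved_gen = fake_list_gen;
        rw_exit(&fake_list_lock);   /* never call out with the lock held */
        np->fn_cb(np->fn_arg);
        rw_enter(&fake_list_lock, RW_READER);
        if (saved_gen != fake_list_gen)
            goto restart;           /* list changed while unlocked */
    }
    rw_exit(&fake_list_lock);
}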
/* * sync out AVL trees to persistent storage. */ void zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) { nvlist_t *nvp; nvlist_t **fuids; size_t nvsize = 0; char *packed; dmu_buf_t *db; fuid_domain_t *domnode; int numnodes; int i; if (!zfsvfs->z_fuid_dirty) { return; } rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); /* * First see if table needs to be created? */ if (zfsvfs->z_fuid_obj == 0) { zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, sizeof (uint64_t), tx); VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, sizeof (uint64_t), 1, &zfsvfs->z_fuid_obj, tx) == 0); } VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); numnodes = avl_numnodes(&zfsvfs->z_fuid_idx); fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP); for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++, domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) { VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, domnode->f_idx) == 0); VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0); VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN, domnode->f_ksid->kd_name) == 0); } VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, fuids, numnodes) == 0); for (i = 0; i != numnodes; i++) nvlist_free(fuids[i]); kmem_free(fuids, numnodes * sizeof (void *)); VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); packed = kmem_alloc(nvsize, KM_SLEEP); VERIFY(nvlist_pack(nvp, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); nvlist_free(nvp); zfsvfs->z_fuid_size = nvsize; dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, zfsvfs->z_fuid_size, packed, tx); kmem_free(packed, zfsvfs->z_fuid_size); VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; dmu_buf_rele(db, FTAG); zfsvfs->z_fuid_dirty = B_FALSE; rw_exit(&zfsvfs->z_fuid_lock); }
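/*
 * Side sketch of the serialisation step above: build an nvlist, size it,
 * XDR-pack it into a flat buffer, and hand those bytes to dmu_write(). The
 * helper below is hypothetical and only packs a single domain entry; the
 * nvlist and kmem interfaces are the standard ones, and FUID_IDX/FUID_DOMAIN
 * are the same keys used by zfs_fuid_sync().
 */
static char *
fake_pack_domain(uint64_t idx, const char *domain, size_t *sizep)
{
    nvlist_t *nvp;
    char *packed;
    size_t nvsize = 0;

    VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
    VERIFY(nvlist_add_uint64(nvp, FUID_IDX, idx) == 0);
    VERIFY(nvlist_add_string(nvp, FUID_DOMAIN, domain) == 0);

    VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
    packed = kmem_alloc(nvsize, KM_SLEEP);
    VERIFY(nvlist_pack(nvp, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0);
    nvlist_free(nvp);

    *sizep = nvsize;    /* caller kmem_free()s 'packed' with this size */
    return (packed);
}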
void memsegs_lock(int writer) { rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER); }
/* * Find all 'allow' permissions from a given point and then continue * traversing up to the root. * * This function constructs an nvlist of nvlists. * each setpoint is an nvlist composed of an nvlist of an nvlist * of the individual * users/groups/everyone/create * permissions. * * The nvlist will look like this. * * { source fsname -> { whokeys { permissions,...}, ...}} * * The fsname nvpairs will be arranged in a bottom up order. For example, * if we have the following structure a/b/c then the nvpairs for the fsnames * will be ordered a/b/c, a/b, a. */ int dsl_deleg_get(const char *ddname, nvlist_t **nvp) { dsl_dir_t *dd, *startdd; dsl_pool_t *dp; int error; objset_t *mos; error = dsl_dir_open(ddname, FTAG, &startdd, NULL); if (error) return (error); dp = startdd->dd_pool; mos = dp->dp_meta_objset; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); rw_enter(&dp->dp_config_rwlock, RW_READER); for (dd = startdd; dd != NULL; dd = dd->dd_parent) { zap_cursor_t basezc; zap_attribute_t baseza; nvlist_t *sp_nvp; uint64_t n; char source[MAXNAMELEN]; if (dd->dd_phys->dd_deleg_zapobj && (zap_count(mos, dd->dd_phys->dd_deleg_zapobj, &n) == 0) && n) { VERIFY(nvlist_alloc(&sp_nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); } else { continue; } for (zap_cursor_init(&basezc, mos, dd->dd_phys->dd_deleg_zapobj); zap_cursor_retrieve(&basezc, &baseza) == 0; zap_cursor_advance(&basezc)) { zap_cursor_t zc; zap_attribute_t za; nvlist_t *perms_nvp; ASSERT(baseza.za_integer_length == 8); ASSERT(baseza.za_num_integers == 1); VERIFY(nvlist_alloc(&perms_nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (zap_cursor_init(&zc, mos, baseza.za_first_integer); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { VERIFY(nvlist_add_boolean(perms_nvp, za.za_name) == 0); } zap_cursor_fini(&zc); VERIFY(nvlist_add_nvlist(sp_nvp, baseza.za_name, perms_nvp) == 0); nvlist_free(perms_nvp); } zap_cursor_fini(&basezc); dsl_dir_name(dd, source); VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0); nvlist_free(sp_nvp); } rw_exit(&dp->dp_config_rwlock); dsl_dir_close(startdd, FTAG); return (0); }
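/*
 * Sketch of the basic ZAP cursor walk that dsl_deleg_get() performs twice
 * above (once over the setpoint ZAP, once over each per-who permission
 * ZAP). 'mos' and 'zapobj' are assumed to be a held meta-objset and a valid
 * ZAP object; the helper itself is illustrative and not part of dsl_deleg.c.
 */
static void
fake_walk_zap_names(objset_t *mos, uint64_t zapobj)
{
    zap_cursor_t zc;
    zap_attribute_t za;

    for (zap_cursor_init(&zc, mos, zapobj);
        zap_cursor_retrieve(&zc, &za) == 0;
        zap_cursor_advance(&zc)) {
        /* za.za_name is the entry name; the value describes the perms */
        cmn_err(CE_CONT, "zap entry: %s\n", za.za_name);
    }
    zap_cursor_fini(&zc);
}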
void memlist_write_lock(void) { rw_enter(&memlists_lock, RW_WRITER); }
/* * Check if user has requested permission. */ int dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) { dsl_dataset_t *ds; dsl_dir_t *dd; dsl_pool_t *dp; void *cookie; int error; char checkflag; objset_t *mos; avl_tree_t permsets; perm_set_t *setnode; error = dsl_dataset_hold(dsname, FTAG, &ds); if (error) return (error); dp = ds->ds_dir->dd_pool; mos = dp->dp_meta_objset; if (dsl_delegation_on(mos) == B_FALSE) { dsl_dataset_rele(ds, FTAG); return (ECANCELED); } if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) < SPA_VERSION_DELEGATED_PERMS) { dsl_dataset_rele(ds, FTAG); return (EPERM); } if (dsl_dataset_is_snapshot(ds)) { /* * Snapshots are treated as descendents only, * local permissions do not apply. */ checkflag = ZFS_DELEG_DESCENDENT; } else { checkflag = ZFS_DELEG_LOCAL; } avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), offsetof(perm_set_t, p_node)); rw_enter(&dp->dp_config_rwlock, RW_READER); for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, checkflag = ZFS_DELEG_DESCENDENT) { uint64_t zapobj; boolean_t expanded; /* * If not in global zone then make sure * the zoned property is set */ if (!INGLOBALZONE(curproc)) { uint64_t zoned; if (dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_ZONED), 8, 1, &zoned, NULL, B_FALSE) != 0) break; if (!zoned) break; } zapobj = dd->dd_phys->dd_deleg_zapobj; if (zapobj == 0) continue; dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr); again: expanded = B_FALSE; for (setnode = avl_first(&permsets); setnode; setnode = AVL_NEXT(&permsets, setnode)) { if (setnode->p_matched == B_TRUE) continue; /* See if this set directly grants this permission */ error = dsl_check_access(mos, zapobj, ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm); if (error == 0) goto success; if (error == EPERM) setnode->p_matched = B_TRUE; /* See if this set includes other sets */ error = dsl_load_sets(mos, zapobj, ZFS_DELEG_NAMED_SET_SETS, 0, setnode->p_setname, &permsets); if (error == 0) setnode->p_matched = expanded = B_TRUE; } /* * If we expanded any sets, that will define more sets, * which we need to check. */ if (expanded) goto again; error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr); if (error == 0) goto success; } error = EPERM; success: rw_exit(&dp->dp_config_rwlock); dsl_dataset_rele(ds, FTAG); cookie = NULL; while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) kmem_free(setnode, sizeof (perm_set_t)); return (error); }
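/*
 * Stand-alone restatement (hypothetical types and check callback) of the
 * local-versus-descendent walk above: the starting dataset's directory is
 * checked with ZFS_DELEG_LOCAL and every ancestor after that with
 * ZFS_DELEG_DESCENDENT, because the for-loop's increment clause rewrites
 * 'checkflag' after the first pass.
 */
typedef struct fake_dir {
    struct fake_dir *fd_parent;
} fake_dir_t;

static int
fake_check_perm_up(fake_dir_t *start, int (*check)(fake_dir_t *, char))
{
    fake_dir_t *dd;
    char checkflag = 'L';   /* stands in for ZFS_DELEG_LOCAL */

    for (dd = start; dd != NULL;
        dd = dd->fd_parent, checkflag = 'D' /* ZFS_DELEG_DESCENDENT */) {
        if (check(dd, checkflag) == 0)
            return (0);     /* permission granted at this level */
    }
    return (EPERM);         /* nothing on the path granted it */
}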
/* * This is the upward reentry point for packets arriving from the bridging * module and from mac_rx for links not part of a bridge. */ void mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) { mac_impl_t *mip = (mac_impl_t *)mh; mac_ring_t *mr = (mac_ring_t *)mrh; mac_soft_ring_set_t *mac_srs; mblk_t *bp = mp_chain; boolean_t hw_classified = B_FALSE; /* * If there are any promiscuous mode callbacks defined for * this MAC, pass them a copy if appropriate. */ if (mip->mi_promisc_list != NULL) mac_promisc_dispatch(mip, mp_chain, NULL); if (mr != NULL) { /* * If the SRS teardown has started, just return. The 'mr' * continues to be valid until the driver unregisters the mac. * Hardware classified packets will not make their way up * beyond this point once the teardown has started. The driver * is never passed a pointer to a flow entry or SRS or any * structure that can be freed much before mac_unregister. */ mutex_enter(&mr->mr_lock); if ((mr->mr_state != MR_INUSE) || (mr->mr_flag & (MR_INCIPIENT | MR_CONDEMNED | MR_QUIESCE))) { mutex_exit(&mr->mr_lock); freemsgchain(mp_chain); return; } if (mr->mr_classify_type == MAC_HW_CLASSIFIER) { hw_classified = B_TRUE; MR_REFHOLD_LOCKED(mr); } mutex_exit(&mr->mr_lock); /* * We check if an SRS is controlling this ring. * If so, we can directly call the srs_lower_proc * routine otherwise we need to go through mac_rx_classify * to reach the right place. */ if (hw_classified) { mac_srs = mr->mr_srs; /* * This is supposed to be the fast path. * All packets received though here were steered by * the hardware classifier, and share the same * MAC header info. */ mac_srs->srs_rx.sr_lower_proc(mh, (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE); MR_REFRELE(mr); return; } /* We'll fall through to software classification */ } else { flow_entry_t *flent; int err; rw_enter(&mip->mi_rw_lock, RW_READER); if (mip->mi_single_active_client != NULL) { flent = mip->mi_single_active_client->mci_flent_list; FLOW_TRY_REFHOLD(flent, err); rw_exit(&mip->mi_rw_lock); if (err == 0) { (flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp_chain, B_FALSE); FLOW_REFRELE(flent); return; } } else { rw_exit(&mip->mi_rw_lock); } } if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) { if ((bp = mac_rx_flow(mh, mrh, bp)) == NULL) return; } freemsgchain(bp); }
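/*
 * Sketch (hypothetical ring type) of the check-then-refhold step the
 * hardware-classified path uses above: the ring's state and flags are only
 * examined under mr_lock, and a reference is taken before the lock is
 * dropped so the SRS pointer stays usable after mutex_exit().
 */
typedef struct fake_ring {
    kmutex_t fr_lock;
    uint_t   fr_state;      /* FR_INUSE once the ring is started */
    uint_t   fr_refcnt;
} fake_ring_t;

#define FR_INUSE 1

static boolean_t
fake_ring_hold(fake_ring_t *frp)
{
    mutex_enter(&frp->fr_lock);
    if (frp->fr_state != FR_INUSE) {
        mutex_exit(&frp->fr_lock);
        return (B_FALSE);   /* teardown started: caller frees the chain */
    }
    frp->fr_refcnt++;       /* plays the role of MR_REFHOLD_LOCKED() */
    mutex_exit(&frp->fr_lock);
    return (B_TRUE);        /* caller drops the ref when done (MR_REFRELE) */
}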
static int zpl_xattr_set(struct inode *ip, const char *name, const void *value, size_t size, int flags) { znode_t *zp = ITOZ(ip); zfs_sb_t *zsb = ZTOZSB(zp); cred_t *cr = CRED(); fstrans_cookie_t cookie; int where; int error; crhold(cr); cookie = spl_fstrans_mark(); rrm_enter_read(&(zsb)->z_teardown_lock, FTAG); rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER); /* * Before setting the xattr check to see if it already exists. * This is done to ensure the following optional flags are honored. * * XATTR_CREATE: fail if xattr already exists * XATTR_REPLACE: fail if xattr does not exist * * We also want to know if it resides in sa or dir, so we can make * sure we don't end up with duplicate in both places. */ error = __zpl_xattr_where(ip, name, &where, cr); if (error < 0) { if (error != -ENODATA) goto out; if (flags & XATTR_REPLACE) goto out; /* The xattr to be removed already doesn't exist */ error = 0; if (value == NULL) goto out; } else { error = -EEXIST; if (flags & XATTR_CREATE) goto out; } /* Preferentially store the xattr as a SA for better performance */ if (zsb->z_use_sa && zp->z_is_sa && (zsb->z_xattr_sa || (value == NULL && where & XATTR_IN_SA))) { error = zpl_xattr_set_sa(ip, name, value, size, flags, cr); if (error == 0) { /* * Successfully put into SA, we need to clear the one * in dir. */ if (where & XATTR_IN_DIR) zpl_xattr_set_dir(ip, name, NULL, 0, 0, cr); goto out; } } error = zpl_xattr_set_dir(ip, name, value, size, flags, cr); /* * Successfully put into dir, we need to clear the one in SA. */ if (error == 0 && (where & XATTR_IN_SA)) zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr); out: rw_exit(&ITOZ(ip)->z_xattr_lock); rrm_exit(&(zsb)->z_teardown_lock, FTAG); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); }
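/*
 * Stand-alone restatement of the XATTR_CREATE/XATTR_REPLACE handling above,
 * so the four cases are easy to see. 'exists' is whether the lookup
 * (__zpl_xattr_where() succeeding) found the xattr; the helper itself is
 * illustrative and follows the same negative-errno convention as the
 * surrounding code.
 */
static int
fake_xattr_flag_check(boolean_t exists, int flags, const void *value)
{
    if (!exists) {
        if (flags & XATTR_REPLACE)
            return (-ENODATA);  /* replace requires an existing xattr */
        if (value == NULL)
            return (0);         /* removing a missing xattr is a no-op */
    } else {
        if (flags & XATTR_CREATE)
            return (-EEXIST);   /* create requires the xattr be absent */
    }
    return (1);                 /* positive: proceed with the set */
}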
/* * Teardown the zfsvfs::z_os. * * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); if (!unmounting) { /* * We purge the parent filesystem's vfsp as the parent * filesystem and all of its snapshots have their vnode's * v_vfsp set to the parent's filesystem's vfsp. Note, * 'z_parent' is self referential for non-snapshots. */ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); #ifdef FREEBSD_NAMECACHE cache_purgevfs(zfsvfs->z_parent->z_vfs); #endif } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { rw_exit(&zfsvfs->z_teardown_inactive_lock); rrw_exit(&zfsvfs->z_teardown_lock, FTAG); return (EIO); } /* * At this point there are no vops active, and any new vops will * fail with EIO since we have z_teardown_lock for writer (only * relavent for forced unmount). * * Release all holds on dbufs. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) if (zp->z_dbuf) { ASSERT(ZTOV(zp)->v_count >= 0); zfs_znode_dmu_fini(zp); } mutex_exit(&zfsvfs->z_znodes_lock); /* * If we are unmounting, set the unmounted flag and let new vops * unblock. zfs_inactive will have the unmounted behavior, and all * other vops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; rrw_exit(&zfsvfs->z_teardown_lock, FTAG); rw_exit(&zfsvfs->z_teardown_inactive_lock); #ifdef __FreeBSD__ /* * Some znodes might not be fully reclaimed, wait for them. */ mutex_enter(&zfsvfs->z_znodes_lock); while (list_head(&zfsvfs->z_all_znodes) != NULL) { msleep(zfsvfs, &zfsvfs->z_znodes_lock, 0, "zteardown", 0); } mutex_exit(&zfsvfs->z_znodes_lock); #endif } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data */ if (dmu_objset_evict_dbufs(zfsvfs->z_os)) { txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); (void) dmu_objset_evict_dbufs(zfsvfs->z_os); } return (0); }
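/*
 * Sketch of the caller-side contract stated in the comment above: when
 * 'unmounting' is B_FALSE (a suspend, e.g. for online recv), a successful
 * return leaves z_teardown_lock and z_teardown_inactive_lock held for
 * writer, and the matching resume step must release them. The helper below
 * is hypothetical; only the lock calls mirror the real interfaces.
 */
static int
fake_suspend_then_resume(zfsvfs_t *zfsvfs)
{
    int err;

    /* suspend: on success both teardown locks are still held for writer */
    if ((err = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
        return (err);   /* on failure the locks were already dropped */

    /* ... the caller would swap in the reopened objset here ... */

    /* resume: release the locks the suspend path left held */
    rw_exit(&zfsvfs->z_teardown_inactive_lock);
    rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
    return (0);
}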
int rxi_GetIFInfo() { int i = 0; int different = 0;
#ifndef AFS_SUN510_ENV
ill_t *ill; ipif_t *ipif;
#endif
int rxmtu, maxmtu; int mtus[ADDRSPERSITE]; afs_uint32 addrs[ADDRSPERSITE]; afs_uint32 ifinaddr; memset(mtus, 0, sizeof(mtus)); memset(addrs, 0, sizeof(addrs));
#ifdef AFS_SUN510_ENV
(void) rw_enter(&afsifinfo_lock, RW_READER); for (i = 0; (i < ADDRSPERSITE) && (afsifinfo[i].ipaddr != 0); i++) { /* Ignore addresses which are down. */ if (!(afsifinfo[i].flags & IFF_UP)) continue; /* Compute the Rx interface MTU */ rxmtu = (afsifinfo[i].mtu - RX_IPUDP_SIZE); ifinaddr = afsifinfo[i].ipaddr; if (myNetAddrs[i] != ifinaddr) different++; /* Copy interface MTU and address; adjust maxmtu */ mtus[i] = rxmtu; rxmtu = rxi_AdjustIfMTU(rxmtu); maxmtu = rxmtu * rxi_nRecvFrags + ((rxi_nRecvFrags - 1) * UDP_HDR_SIZE); maxmtu = rxi_AdjustMaxMTU(rxmtu, maxmtu); addrs[i] = ifinaddr; if (!rx_IsLoopbackAddr(ifinaddr) && maxmtu > rx_maxReceiveSize) { rx_maxReceiveSize = MIN(RX_MAX_PACKET_SIZE, maxmtu); rx_maxReceiveSize = MIN(rx_maxReceiveSize, rx_maxReceiveSizeUser); } } (void) rw_exit(&afsifinfo_lock);
#endif /* AFS_SUN510_ENV */
rx_maxJumboRecvSize = RX_HEADER_SIZE + rxi_nDgramPackets * RX_JUMBOBUFFERSIZE + (rxi_nDgramPackets - 1) * RX_JUMBOHEADERSIZE; rx_maxJumboRecvSize = MAX(rx_maxJumboRecvSize, rx_maxReceiveSize); if (different) { int j; for (j = 0; j < i; j++) { myNetMTUs[j] = mtus[j]; myNetAddrs[j] = addrs[j]; } } return different; }
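/*
 * Worked restatement (hypothetical helper, same symbolic constants as the
 * code above) of the MTU arithmetic: the per-interface Rx MTU is the
 * interface MTU minus the IP+UDP overhead, and the largest multi-fragment
 * datagram is nfrags fragments plus the per-fragment header overhead
 * between them. The real path additionally clamps the value through
 * rxi_AdjustIfMTU() and rxi_AdjustMaxMTU().
 */
static int
fake_rx_maxmtu(int if_mtu, int nfrags)
{
    int rxmtu = if_mtu - RX_IPUDP_SIZE;     /* payload per datagram */

    /* nfrags fragments, with header overhead on all but the first */
    return (rxmtu * nfrags + (nfrags - 1) * UDP_HDR_SIZE);
}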
/* * bufcall() and timeout() callback entry for read/write stream * * Requires Lock (( M: Mandatory, P: Prohibited, A: Allowed )) * -. uinst_t->lock : P * -. uinst_t->u_lock : P * -. uinst_t->l_lock : P * -. uinst_t->c_lock : P */ void oplmsu_cmn_bufcb(void *arg) { struct buf_tbl *buftbl = arg; lpath_t *lpath; ctrl_t *ctrl; queue_t *q; int lq_flag = 0; rw_enter(&oplmsu_uinst->lock, RW_WRITER); mutex_enter(&oplmsu_uinst->l_lock); lpath = oplmsu_uinst->first_lpath; while (lpath) { if ((buftbl == lpath->rbuftbl) && (buftbl->rw_flag == MSU_READ_SIDE)) { if ((lpath->rbuf_id == 0) && (lpath->rtout_id == 0)) { mutex_exit(&oplmsu_uinst->l_lock); rw_exit(&oplmsu_uinst->lock); } else { q = lpath->rbuftbl->q; lpath->rbuftbl->q = NULL; lpath->rbuftbl->rw_flag = UNDEFINED; if (lpath->rbuf_id) { lpath->rbuf_id = 0; } else { lpath->rtout_id = 0; } mutex_exit(&oplmsu_uinst->l_lock); if (oplmsu_queue_flag == 1) { lq_flag = 1; oplmsu_queue_flag = 0; } rw_exit(&oplmsu_uinst->lock); oplmsu_rcmn_high_qenable(q); if (lq_flag == 1) { rw_enter(&oplmsu_uinst->lock, RW_WRITER); oplmsu_queue_flag = 1; rw_exit(&oplmsu_uinst->lock); } } return; } lpath = lpath->l_next; } mutex_exit(&oplmsu_uinst->l_lock); mutex_enter(&oplmsu_uinst->c_lock); if ((ctrl = oplmsu_uinst->user_ctrl) != NULL) { if ((buftbl == ctrl->wbuftbl) && (buftbl->rw_flag == MSU_WRITE_SIDE)) { oplmsu_wbufcb_posthndl(ctrl); mutex_exit(&oplmsu_uinst->c_lock); rw_exit(&oplmsu_uinst->lock); return; } } if ((ctrl = oplmsu_uinst->meta_ctrl) != NULL) { if ((buftbl == ctrl->wbuftbl) && (buftbl->rw_flag == MSU_WRITE_SIDE)) { oplmsu_wbufcb_posthndl(ctrl); mutex_exit(&oplmsu_uinst->c_lock); rw_exit(&oplmsu_uinst->lock); return; } } mutex_exit(&oplmsu_uinst->c_lock); rw_exit(&oplmsu_uinst->lock); }
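/*
 * Sketch of the flag save/restore idiom above, using the real lock and flag
 * names; the helper itself is illustrative. oplmsu_queue_flag is cleared
 * while uinst_t->lock is dropped around oplmsu_rcmn_high_qenable(), then
 * restored under the lock. The caller is assumed to hold the lock as writer
 * on entry; the lock is not held on return.
 */
static void
fake_qenable_without_lock(queue_t *q)
{
    int lq_flag = 0;

    if (oplmsu_queue_flag == 1) {
        lq_flag = 1;
        oplmsu_queue_flag = 0;
    }
    rw_exit(&oplmsu_uinst->lock);

    oplmsu_rcmn_high_qenable(q);    /* must be called without the lock */

    if (lq_flag == 1) {
        rw_enter(&oplmsu_uinst->lock, RW_WRITER);
        oplmsu_queue_flag = 1;
        rw_exit(&oplmsu_uinst->lock);
    }
}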
/* * srpt_ch_srp_cmd() */ static int srpt_ch_srp_cmd(srpt_channel_t *ch, srpt_iu_t *iu) { srp_cmd_req_t *cmd = (srp_cmd_req_t *)iu->iu_buf; srp_indirect_desc_t *i_desc; uint_t i_di_cnt; uint_t i_do_cnt; uint8_t do_fmt; uint8_t di_fmt; uint32_t *cur_desc_off; int i; ibt_status_t status; uint8_t addlen; DTRACE_SRP_2(task__command, srpt_channel_t, ch, srp_cmd_req_t, cmd); iu->iu_ch = ch; iu->iu_tag = cmd->cr_tag; /* * The SRP specification and SAM require support for bi-directional * data transfer, so we create a single buffer descriptor list that * in the IU buffer that covers the data-in and data-out buffers. * In practice we will just see unidirectional transfers with either * data-in or data out descriptors. If we were to take that as fact, * we could reduce overhead slightly. */ /* * additional length is a 6-bit number in 4-byte words, so multiply by 4 * to get bytes. */ addlen = cmd->cr_add_cdb_len & 0x3f; /* mask off 6 bits */ cur_desc_off = (uint32_t *)(void *)&cmd->cr_add_data; cur_desc_off += addlen; /* 32-bit arithmetic */ iu->iu_num_rdescs = 0; iu->iu_rdescs = (srp_direct_desc_t *)(void *)cur_desc_off; /* * Examine buffer description for Data In (i.e. data flows * to the initiator). */ i_do_cnt = i_di_cnt = 0; di_fmt = cmd->cr_buf_fmt >> 4; if (di_fmt == SRP_DATA_DESC_DIRECT) { iu->iu_num_rdescs = 1; cur_desc_off = (uint32_t *)(void *)&iu->iu_rdescs[1]; } else if (di_fmt == SRP_DATA_DESC_INDIRECT) { i_desc = (srp_indirect_desc_t *)iu->iu_rdescs; i_di_cnt = b2h32(i_desc->id_table.dd_len) / sizeof (srp_direct_desc_t); /* * Some initiators like OFED occasionally use the wrong counts, * so check total to allow for this. NOTE: we do not support * reading of the descriptor table from the initiator, so if * not all descriptors are in the IU we drop the task. */ if (i_di_cnt > (cmd->cr_dicnt + cmd->cr_docnt)) { SRPT_DPRINTF_L2("ch_srp_cmd, remote RDMA of" " descriptors not supported"); SRPT_DPRINTF_L2("ch_srp_cmd, sizeof entry (%d)," " i_di_cnt(%d), cr_dicnt(%d)", (uint_t)sizeof (srp_direct_desc_t), i_di_cnt, cmd->cr_dicnt); iu->iu_rdescs = NULL; return (1); } bcopy(&i_desc->id_desc[0], iu->iu_rdescs, sizeof (srp_direct_desc_t) * i_di_cnt); iu->iu_num_rdescs += i_di_cnt; cur_desc_off = (uint32_t *)(void *)&i_desc->id_desc[i_di_cnt]; } /* * Examine buffer description for Data Out (i.e. data flows * from the initiator). */ do_fmt = cmd->cr_buf_fmt & 0x0F; if (do_fmt == SRP_DATA_DESC_DIRECT) { if (di_fmt == SRP_DATA_DESC_DIRECT) { bcopy(cur_desc_off, &iu->iu_rdescs[iu->iu_num_rdescs], sizeof (srp_direct_desc_t)); } iu->iu_num_rdescs++; } else if (do_fmt == SRP_DATA_DESC_INDIRECT) { i_desc = (srp_indirect_desc_t *)cur_desc_off; i_do_cnt = b2h32(i_desc->id_table.dd_len) / sizeof (srp_direct_desc_t); /* * Some initiators like OFED occasionally use the wrong counts, * so check total to allow for this. NOTE: we do not support * reading of the descriptor table from the initiator, so if * not all descriptors are in the IU we drop the task. 
*/ if ((i_di_cnt + i_do_cnt) > (cmd->cr_dicnt + cmd->cr_docnt)) { SRPT_DPRINTF_L2("ch_srp_cmd, remote RDMA of" " descriptors not supported"); SRPT_DPRINTF_L2("ch_srp_cmd, sizeof entry (%d)," " i_do_cnt(%d), cr_docnt(%d)", (uint_t)sizeof (srp_direct_desc_t), i_do_cnt, cmd->cr_docnt); iu->iu_rdescs = NULL; return (1); } bcopy(&i_desc->id_desc[0], &iu->iu_rdescs[iu->iu_num_rdescs], sizeof (srp_direct_desc_t) * i_do_cnt); iu->iu_num_rdescs += i_do_cnt; } iu->iu_tot_xfer_len = 0; for (i = 0; i < iu->iu_num_rdescs; i++) { iu->iu_rdescs[i].dd_vaddr = b2h64(iu->iu_rdescs[i].dd_vaddr); iu->iu_rdescs[i].dd_hdl = b2h32(iu->iu_rdescs[i].dd_hdl); iu->iu_rdescs[i].dd_len = b2h32(iu->iu_rdescs[i].dd_len); iu->iu_tot_xfer_len += iu->iu_rdescs[i].dd_len; }
#ifdef DEBUG
if (srpt_errlevel >= SRPT_LOG_L4) { SRPT_DPRINTF_L4("ch_srp_cmd, iu->iu_tot_xfer_len (%d)", iu->iu_tot_xfer_len); for (i = 0; i < iu->iu_num_rdescs; i++) { SRPT_DPRINTF_L4("ch_srp_cmd, rdescs[%d].dd_vaddr" " (0x%08llx)", i, (u_longlong_t)iu->iu_rdescs[i].dd_vaddr); SRPT_DPRINTF_L4("ch_srp_cmd, rdescs[%d].dd_hdl" " (0x%08x)", i, iu->iu_rdescs[i].dd_hdl); SRPT_DPRINTF_L4("ch_srp_cmd, rdescs[%d].dd_len (%d)", i, iu->iu_rdescs[i].dd_len); } SRPT_DPRINTF_L4("ch_srp_cmd, LUN (0x%08lx)", (unsigned long int) *((uint64_t *)(void *) cmd->cr_lun)); }
#endif
rw_enter(&ch->ch_rwlock, RW_READER); if (ch->ch_state == SRPT_CHANNEL_DISCONNECTING) { /* * The channel has begun disconnecting, so ignore the * command, returning the IU resources. */ rw_exit(&ch->ch_rwlock); return (1); } /* * Once a SCSI task is allocated and assigned to the IU, it * owns those IU resources, which will be held until STMF * is notified the task is done (from a lport perspective). */ iu->iu_stmf_task = stmf_task_alloc(ch->ch_tgt->tp_lport, ch->ch_session->ss_ss, cmd->cr_lun, SRP_CDB_SIZE + (addlen * 4), 0); if (iu->iu_stmf_task == NULL) { /* * Could not allocate, return status to the initiator * indicating that we are temporarily unable to process * commands. If unable to send, immediately return IU * resource.
*/ SRPT_DPRINTF_L2("ch_srp_cmd, SCSI task allocation failure"); rw_exit(&ch->ch_rwlock); mutex_enter(&iu->iu_lock); status = srpt_stp_send_response(iu, STATUS_BUSY, 0, 0, 0, NULL, SRPT_NO_FENCE_SEND); mutex_exit(&iu->iu_lock); if (status != IBT_SUCCESS) { SRPT_DPRINTF_L2("ch_srp_cmd, error(%d) posting error" " response", status); return (1); } else { return (0); } } iu->iu_stmf_task->task_port_private = iu; iu->iu_stmf_task->task_flags = 0; if (di_fmt != 0) { iu->iu_stmf_task->task_flags |= TF_WRITE_DATA; } if (do_fmt != 0) { iu->iu_stmf_task->task_flags |= TF_READ_DATA; } switch (cmd->cr_task_attr) { case SRP_TSK_ATTR_QTYPE_SIMPLE: iu->iu_stmf_task->task_flags |= TF_ATTR_SIMPLE_QUEUE; break; case SRP_TSK_ATTR_QTYPE_HEAD_OF_Q: iu->iu_stmf_task->task_flags |= TF_ATTR_HEAD_OF_QUEUE; break; case SRP_TSK_ATTR_QTYPE_ORDERED: iu->iu_stmf_task->task_flags |= TF_ATTR_ORDERED_QUEUE; break; case SRP_TSK_ATTR_QTYPE_ACA_Q_TAG: iu->iu_stmf_task->task_flags |= TF_ATTR_ACA; break; default: SRPT_DPRINTF_L2("ch_srp_cmd, reserved task attr (%d)", cmd->cr_task_attr); iu->iu_stmf_task->task_flags |= TF_ATTR_ORDERED_QUEUE; break; } iu->iu_stmf_task->task_additional_flags = 0; iu->iu_stmf_task->task_priority = 0; iu->iu_stmf_task->task_mgmt_function = TM_NONE; iu->iu_stmf_task->task_max_nbufs = STMF_BUFS_MAX; iu->iu_stmf_task->task_expected_xfer_length = iu->iu_tot_xfer_len; iu->iu_stmf_task->task_csn_size = 0; bcopy(cmd->cr_cdb, iu->iu_stmf_task->task_cdb, SRP_CDB_SIZE); if (addlen != 0) { bcopy(&cmd->cr_add_data, iu->iu_stmf_task->task_cdb + SRP_CDB_SIZE, addlen * 4); } /* * Add the IU/task to the session and post to STMF. The task will * remain in the session's list until STMF is informed by SRP that * it is done with the task. */ DTRACE_SRP_3(scsi__command, srpt_channel_t, iu->iu_ch, scsi_task_t, iu->iu_stmf_task, srp_cmd_req_t, cmd); srpt_stp_add_task(ch->ch_session, iu); SRPT_DPRINTF_L3("ch_srp_cmd, new task (%p) posted", (void *)iu->iu_stmf_task); stmf_post_task(iu->iu_stmf_task, NULL); rw_exit(&ch->ch_rwlock); return (0); }
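/*
 * Small restatement of the additional-CDB length handling above: the field
 * is a 6-bit count of 4-byte words, so the number of extra CDB bytes copied
 * after SRP_CDB_SIZE is (cr_add_cdb_len & 0x3f) * 4. The helper is
 * illustrative only.
 */
static size_t
fake_add_cdb_bytes(uint8_t cr_add_cdb_len)
{
    uint8_t addlen = cr_add_cdb_len & 0x3f; /* low 6 bits: word count */

    return ((size_t)addlen * 4);            /* words -> bytes */
}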