Example No. 1
/*
 * Query domain table for a given domain.
 *
 * If the domain isn't found and addok is set, it is added to the AVL
 * trees and the zsb->z_fuid_dirty flag will be set to TRUE.  It will
 * then be necessary for the caller or another thread to detect the
 * dirty table and sync out the changes.
 */
int
zfs_fuid_find_by_domain(zfs_sb_t *zsb, const char *domain,
    char **retdomain, boolean_t addok)
{
	fuid_domain_t searchnode, *findnode;
	avl_index_t loc;
	krw_t rw = RW_READER;

	/*
	 * If this is the dummy "nobody" domain, return an index of 0
	 * to cause the created FUID to be a standard POSIX id
	 * for the user nobody.
	 */
	if (domain[0] == '\0') {
		if (retdomain)
			*retdomain = nulldomain;
		return (0);
	}

	searchnode.f_ksid = ksid_lookupdomain(domain);
	if (retdomain)
		*retdomain = searchnode.f_ksid->kd_name;
	if (!zsb->z_fuid_loaded)
		zfs_fuid_init(zsb);

retry:
	rw_enter(&zsb->z_fuid_lock, rw);
	findnode = avl_find(&zsb->z_fuid_domain, &searchnode, &loc);

	if (findnode) {
		rw_exit(&zsb->z_fuid_lock);
		ksiddomain_rele(searchnode.f_ksid);
		return (findnode->f_idx);
	} else if (addok) {
		fuid_domain_t *domnode;
		uint64_t retidx;

		if (rw == RW_READER && !rw_tryupgrade(&zsb->z_fuid_lock)) {
			rw_exit(&zsb->z_fuid_lock);
			rw = RW_WRITER;
			goto retry;
		}

		domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
		domnode->f_ksid = searchnode.f_ksid;

		retidx = domnode->f_idx = avl_numnodes(&zsb->z_fuid_idx) + 1;

		avl_add(&zsb->z_fuid_domain, domnode);
		avl_add(&zsb->z_fuid_idx, domnode);
		zsb->z_fuid_dirty = B_TRUE;
		rw_exit(&zsb->z_fuid_lock);
		return (retidx);
	} else {
		rw_exit(&zsb->z_fuid_lock);
		return (-1);
	}
}
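
A note on the pattern: the reader-to-writer retry idiom above recurs throughout these examples. Below is a minimal standalone sketch of just the idiom; table_lock, lookup_table_t, table_find() and table_insert() are hypothetical names, not part of the original source.

static krwlock_t table_lock;		/* hypothetical lock */

int
lookup_or_insert(lookup_table_t *t, uint64_t key)
{
	krw_t rw = RW_READER;
	int idx;

retry:
	rw_enter(&table_lock, rw);
	if ((idx = table_find(t, key)) != -1) {
		rw_exit(&table_lock);
		return (idx);
	}
	if (rw == RW_READER && !rw_tryupgrade(&table_lock)) {
		/*
		 * Upgrade contended.  Drop the lock entirely and retry
		 * as a writer; the goto repeats the lookup because the
		 * table may have changed while the lock was dropped.
		 */
		rw_exit(&table_lock);
		rw = RW_WRITER;
		goto retry;
	}
	/* rw_tryupgrade() is atomic, so no re-lookup is needed here. */
	idx = table_insert(t, key);
	rw_exit(&table_lock);
	return (idx);
}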
Example No. 2
/*
 * Build directory vnodes based on the profile and the global
 * dev instance.
 */
void
prof_filldir(sdev_node_t *ddv)
{
	sdev_node_t *gdir;

	ASSERT(RW_READ_HELD(&ddv->sdev_contents));

	if (!prof_dev_needupdate(ddv)) {
		ASSERT(RW_READ_HELD(&ddv->sdev_contents));
		return;
	}
	/*
	 * Upgrade to writer lock
	 */
	if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
		/*
		 * We need to drop the read lock and re-acquire it as a
		 * write lock.  While we do this the condition may change, so
		 * we need to re-check the condition.
		 */
		rw_exit(&ddv->sdev_contents);
		rw_enter(&ddv->sdev_contents, RW_WRITER);
		if (!prof_dev_needupdate(ddv)) {
			/* Downgrade back to the read lock before returning */
			rw_downgrade(&ddv->sdev_contents);
			return;
		}
	}
	/* At this point we should have a write lock */
	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));

	sdcmn_err10(("devtree_gen (%s): %ld -> %ld\n",
	    ddv->sdev_path, ddv->sdev_devtree_gen, devtree_gen));

	gdir = ddv->sdev_origin;

	if (gdir != NULL)
		sdcmn_err10(("sdev_dir_gen (%s): %ld -> %ld\n",
		    ddv->sdev_path, ddv->sdev_ldir_gen,
		    gdir->sdev_gdir_gen));

	/* update flags and generation number so next filldir is quick */
	if ((ddv->sdev_flags & SDEV_BUILD) == SDEV_BUILD) {
		ddv->sdev_flags &= ~SDEV_BUILD;
	}
	ddv->sdev_devtree_gen = devtree_gen;
	if (gdir != NULL)
		ddv->sdev_ldir_gen = gdir->sdev_gdir_gen;

	prof_make_symlinks(ddv);
	prof_make_maps(ddv);
	prof_make_names(ddv);
	rw_downgrade(&ddv->sdev_contents);
}
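
prof_filldir() must return with the caller's read lock still held, so a failed upgrade is followed by a full writer reacquisition, a re-check of the condition (another thread may have done the update while the lock was dropped), and rw_downgrade() on every exit path. A condensed sketch of that shape; needs_update() and do_update() are hypothetical helpers:

static void
update_if_needed(krwlock_t *l)
{
	ASSERT(RW_READ_HELD(l));

	if (!needs_update())
		return;
	if (rw_tryupgrade(l) == 0) {
		/* The lock was dropped; the condition may have changed. */
		rw_exit(l);
		rw_enter(l, RW_WRITER);
		if (!needs_update()) {
			rw_downgrade(l);
			return;
		}
	}
	ASSERT(RW_WRITE_HELD(l));
	do_update();
	rw_downgrade(l);	/* caller still expects the read lock */
}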
Example No. 3
static int
zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
{
	if (RW_WRITE_HELD(&zap->zap_rwlock))
		return (1);
	if (rw_tryupgrade(&zap->zap_rwlock)) {
		dmu_buf_will_dirty(zap->zap_dbuf, tx);
		return (1);
	}
	return (0);
}
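
A hypothetical caller sketch (not from the original source): because zap_tryupgradedir() fails without blocking, a caller that must end up with the write lock falls back to a full writer acquisition and redoes the dirtying itself.

static void
zap_upgrade_blocking(zap_t *zap, dmu_tx_t *tx)
{
	if (zap_tryupgradedir(zap, tx) == 0) {
		rw_exit(&zap->zap_rwlock);
		rw_enter(&zap->zap_rwlock, RW_WRITER);
		dmu_buf_will_dirty(zap->zap_dbuf, tx);
		/*
		 * The lock was dropped above: any state read under the
		 * reader lock must now be re-validated by the caller.
		 */
	}
}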
Example No. 4
static int
splat_rwlock_test6(struct file *file, void *arg)
{
	rw_priv_t *rwp;
	int rc = -EINVAL;

	rwp = (rw_priv_t *)kmalloc(sizeof(*rwp), GFP_KERNEL);
	if (rwp == NULL)
		return -ENOMEM;

	splat_init_rw_priv(rwp, file);

	rw_enter(&rwp->rw_rwlock, RW_READER);
	if (!RW_READ_HELD(&rwp->rw_rwlock)) {
		splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME,
		             "rwlock should be read lock: %d\n",
			     RW_READ_HELD(&rwp->rw_rwlock));
		goto out;
	}
#if defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
	/* With one reader upgrade should never fail */
	rc = rw_tryupgrade(&rwp->rw_rwlock);
	if (!rc) {
		splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME,
			     "rwlock contended preventing upgrade: %d\n",
			     RW_READ_HELD(&rwp->rw_rwlock));
		goto out;
	}

	if (RW_READ_HELD(&rwp->rw_rwlock) || !RW_WRITE_HELD(&rwp->rw_rwlock)) {
		splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "rwlock should "
			   "have 0 (not %d) reader and 1 (not %d) writer\n",
			   RW_READ_HELD(&rwp->rw_rwlock),
			   RW_WRITE_HELD(&rwp->rw_rwlock));
		goto out;
	}

	rc = 0;
	splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "%s",
		     "rwlock properly upgraded\n");
#else
	rc = 0;
	splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "%s",
		     "rw_tryupgrade() is disabled for this arch\n");
#endif

out:
	rw_exit(&rwp->rw_rwlock);
	rw_destroy(&rwp->rw_rwlock);
	kfree(rwp);

	return rc;
}
Example No. 5
/*
 * Clean pts sdev_nodes that are no longer valid.
 */
static void
devpts_prunedir(struct sdev_node *ddv)
{
	struct vnode *vp;
	struct sdev_node *dv, *next = NULL;
	int (*vtor)(struct sdev_node *) = NULL;

	ASSERT(ddv->sdev_flags & SDEV_VTOR);

	vtor = (int (*)(struct sdev_node *))sdev_get_vtor(ddv);
	ASSERT(vtor);

	if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
		rw_exit(&ddv->sdev_contents);
		rw_enter(&ddv->sdev_contents, RW_WRITER);
	}

	for (dv = ddv->sdev_dot; dv; dv = next) {
		next = dv->sdev_next;

		/* skip stale nodes */
		if (dv->sdev_flags & SDEV_STALE)
			continue;

		/* validate and prune only ready nodes */
		if (dv->sdev_state != SDEV_READY)
			continue;

		switch (vtor(dv)) {
		case SDEV_VTOR_VALID:
		case SDEV_VTOR_SKIP:
			continue;
		case SDEV_VTOR_INVALID:
			sdcmn_err7(("prunedir: destroy invalid "
			    "node: %s(%p)\n", dv->sdev_name, (void *)dv));
			break;
		}
		vp = SDEVTOV(dv);
		if (vp->v_count > 0)
			continue;
		SDEV_HOLD(dv);
		/* remove the cache node */
		(void) sdev_cache_update(ddv, &dv, dv->sdev_name,
		    SDEV_CACHE_DELETE);
	}
	rw_downgrade(&ddv->sdev_contents);
}
Example No. 6
/*
 * This is the predictive prefetch entry point.  It associates the dnode
 * access specified by the blkid and nblks arguments with a prefetch
 * stream, predicts further accesses based on that stream's statistics,
 * and initiates speculative prefetch.  The fetch_data argument specifies
 * whether actual data blocks should be fetched:
 *   FALSE -- prefetch only indirect blocks for predicted data blocks;
 *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
 */
void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
{
	zstream_t *zs;
	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
	int64_t pf_ahead_blks, max_blks;
	int epbs, max_dist_blks, pf_nblks, ipf_nblks;
	uint64_t end_of_access_blkid = blkid + nblks;

	if (zfs_prefetch_disable)
		return;

	/*
	 * As a fast path for small (single-block) files, ignore access
	 * to the first block.
	 */
	if (blkid == 0)
		return;

	rw_enter(&zf->zf_rwlock, RW_READER);

	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid == zs->zs_blkid) {
			mutex_enter(&zs->zs_lock);
			/*
			 * zs_blkid could have changed before we
			 * acquired zs_lock; re-check them here.
			 */
			if (blkid != zs->zs_blkid) {
				mutex_exit(&zs->zs_lock);
				continue;
			}
			break;
		}
	}

	if (zs == NULL) {
		/*
		 * This access is not part of any existing stream.  Create
		 * a new stream for it.
		 */
		ZFETCHSTAT_BUMP(zfetchstat_misses);
		if (rw_tryupgrade(&zf->zf_rwlock))
			dmu_zfetch_stream_create(zf, end_of_access_blkid);
		rw_exit(&zf->zf_rwlock);
		return;
	}

	/*
	 * This access was to a block that we issued a prefetch for on
	 * behalf of this stream. Issue further prefetches for this stream.
	 *
	 * Normally, we start prefetching where we stopped
	 * prefetching last (zs_pf_blkid).  But when we get our first
	 * hit on this stream, zs_pf_blkid == zs_blkid, we don't
	 * want to prefetch the block we just accessed.  In this case,
	 * start just after the block we just accessed.
	 */
	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);

	/*
	 * Double our amount of prefetched data, but don't let the
	 * prefetch get further ahead than zfetch_max_distance.
	 */
	if (fetch_data) {
		max_dist_blks =
		    zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
		/*
		 * Previously, we were (zs_pf_blkid - blkid) ahead.  We
		 * want to now be double that, so read that amount again,
		 * plus the amount we are catching up by (i.e. the amount
		 * read just now).
		 */
		pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
		max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
		pf_nblks = MIN(pf_ahead_blks, max_blks);
	} else {
Example No. 7
int
rumpuser_rw_tryupgrade(struct rumpuser_rw *rw)
{

	return rw_tryupgrade(rw);
}
Example No. 8
/*
 * Find the policy that matches this device.
 */
static devplcy_t *
match_policy(devplcyent_t *de, dev_t dev, vtype_t spec)
{
	char *mname = NULL;
	minor_t min = getminor(dev);

	for (; de != NULL; de = de->dpe_next) {
		if (de->dpe_flags & DPE_ALLMINOR)
			break;

		if (de->dpe_flags & DPE_EXPANDED) {
			if (min >= de->dpe_lomin && min <= de->dpe_himin &&
			    spec == de->dpe_spec) {
				break;
			} else {
				continue;
			}
		}

		/*
		 * We now need the minor name to match string or
	 * simple regexp.  Could we use csp->s_dip and not
		 * allocate a string here?
		 */
		if (mname == NULL &&
		    ddi_lyr_get_minor_name(dev, spec, &mname) != DDI_SUCCESS)
			/* mname can be set after the function fails */
			return (dfltpolicy);

		/* Simple wildcard, with only one ``*'' */
		if (de->dpe_flags & DPE_WILDC) {
			int plen = de->dpe_len - 1;
			int slen = strlen(mname);
			char *pp = de->dpe_expr;
			char *sp = mname;

			/* string must be at least as long as pattern w/o '*' */
			if (slen < plen - 1)
				continue;

			/* skip prefix */
			while (*pp == *sp && *pp != '\0') {
				pp++;
				sp++;
			}
			/* matched single '*' */
			if (*pp == '\0')
				if (*sp == '\0')
					break;
				else
					continue;
			if (*pp != '*')
				continue;

			pp++;
			/*
			 * skip characters matched by '*': difference of
			 * length of s and length of pattern sans '*'
			 */
			sp += slen - (plen - 1);
			if (strcmp(pp, sp) == 0) 	/* match! */
				break;

		} else if (strcmp(de->dpe_expr, mname) == 0) {
			/* Store minor number, if no contention */
			if (rw_tryupgrade(&policyrw)) {
				de->dpe_lomin = de->dpe_himin = min;
				de->dpe_spec = spec;
				de->dpe_flags |= DPE_EXPANDED;
			}
			break;
		}

	}

	if (mname != NULL)
		kmem_free(mname, strlen(mname) + 1);

	return (de != NULL ? de->dpe_plcy : dfltpolicy);
}
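
The pointer arithmetic in the single-'*' branch is easiest to follow on concrete values; a hypothetical trace (pattern and minor name invented for illustration):

/*
 * Hypothetical trace of the single-'*' match:
 *   de->dpe_expr = "a*,raw"   dpe_len = 7 (incl. NUL), so plen = 6
 *   mname        = "a12,raw"  slen = 7
 * The prefix loop matches 'a' and stops at '*' vs '1' (sp at "12,raw").
 * pp++ skips the '*'; sp += slen - (plen - 1) = 7 - 5 = 2 skips "12",
 * the span covered by '*', and strcmp(",raw", ",raw") == 0: a match.
 */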
Example No. 9
static int
nvpflush_one(nvfd_t *nvfd)
{
	int rval = DDI_SUCCESS;
	nvlist_t *nvl;

	rw_enter(&nvfd->nvf_lock, RW_READER);

	if (!NVF_IS_DIRTY(nvfd) || NVF_IS_READONLY(nvfd)) {
		rw_exit(&nvfd->nvf_lock);
		return (DDI_SUCCESS);
	}

	if (rw_tryupgrade(&nvfd->nvf_lock) == 0) {
		KFIOERR((CE_CONT, "nvpflush: "
		    "%s rw upgrade failed\n", nvfd->nvf_name));
		rw_exit(&nvfd->nvf_lock);
		return (DDI_FAILURE);
	}
	if (((nvfd->nvf_nvp2nvl)(nvfd, &nvl)) != DDI_SUCCESS) {
		KFIOERR((CE_CONT, "nvpflush: "
		    "%s nvlist construction failed\n", nvfd->nvf_name));
		rw_exit(&nvfd->nvf_lock);
		return (DDI_FAILURE);
	}

	NVF_CLEAR_DIRTY(nvfd);
	nvfd->nvf_flags |= NVF_FLUSHING;
	rw_exit(&nvfd->nvf_lock);

	rval = e_fwrite_nvlist(nvfd, nvl);
	nvlist_free(nvl);

	rw_enter(&nvfd->nvf_lock, RW_WRITER);
	nvfd->nvf_flags &= ~NVF_FLUSHING;
	if (rval == DDI_FAILURE) {
		if (NVF_IS_READONLY(nvfd)) {
			rval = DDI_SUCCESS;
			nvfd->nvf_flags &= ~(NVF_ERROR | NVF_DIRTY);
		} else if ((nvfd->nvf_flags & NVF_ERROR) == 0) {
			cmn_err(CE_CONT,
			    "%s: updated failed\n", nvfd->nvf_name);
			nvfd->nvf_flags |= NVF_ERROR | NVF_DIRTY;
		}
	} else {
		if (nvfd->nvf_flags & NVF_CREATE_MSG) {
			cmn_err(CE_CONT, "!Creating %s\n", nvfd->nvf_name);
			nvfd->nvf_flags &= ~NVF_CREATE_MSG;
		}
		if (nvfd->nvf_flags & NVF_REBUILD_MSG) {
			cmn_err(CE_CONT, "!Rebuilding %s\n", nvfd->nvf_name);
			nvfd->nvf_flags &= ~NVF_REBUILD_MSG;
		}
		if (nvfd->nvf_flags & NVF_ERROR) {
			cmn_err(CE_CONT,
			    "%s: update now ok\n", nvfd->nvf_name);
			nvfd->nvf_flags &= ~NVF_ERROR;
		}
		/*
		 * The file may need to be flushed again if the cached
		 * data was touched while writing the earlier contents.
		 */
		if (NVF_IS_DIRTY(nvfd))
			rval = DDI_FAILURE;
	}

	rw_exit(&nvfd->nvf_lock);
	return (rval);
}
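
Note the contrasting design: rather than retrying, nvpflush_one() reports DDI_FAILURE on upgrade contention (and when the table is re-dirtied during the write) and leaves the retry to its caller. A hypothetical caller sketch of that division of labor:

/* Hypothetical flush pass; nvpflush_one() is the routine above. */
static int
nvpflush_pass(nvfd_t **tables, int ntables)
{
	int still_dirty = 0;
	int i;

	for (i = 0; i < ntables; i++) {
		/* DDI_FAILURE covers both contention and re-dirtying. */
		if (nvpflush_one(tables[i]) != DDI_SUCCESS)
			still_dirty++;
	}
	return (still_dirty);	/* nonzero: schedule another pass */
}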
Example No. 10
int
sam_refresh_shared_reader_ino(
	sam_node_t *ip,			/* Pointer to the inode */
	boolean_t writelock,		/* Inode WRITER lock held, */
					/*   otherwise READER lock held. */
	cred_t *credp)			/* credentials. */
{
	sam_id_t id;
	struct sam_perm_inode *permip;
	buf_t *bp;
	int refresh = 0;
	int error;

	if ((ip->updtime + ip->mp->mt.fi_invalid) > SAM_SECOND()) {
		return (0);
	}

	if (!writelock) {
		/*
		 * Acquire inode lock before buffer lock. Recheck the update
		 * time.
		 */
		if (!rw_tryupgrade(&ip->inode_rwl)) {
			RW_UNLOCK_OS(&ip->inode_rwl, RW_READER);
			RW_LOCK_OS(&ip->inode_rwl, RW_WRITER);
			if ((ip->updtime + ip->mp->mt.fi_invalid) >
			    SAM_SECOND()) {
				error = 0;
				goto out;
			}
		}
	}
	id = ip->di.id;
	if ((error = sam_read_ino(ip->mp, id.ino, &bp, &permip))) {
		goto out;
	}
	if ((permip->di.mode != 0) && (permip->di.id.ino == ip->di.id.ino) &&
	    (permip->di.id.gen == ip->di.id.gen)) {
		if ((permip->di.modify_time.tv_sec !=
		    ip->di.modify_time.tv_sec) ||
		    (permip->di.modify_time.tv_nsec !=
		    ip->di.modify_time.tv_nsec)||
		    (permip->di.change_time.tv_sec !=
		    ip->di.change_time.tv_sec) ||
		    (permip->di.change_time.tv_nsec !=
		    ip->di.change_time.tv_nsec)||
		    (permip->di.residence_time != ip->di.residence_time) ||
		    (permip->di.rm.size != ip->di.rm.size) ||
		    (permip->di.mode != ip->di.mode)) {
			refresh = 1;
		} else {
			ip->di.uid = permip->di.uid;
			ip->di.gid = permip->di.gid;
		}
	} else {
		refresh = 1;
		error = ENOENT;		/* This inode has been removed */
	}
	if (refresh) {
		vnode_t *vp = SAM_ITOV(ip);

		/*
		 * If a refresh is needed on a directory inode,
		 * invalidate associated dnlc entries.
		 */
		if (S_ISDIR(ip->di.mode)) {
			sam_invalidate_dnlc(vp);
		}

		/*
		 * Move shared_writer's inode copy into inode. Set size
		 * and invalidate pages. Set shared_reader update time.
		 */
		ip->di = permip->di; /* Move disk ino to incore ino */
		ip->di2 = permip->di2;
		brelse(bp);
		vp->v_type = IFTOVT(S_ISREQ(ip->di.mode) ?
		    S_IFREG : ip->di.mode);
		sam_set_size(ip);
		(void) VOP_PUTPAGE_OS(vp, 0, 0, B_INVAL, credp, NULL);
		if (ip->di.status.b.acl) {
			(void) sam_acl_inactive(ip);
			error = sam_get_acl(ip, &ip->aclp);
		}
		ip->updtime = SAM_SECOND();
	} else {
		ip->updtime = SAM_SECOND();
		brelse(bp);
	}

out:
	if (!writelock) {
		rw_downgrade(&ip->inode_rwl);
	}
	return (error);
}
Example No. 11
static int
auto_lookup(
	vnode_t *dvp,
	char *nm,
	vnode_t **vpp,
	pathname_t *pnp,
	int flags,
	vnode_t *rdir,
	cred_t *cred,
	caller_context_t *ct,
	int *direntflags,
	pathname_t *realpnp)
{
	int error = 0;
	vnode_t *newvp = NULL;
	vfs_t *vfsp;
	fninfo_t *dfnip;
	fnnode_t *dfnp = NULL;
	fnnode_t *fnp = NULL;
	char *searchnm;
	int operation;		/* either AUTOFS_LOOKUP or AUTOFS_MOUNT */

	dfnip = vfstofni(dvp->v_vfsp);
	AUTOFS_DPRINT((3, "auto_lookup: dvp=%p (%s) name=%s\n",
	    (void *)dvp, dfnip->fi_map, nm));

	if (nm[0] == 0) {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	if (error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct))
		return (error);

	if (nm[0] == '.' && nm[1] == 0) {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	if (nm[0] == '.' && nm[1] == '.' && nm[2] == 0) {
		fnnode_t *pdfnp;

		pdfnp = (vntofn(dvp))->fn_parent;
		ASSERT(pdfnp != NULL);

		/*
		 * Since it is legitimate to have the VROOT flag set for the
		 * subdirectories of the indirect map in autofs filesystem,
		 * rootfnnodep is checked against fnnode of dvp instead of
		 * just checking whether VROOT flag is set in dvp
		 */

		if (pdfnp == pdfnp->fn_globals->fng_rootfnnodep) {
			vnode_t *vp;

			vfs_rlock_wait(dvp->v_vfsp);
			if (dvp->v_vfsp->vfs_flag & VFS_UNMOUNTED) {
				vfs_unlock(dvp->v_vfsp);
				return (EIO);
			}
			vp = dvp->v_vfsp->vfs_vnodecovered;
			VN_HOLD(vp);
			vfs_unlock(dvp->v_vfsp);
			error = VOP_LOOKUP(vp, nm, vpp, pnp, flags, rdir, cred,
			    ct, direntflags, realpnp);
			VN_RELE(vp);
			return (error);
		} else {
			*vpp = fntovn(pdfnp);
			VN_HOLD(*vpp);
			return (0);
		}
	}

top:
	dfnp = vntofn(dvp);
	searchnm = nm;
	operation = 0;

	ASSERT(vn_matchops(dvp, auto_vnodeops));

	AUTOFS_DPRINT((3, "auto_lookup: dvp=%p dfnp=%p\n", (void *)dvp,
	    (void *)dfnp));

	/*
	 * If a lookup or mount of this node is in progress, wait for it
	 * to finish, and return whatever result it got.
	 */
	mutex_enter(&dfnp->fn_lock);
	if (dfnp->fn_flags & (MF_LOOKUP | MF_INPROG)) {
		mutex_exit(&dfnp->fn_lock);
		error = auto_wait4mount(dfnp);
		if (error == AUTOFS_SHUTDOWN)
			error = ENOENT;
		if (error == EAGAIN)
			goto top;
		if (error)
			return (error);
	} else
		mutex_exit(&dfnp->fn_lock);

	error = vn_vfsrlock_wait(dvp);
	if (error)
		return (error);
	vfsp = vn_mountedvfs(dvp);
	if (vfsp != NULL) {
		error = VFS_ROOT(vfsp, &newvp);
		vn_vfsunlock(dvp);
		if (!error) {
			error = VOP_LOOKUP(newvp, nm, vpp, pnp,
			    flags, rdir, cred, ct, direntflags, realpnp);
			VN_RELE(newvp);
		}
		return (error);
	}
	vn_vfsunlock(dvp);

	rw_enter(&dfnp->fn_rwlock, RW_READER);
	error = auto_search(dfnp, nm, &fnp, cred);
	if (error) {
		if (dfnip->fi_flags & MF_DIRECT) {
			/*
			 * direct map.
			 */
			if (dfnp->fn_dirents) {
				/*
				 * Mount previously triggered.
				 * 'nm' not found
				 */
				error = ENOENT;
			} else {
				/*
				 * I need to contact the daemon to trigger
				 * the mount. 'dfnp' will be the mountpoint.
				 */
				operation = AUTOFS_MOUNT;
				VN_HOLD(fntovn(dfnp));
				fnp = dfnp;
				error = 0;
			}
		} else if (dvp == dfnip->fi_rootvp) {
			/*
			 * 'dfnp' is the root of the indirect AUTOFS.
			 */
			if (rw_tryupgrade(&dfnp->fn_rwlock) == 0) {
				/*
				 * Could not acquire writer lock, release
				 * reader, and wait until available. We
				 * need to search for 'nm' again, since we
				 * had to release the lock before reacquiring
				 * it.
				 */
				rw_exit(&dfnp->fn_rwlock);
				rw_enter(&dfnp->fn_rwlock, RW_WRITER);
				error = auto_search(dfnp, nm, &fnp, cred);
			}

			ASSERT(RW_WRITE_HELD(&dfnp->fn_rwlock));
			if (error) {
				/*
				 * create node being looked-up and request
				 * mount on it.
				 */
				error = auto_enter(dfnp, nm, &fnp, kcred);
				if (!error)
					operation = AUTOFS_LOOKUP;
			}
		} else if ((dfnp->fn_dirents == NULL) &&
		    ((dvp->v_flag & VROOT) == 0) &&
		    ((fntovn(dfnp->fn_parent))->v_flag & VROOT)) {
			/*
			 * dfnp is the actual 'mountpoint' of indirect map,
			 * it is the equivalent of a direct mount,
			 * ie, /home/'user1'
			 */
			operation = AUTOFS_MOUNT;
			VN_HOLD(fntovn(dfnp));
			fnp = dfnp;
			error = 0;
			searchnm = dfnp->fn_name;
		}
	}

	if (error == EAGAIN) {
		rw_exit(&dfnp->fn_rwlock);
		goto top;
	}
	if (error) {
		rw_exit(&dfnp->fn_rwlock);
		return (error);
	}

	/*
	 * We now have the actual fnnode we're interested in.
	 * The 'MF_LOOKUP' indicates another thread is currently
	 * performing a daemon lookup of this node, therefore we
	 * wait for its completion.
	 * The 'MF_INPROG' indicates another thread is currently
	 * performing a daemon mount of this node, we wait for it
	 * to be done if we are performing a MOUNT. We don't
	 * wait for it if we are performing a LOOKUP.
	 * We can release the reader/writer lock as soon as we acquire
	 * the mutex, since the state of the lock can only change by
	 * first acquiring the mutex.
	 */
	mutex_enter(&fnp->fn_lock);
	rw_exit(&dfnp->fn_rwlock);
	if ((fnp->fn_flags & MF_LOOKUP) ||
	    ((operation == AUTOFS_MOUNT) && (fnp->fn_flags & MF_INPROG))) {
		mutex_exit(&fnp->fn_lock);
		error = auto_wait4mount(fnp);
		VN_RELE(fntovn(fnp));
		if (error == AUTOFS_SHUTDOWN)
			error = ENOENT;
		if (error && error != EAGAIN)
			return (error);
		goto top;
	}

	if (operation == 0) {
		/*
		 * got the fnnode, check for any errors
		 * on the previous operation on that node.
		 */
		error = fnp->fn_error;
		if ((error == EINTR) || (error == EAGAIN)) {
			/*
			 * previous operation on this node was
			 * not completed, do a lookup now.
			 */
			operation = AUTOFS_LOOKUP;
		} else {
			/*
			 * previous operation completed. Return
			 * a pointer to the node only if there was
			 * no error.
			 */
			mutex_exit(&fnp->fn_lock);
			if (!error)
				*vpp = fntovn(fnp);
			else
				VN_RELE(fntovn(fnp));
			return (error);
		}
	}

	/*
	 * Since I got to this point, it means I'm the one
	 * responsible for triggering the mount/look-up of this node.
	 */
	switch (operation) {
	case AUTOFS_LOOKUP:
		AUTOFS_BLOCK_OTHERS(fnp, MF_LOOKUP);
		fnp->fn_error = 0;
		mutex_exit(&fnp->fn_lock);
		error = auto_lookup_aux(fnp, searchnm, cred);
		if (!error) {
			/*
			 * Return this vnode
			 */
			*vpp = fntovn(fnp);
		} else {
			/*
			 * release our reference to this vnode
			 * and return error
			 */
			VN_RELE(fntovn(fnp));
		}
		break;
	case AUTOFS_MOUNT:
		AUTOFS_BLOCK_OTHERS(fnp, MF_INPROG);
		fnp->fn_error = 0;
		mutex_exit(&fnp->fn_lock);
		/*
		 * auto_new_mount_thread fires up a new thread which
		 * calls automountd finishing up the work
		 */
		auto_new_mount_thread(fnp, searchnm, cred);

		/*
		 * At this point, we are simply another thread
		 * waiting for the mount to complete
		 */
		error = auto_wait4mount(fnp);
		if (error == AUTOFS_SHUTDOWN)
			error = ENOENT;

		/*
		 * now release our reference to this vnode
		 */
		VN_RELE(fntovn(fnp));
		if (!error)
			goto top;
		break;
	default:
		auto_log(dfnp->fn_globals->fng_verbose,
		    dfnp->fn_globals->fng_zoneid, CE_WARN,
		    "auto_lookup: unknown operation %d",
		    operation);
	}

	AUTOFS_DPRINT((5, "auto_lookup: name=%s *vpp=%p return=%d\n",
	    nm, (void *)*vpp, error));

	return (error);
}
Example No. 12
/*ARGSUSED*/
int
ufs_rdwr_data(
	vnode_t		*vnodep,
	u_offset_t	offset,
	size_t		len,
	fdbuffer_t	*fdbp,
	int		flags,
	cred_t		*credp)
{
	struct inode	*ip = VTOI(vnodep);
	struct fs	*fs;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct buf	*bp;
	krw_t		rwtype = RW_READER;
	u_offset_t	offset1 = offset;	/* Initial offset */
	size_t		iolen;
	int		curlen = 0;
	int		pplen;
	daddr_t		bn;
	int		contig = 0;
	int		error = 0;
	int		nbytes;			/* Number bytes this IO */
	int		offsetn;		/* Start point this IO */
	int		iswrite = flags & B_WRITE;
	int		io_started = 0;		/* No IO started */
	struct ulockfs	*ulp;
	uint_t		protp = PROT_ALL;

	error = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, segkmap, !iswrite,
	    &protp);
	if (error) {
		if (flags & B_ASYNC) {
			fdb_ioerrdone(fdbp, error);
		}
		return (error);
	}
	fs = ufsvfsp->vfs_fs;
	iolen = len;

	DEBUGF((CE_CONT, "?ufs_rdwr: %s vp: %p pages:%p  off %llx len %lx"
	    " isize: %llx fdb: %p\n",
	    flags & B_READ ? "READ" : "WRITE", (void *)vnodep,
	    (void *)vnodep->v_pages, offset1, iolen, ip->i_size, (void *)fdbp));

	rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&ip->i_contents, rwtype);

	ASSERT(offset1 < ip->i_size);

	if ((offset1 + iolen) > ip->i_size) {
		iolen = ip->i_size - offset1;
	}
	while (!error && curlen < iolen) {

		contig = 0;

		if ((error = bmap_read(ip, offset1, &bn, &contig)) != 0) {
			break;
		}
		ASSERT(!(bn == UFS_HOLE && iswrite));
		if (bn == UFS_HOLE) {
			/*
			 * If the above assertion is true,
			 * then the following if statement can never be true.
			 */
			if (iswrite && (rwtype == RW_READER)) {
				rwtype = RW_WRITER;
				if (!rw_tryupgrade(&ip->i_contents)) {
					rw_exit(&ip->i_contents);
					rw_enter(&ip->i_contents, rwtype);
					continue;
				}
			}
			offsetn = blkoff(fs, offset1);
			pplen = P2ROUNDUP(len, PAGESIZE);
			nbytes = MIN((pplen - curlen),
			    (fs->fs_bsize - offsetn));
			ASSERT(nbytes > 0);

			/*
			 * We may be reading or writing.
			 */
			DEBUGF((CE_CONT, "?ufs_rdwr_data: hole %llx - %lx\n",
			    offset1, (iolen - curlen)));

			if (iswrite) {
				printf("**WARNING: ignoring hole in write\n");
				error = ENOSPC;
			} else {
				fdb_add_hole(fdbp, offset1 - offset, nbytes);
			}
			offset1 += nbytes;
			curlen += nbytes;
			continue;

		}
		ASSERT(contig > 0);
		pplen = P2ROUNDUP(len, PAGESIZE);

		contig = MIN(contig, len - curlen);
		contig = P2ROUNDUP(contig, DEV_BSIZE);

		bp = fdb_iosetup(fdbp, offset1 - offset, contig, vnodep, flags);

		bp->b_edev = ip->i_dev;
		bp->b_dev = cmpdev(ip->i_dev);
		bp->b_blkno = bn;
		bp->b_file = ip->i_vnode;
		bp->b_offset = (offset_t)offset1;

		if (ufsvfsp->vfs_snapshot) {
			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
		} else {
			(void) bdev_strategy(bp);
		}
		io_started = 1;

		offset1 += contig;
		curlen += contig;
		if (iswrite)
			lwp_stat_update(LWP_STAT_OUBLK, 1);
		else
			lwp_stat_update(LWP_STAT_INBLK, 1);

		if ((flags & B_ASYNC) == 0) {
			error = biowait(bp);
			fdb_iodone(bp);
		}

		DEBUGF((CE_CONT, "?loop ufs_rdwr_data.. off %llx len %lx\n",
		    offset1, (iolen - curlen)));
	}

	DEBUGF((CE_CONT, "?ufs_rdwr_data: off %llx len %lx pages: %p ------\n",
	    offset1, (iolen - curlen), (void *)vnodep->v_pages));

	rw_exit(&ip->i_contents);
	rw_exit(&ip->i_ufsvfs->vfs_dqrwlock);

	if (flags & B_ASYNC) {
		/*
		 * Show that no more asynchronous IO will be added
		 */
		fdb_ioerrdone(fdbp, error);
	}
	if (ulp) {
		ufs_lockfs_end(ulp);
	}
	if (io_started && flags & B_ASYNC) {
		return (0);
	} else {
		return (error);
	}
}
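
ufs_rdwr_data() uses the loop-based variant of the upgrade idiom: a failed upgrade reacquires the lock as a writer and issues continue, so the next iteration re-derives the block state (the bmap_read() at the top of the loop) under the new lock. Distilled into a sketch; lock, more_to_do(), need_write() and do_one() are hypothetical names:

static krwlock_t lock;		/* hypothetical */

static void
process_all(void)
{
	krw_t rwtype = RW_READER;

	rw_enter(&lock, rwtype);
	while (more_to_do()) {
		if (need_write() && rwtype == RW_READER) {
			rwtype = RW_WRITER;
			if (!rw_tryupgrade(&lock)) {
				rw_exit(&lock);
				rw_enter(&lock, rwtype);
				continue;	/* re-evaluate as writer */
			}
		}
		do_one();
	}
	rw_exit(&lock);
}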
Example No. 13
int
priv_getbyname(const char *name, uint_t flag)
{
	int i;
	int wheld = 0;
	int len;
	char *p;

	if (flag != 0 && flag != PRIV_ALLOC)
		return (-EINVAL);

	if (strncasecmp(name, "priv_", 5) == 0)
		name += 5;

	rw_enter(&privinfo_lock, RW_READER);
rescan:
	for (i = 0; i < nprivs; i++)
		if (strcasecmp(priv_names[i], name) == 0) {
			rw_exit(&privinfo_lock);
			return (i);
		}

	if (!wheld) {
		if (!(flag & PRIV_ALLOC)) {
			rw_exit(&privinfo_lock);
			return (-EINVAL);
		}

		/* check length, validity and available space */
		len = strlen(name) + 1;

		if (len > PRIVNAME_MAX) {
			rw_exit(&privinfo_lock);
			return (-ENAMETOOLONG);
		}

		for (p = (char *)name; *p != '\0'; p++) {
			char c = *p;

			if (!((c >= 'A' && c <= 'Z') ||
			    (c >= 'a' && c <= 'z') ||
			    (c >= '0' && c <= '9') ||
			    c == '_')) {
				rw_exit(&privinfo_lock);
				return (-EINVAL);
			}
		}

		if (!rw_tryupgrade(&privinfo_lock)) {
			rw_exit(&privinfo_lock);
			rw_enter(&privinfo_lock, RW_WRITER);
			wheld = 1;
			/* Someone may have added our privilege */
			goto rescan;
		}
	}

	if (nprivs == MAX_PRIVILEGE || len + privbytes > maxprivbytes) {
		rw_exit(&privinfo_lock);
		return (-ENOMEM);
	}

	priv_names[i] = p = priv_str + privbytes;

	bcopy(name, p, len);

	/* make the priv_names[i] and privilege name globally visible */
	membar_producer();

	/* adjust priv count and bytes count */
	priv_ninfo->cnt = priv_info->priv_max = ++nprivs;
	privbytes += len;

	rw_exit(&privinfo_lock);
	return (i);
}
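
The membar_producer() above suggests that some readers scan priv_names[] without holding privinfo_lock: the name bytes and table slot are published before the updated count becomes visible. A hypothetical lock-free reader sketch; the matching consumer barrier is the reader's responsibility:

/* Hypothetical lock-free reader; pairs with membar_producer() above. */
const char *
priv_name_lockfree(int priv)
{
	if (priv < 0 || priv >= nprivs)
		return (NULL);
	membar_consumer();	/* order the nprivs load before the name load */
	return (priv_names[priv]);
}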
Example No. 14
/*
 * Query domain table for a given domain.
 *
 * If the domain isn't found, it is added to the AVL trees and
 * the results are pushed out to disk.
 */
int
zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain,
                        dmu_tx_t *tx)
{
    fuid_domain_t searchnode, *findnode;
    avl_index_t loc;
    krw_t rw = RW_READER;

    /*
     * If this is the dummy "nobody" domain, return an index of 0
     * to cause the created FUID to be a standard POSIX id
     * for the user nobody.
     */
    if (domain[0] == '\0') {
        *retdomain = nulldomain;
        return (0);
    }

    searchnode.f_ksid = ksid_lookupdomain(domain);
    if (retdomain) {
        *retdomain = searchnode.f_ksid->kd_name;
    }
    if (!zfsvfs->z_fuid_loaded)
        zfs_fuid_init(zfsvfs, tx);

retry:
    rw_enter(&zfsvfs->z_fuid_lock, rw);
    findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc);

    if (findnode) {
        rw_exit(&zfsvfs->z_fuid_lock);
        ksiddomain_rele(searchnode.f_ksid);
        return (findnode->f_idx);
    } else {
        fuid_domain_t *domnode;
        nvlist_t *nvp;
        nvlist_t **fuids;
        uint64_t retidx;
        size_t nvsize = 0;
        char *packed;
        dmu_buf_t *db;
        int i = 0;

        if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) {
            rw_exit(&zfsvfs->z_fuid_lock);
            rw = RW_WRITER;
            goto retry;
        }

        domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
        domnode->f_ksid = searchnode.f_ksid;

        retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1;

        avl_add(&zfsvfs->z_fuid_domain, domnode);
        avl_add(&zfsvfs->z_fuid_idx, domnode);
        /*
         * Now resync the on-disk nvlist.
         */
        VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

        domnode = avl_first(&zfsvfs->z_fuid_domain);
        fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP);
        while (domnode) {
            VERIFY(nvlist_alloc(&fuids[i],
                                NV_UNIQUE_NAME, KM_SLEEP) == 0);
            VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
                                     domnode->f_idx) == 0);
            VERIFY(nvlist_add_uint64(fuids[i],
                                     FUID_OFFSET, 0) == 0);
            VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN,
                                     domnode->f_ksid->kd_name) == 0);
            domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode);
        }
        VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
                                       fuids, retidx) == 0);
        for (i = 0; i != retidx; i++)
            nvlist_free(fuids[i]);
        kmem_free(fuids, retidx * sizeof (void *));
        VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
        packed = kmem_alloc(nvsize, KM_SLEEP);
        VERIFY(nvlist_pack(nvp, &packed, &nvsize,
                           NV_ENCODE_XDR, KM_SLEEP) == 0);
        nvlist_free(nvp);
        zfsvfs->z_fuid_size = nvsize;
        dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
                  zfsvfs->z_fuid_size, packed, tx);
        kmem_free(packed, zfsvfs->z_fuid_size);
        VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
                                   FTAG, &db));
        dmu_buf_will_dirty(db, tx);
        *(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
        dmu_buf_rele(db, FTAG);

        rw_exit(&zfsvfs->z_fuid_lock);
        return (retidx);
    }
}
Example No. 15
static void
devvt_cleandir(struct vnode *dvp, struct cred *cred)
{
	struct sdev_node *sdvp = VTOSDEV(dvp);
	struct sdev_node *dv, *next = NULL;
	int min, cnt;
	char found = 0;

	mutex_enter(&vc_lock);
	cnt = VC_INSTANCES_COUNT;
	mutex_exit(&vc_lock);

/* We have to fool warlock this way, otherwise it will complain */
#ifndef	__lock_lint
	if (rw_tryupgrade(&sdvp->sdev_contents) == 0) {
		rw_exit(&sdvp->sdev_contents);
		rw_enter(&sdvp->sdev_contents, RW_WRITER);
	}
#else
	rw_enter(&sdvp->sdev_contents, RW_WRITER);
#endif

	/* 1.  prune invalid nodes and rebuild stale symlinks */
	devvt_prunedir(sdvp);

	/* 2. create missing nodes */
	for (min = 0; min < cnt; min++) {
		char nm[16];

		if (vt_minor_valid(min) == B_FALSE)
			continue;

		(void) snprintf(nm, sizeof (nm), "%d", min);
		found = 0;
		for (dv = SDEV_FIRST_ENTRY(sdvp); dv; dv = next) {
			next = SDEV_NEXT_ENTRY(sdvp, dv);

			/* validate only ready nodes */
			if (dv->sdev_state != SDEV_READY)
				continue;
			if (strcmp(nm, dv->sdev_name) == 0) {
				found = 1;
				break;
			}
		}
		if (!found) {
			devvt_create_snode(sdvp, nm, cred, SDEV_VATTR);
		}
	}

	/* 3. create active link node and console user link node */
	found = 0;
	for (dv = SDEV_FIRST_ENTRY(sdvp); dv; dv = next) {
		next = SDEV_NEXT_ENTRY(sdvp, dv);

		/* validate only ready nodes */
		if (dv->sdev_state != SDEV_READY)
			continue;
		if (strcmp(dv->sdev_name, DEVVT_ACTIVE_NAME) == 0)
			found |= 0x01;
		if (strcmp(dv->sdev_name, DEVVT_CONSUSER_NAME) == 0)
			found |= 0x02;

		if ((found & 0x01) && (found & 0x02))
			break;
	}
	if (!(found & 0x01))
		devvt_create_snode(sdvp, DEVVT_ACTIVE_NAME, cred, SDEV_VLINK);
	if (!(found & 0x02))
		devvt_create_snode(sdvp, DEVVT_CONSUSER_NAME, cred, SDEV_VLINK);

#ifndef	__lock_lint
	rw_downgrade(&sdvp->sdev_contents);
#else
	rw_exit(&sdvp->sdev_contents);
#endif
}
Example No. 16
/*
 * This is the predictive prefetch entry point.  It associates the dnode
 * access specified by the blkid and nblks arguments with a prefetch
 * stream, predicts further accesses based on that stream's statistics,
 * and initiates speculative prefetch.  The fetch_data argument specifies
 * whether actual data blocks should be fetched:
 *   FALSE -- prefetch only indirect blocks for predicted data blocks;
 *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
 */
void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
{
	zstream_t *zs;
	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
	int64_t pf_ahead_blks, max_blks;
	int epbs, max_dist_blks, pf_nblks, ipf_nblks;
	uint64_t end_of_access_blkid = blkid + nblks;
	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;

	if (zfs_prefetch_disable)
		return;

	/*
	 * If we haven't yet loaded the indirect vdevs' mappings, we
	 * can only read from blocks that we carefully ensure are on
	 * concrete vdevs (or previously-loaded indirect vdevs).  So we
	 * can't allow the predictive prefetcher to attempt reads of other
	 * blocks (e.g. of the MOS's dnode object).
	 */
	if (!spa_indirect_vdevs_loaded(spa))
		return;

	/*
	 * As a fast path for small (single-block) files, ignore access
	 * to the first block.
	 */
	if (blkid == 0)
		return;

	rw_enter(&zf->zf_rwlock, RW_READER);

	/*
	 * Find matching prefetch stream.  Depending on whether the accesses
	 * are block-aligned, first block of the new access may either follow
	 * the last block of the previous access, or be equal to it.
	 */
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
			mutex_enter(&zs->zs_lock);
			/*
			 * zs_blkid could have changed before we
			 * acquired zs_lock; re-check them here.
			 */
			if (blkid == zs->zs_blkid) {
				break;
			} else if (blkid + 1 == zs->zs_blkid) {
				blkid++;
				nblks--;
				if (nblks == 0) {
					/* Already prefetched this before. */
					mutex_exit(&zs->zs_lock);
					rw_exit(&zf->zf_rwlock);
					return;
				}
				break;
			}
			mutex_exit(&zs->zs_lock);
		}
	}

	if (zs == NULL) {
		/*
		 * This access is not part of any existing stream.  Create
		 * a new stream for it.
		 */
		ZFETCHSTAT_BUMP(zfetchstat_misses);
		if (rw_tryupgrade(&zf->zf_rwlock))
			dmu_zfetch_stream_create(zf, end_of_access_blkid);
		rw_exit(&zf->zf_rwlock);
		return;
	}

	/*
	 * This access was to a block that we issued a prefetch for on
	 * behalf of this stream. Issue further prefetches for this stream.
	 *
	 * Normally, we start prefetching where we stopped
	 * prefetching last (zs_pf_blkid).  But when we get our first
	 * hit on this stream, zs_pf_blkid == zs_blkid, we don't
	 * want to prefetch the block we just accessed.  In this case,
	 * start just after the block we just accessed.
	 */
	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);

	/*
	 * Double our amount of prefetched data, but don't let the
	 * prefetch get further ahead than zfetch_max_distance.
	 */
	if (fetch_data) {
		max_dist_blks =
		    zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
		/*
		 * Previously, we were (zs_pf_blkid - blkid) ahead.  We
		 * want to now be double that, so read that amount again,
		 * plus the amount we are catching up by (i.e. the amount
		 * read just now).
		 */
		pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
		max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
		pf_nblks = MIN(pf_ahead_blks, max_blks);
	} else {
Example No. 17
/*
 * If DV_BUILD is set, we call into nexus driver to do a BUS_CONFIG_ALL.
 * Otherwise, simply return cached dv_node's. Hotplug code always call
 * devfs_clean() to invalid the dv_node cache.
 */
static int
devfs_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp)
{
	struct dv_node *ddv, *dv;
	struct dirent64 *de, *bufp;
	offset_t diroff;
	offset_t	soff;
	size_t reclen, movesz;
	int error;
	struct vattr va;
	size_t bufsz;

	ddv = VTODV(dvp);
	dcmn_err2(("devfs_readdir %s: offset %lld len %ld\n",
	    ddv->dv_name, uiop->uio_loffset, uiop->uio_iov->iov_len));
	ASSERT(ddv->dv_attr || ddv->dv_attrvp);
	ASSERT(RW_READ_HELD(&ddv->dv_contents));

	if (uiop->uio_loffset >= MAXOFF_T) {
		if (eofp)
			*eofp = 1;
		return (0);
	}

	if (uiop->uio_iovcnt != 1)
		return (EINVAL);

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/* Load the initial contents */
	if (ddv->dv_flags & DV_BUILD) {
		if (!rw_tryupgrade(&ddv->dv_contents)) {
			rw_exit(&ddv->dv_contents);
			rw_enter(&ddv->dv_contents, RW_WRITER);
		}

		/* recheck and fill */
		if (ddv->dv_flags & DV_BUILD)
			dv_filldir(ddv);

		rw_downgrade(&ddv->dv_contents);
	}

	soff = uiop->uio_offset;
	bufsz = uiop->uio_iov->iov_len;
	de = bufp = kmem_alloc(bufsz, KM_SLEEP);
	movesz = 0;
	dv = (struct dv_node *)-1;

	/*
	 * Move as many entries into the uio structure as it will take.
	 * Special case "." and "..".
	 */
	diroff = 0;
	if (soff == 0) {				/* . */
		reclen = DIRENT64_RECLEN(strlen("."));
		if ((movesz + reclen) > bufsz)
			goto full;
		de->d_ino = (ino64_t)ddv->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		(void) strncpy(de->d_name, ".", DIRENT64_NAMELEN(reclen));
		movesz += reclen;
		de = (dirent64_t *)((char *)de + reclen);
		dcmn_err3(("devfs_readdir: A: diroff %lld, soff %lld: '%s' "
		    "reclen %lu\n", diroff, soff, ".", reclen));
	}

	diroff++;
	if (soff <= 1) {				/* .. */
		reclen = DIRENT64_RECLEN(strlen(".."));
		if ((movesz + reclen) > bufsz)
			goto full;
		de->d_ino = (ino64_t)ddv->dv_dotdot->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		(void) strncpy(de->d_name, "..", DIRENT64_NAMELEN(reclen));
		movesz += reclen;
		de = (dirent64_t *)((char *)de + reclen);
		dcmn_err3(("devfs_readdir: B: diroff %lld, soff %lld: '%s' "
		    "reclen %lu\n", diroff, soff, "..", reclen));
	}

	diroff++;
	for (dv = ddv->dv_dot; dv; dv = dv->dv_next, diroff++) {
		/*
		 * although DDM_INTERNAL_PATH minor nodes are skipped for
		 * readdirs outside the kernel, they still occupy directory
		 * offsets
		 */
		if (diroff < soff ||
		    ((dv->dv_flags & DV_INTERNAL) && (cred != kcred)))
			continue;

		reclen = DIRENT64_RECLEN(strlen(dv->dv_name));
		if ((movesz + reclen) > bufsz) {
			dcmn_err3(("devfs_readdir: C: diroff "
			    "%lld, soff %lld: '%s' reclen %lu\n",
			    diroff, soff, dv->dv_name, reclen));
			goto full;
		}
		de->d_ino = (ino64_t)dv->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		ASSERT(strlen(dv->dv_name) + 1 <=
		    DIRENT64_NAMELEN(reclen));
		(void) strncpy(de->d_name, dv->dv_name,
		    DIRENT64_NAMELEN(reclen));

		movesz += reclen;
		de = (dirent64_t *)((char *)de + reclen);
		dcmn_err4(("devfs_readdir: D: diroff "
		    "%lld, soff %lld: '%s' reclen %lu\n", diroff, soff,
		    dv->dv_name, reclen));
	}

	/* the buffer is full, or we exhausted everything */
full:	dcmn_err3(("devfs_readdir: moving %lu bytes: "
	    "diroff %lld, soff %lld, dv %p\n",
	    movesz, diroff, soff, (void *)dv));

	if ((movesz == 0) && dv)
		error = EINVAL;		/* cannot be represented */
	else {
		error = uiomove(bufp, movesz, UIO_READ, uiop);
		if (error == 0) {
			if (eofp)
				*eofp = dv ? 0 : 1;
			uiop->uio_offset = diroff;
		}

		va.va_mask = AT_ATIME;
		gethrestime(&va.va_atime);
		rw_exit(&ddv->dv_contents);
		(void) devfs_setattr(dvp, &va, 0, cred, NULL);
		rw_enter(&ddv->dv_contents, RW_READER);
	}

	kmem_free(bufp, bufsz);
	return (error);
}