static int null_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct vnode *lvp, *ldvp; struct ucred *cred = ap->a_cred; int error, locked; if (vp->v_type == VDIR) return (vop_stdvptocnp(ap)); locked = VOP_ISLOCKED(vp); lvp = NULLVPTOLOWERVP(vp); vhold(lvp); VOP_UNLOCK(vp, 0); /* vp is held by vn_vptocnp_locked that called us */ ldvp = lvp; error = vn_vptocnp(&ldvp, cred, ap->a_buf, ap->a_buflen); vdrop(lvp); if (error != 0) { vn_lock(vp, locked | LK_RETRY); return (ENOENT); } /* * Exclusive lock is required by insmntque1 call in * null_nodeget() */ error = vn_lock(ldvp, LK_EXCLUSIVE); if (error != 0) { vn_lock(vp, locked | LK_RETRY); vdrop(ldvp); return (ENOENT); } vref(ldvp); vdrop(ldvp); error = null_nodeget(vp->v_mount, ldvp, dvp); if (error == 0) { #ifdef DIAGNOSTIC NULLVPTOLOWERVP(*dvp); #endif vhold(*dvp); vput(*dvp); } else vput(ldvp); vn_lock(vp, locked | LK_RETRY); return (error); }
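Several of the snippets in this collection share the shape used by null_vptocnp() above: vhold() a second vnode so it cannot be recycled, drop the locked vnode's lock to avoid deadlock or lock-order problems, do the work, then restore the original lock state and vdrop(). The sketch below only distills that shape; it assumes a FreeBSD-style kernel environment matching the surrounding code, and both my_unlocked_op() and do_unlocked_work() are hypothetical names, not functions from any of these trees.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/vnode.h>

static int do_unlocked_work(struct vnode *other);       /* placeholder for the real work */

/*
 * Minimal sketch: hold 'other' across a region where 'vp' must be
 * unlocked, then restore vp's original lock state before returning.
 */
static int
my_unlocked_op(struct vnode *vp, struct vnode *other)
{
        int error, locked;

        locked = VOP_ISLOCKED(vp);      /* remember shared vs. exclusive */
        vhold(other);                   /* prevent 'other' from being freed */
        VOP_UNLOCK(vp, 0);              /* caller's reference keeps vp alive */

        error = do_unlocked_work(other);

        vn_lock(vp, locked | LK_RETRY); /* reacquire in the original mode */
        vdrop(other);
        return (error);
}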
/*ARGSUSED*/ static void znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) { #if 1 /* XXXPJD: From OpenSolaris. */ /* * We should never drop all dbuf refs without first clearing * the eviction callback. */ panic("evicting znode %p\n", user_ptr); #else /* XXXPJD */ znode_t *zp = user_ptr; vnode_t *vp; mutex_enter(&zp->z_lock); zp->z_dbuf = NULL; vp = ZTOV(zp); if (vp == NULL) { mutex_exit(&zp->z_lock); zfs_znode_free(zp); } else if (vp->v_count == 0) { zp->z_vnode = NULL; vhold(vp); mutex_exit(&zp->z_lock); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); vrecycle(vp, curthread); VOP_UNLOCK(vp, 0, curthread); vdrop(vp); zfs_znode_free(zp); } else { mutex_exit(&zp->z_lock); } #endif }
/* * Purge the cache of dead entries * * This is extremely inefficient due to the fact that vgone() not only * indirectly modifies the vnode cache, but may also sleep. We can * neither hold pfs_vncache_mutex across a vgone() call, nor make any * assumptions about the state of the cache after vgone() returns. In * consequence, we must start over after every vgone() call, and keep * trying until we manage to traverse the entire cache. * * The only way to improve this situation is to change the data structure * used to implement the cache. */ static void pfs_purge_locked(struct pfs_node *pn, bool force) { struct pfs_vdata *pvd; struct vnode *vnp; mtx_assert(&pfs_vncache_mutex, MA_OWNED); pvd = pfs_vncache; while (pvd != NULL) { if (force || pvd->pvd_dead || (pn != NULL && pvd->pvd_pn == pn)) { vnp = pvd->pvd_vnode; vhold(vnp); mtx_unlock(&pfs_vncache_mutex); VOP_LOCK(vnp, LK_EXCLUSIVE); vgone(vnp); VOP_UNLOCK(vnp, 0); mtx_lock(&pfs_vncache_mutex); vdrop(vnp); pvd = pfs_vncache; } else { pvd = pvd->pvd_next; } } }
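In the pseudofs code the _locked suffix signals that callers own pfs_vncache_mutex around the call. A convenience wrapper along these lines keeps that contract in one place; the sketch is illustrative (the name pfs_purge() and its exact shape are assumptions here, not necessarily the tree's helper).

/*
 * Sketch: enter the vnode-cache mutex, purge entries matching 'pn'
 * (or only dead entries when pn is NULL), then release the mutex.
 */
static void
pfs_purge(struct pfs_node *pn)
{
        mtx_lock(&pfs_vncache_mutex);
        pfs_purge_locked(pn, false);
        mtx_unlock(&pfs_vncache_mutex);
}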
/* * This opens /dev/tty. Because multiple opens of /dev/tty only * generate a single open to the actual tty, the file modes are * locked to FREAD|FWRITE. */ static int cttyopen(struct dev_open_args *ap) { struct proc *p = curproc; struct vnode *ttyvp; int error; KKASSERT(p); retry: if ((ttyvp = cttyvp(p)) == NULL) return (ENXIO); if (ttyvp->v_flag & VCTTYISOPEN) return (0); /* * Messy interlock, don't let the vnode go away while we try to * lock it and check for race after we might have blocked. */ vhold(ttyvp); vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY); if (ttyvp != cttyvp(p) || (ttyvp->v_flag & VCTTYISOPEN)) { kprintf("Warning: cttyopen: race avoided\n"); vn_unlock(ttyvp); vdrop(ttyvp); goto retry; } vsetflags(ttyvp, VCTTYISOPEN); error = VOP_OPEN(ttyvp, FREAD|FWRITE, ap->a_cred, NULL); if (error) vclrflags(ttyvp, VCTTYISOPEN); vn_unlock(ttyvp); vdrop(ttyvp); return(error); }
/* * Read a symbolic link */ static int pfs_readlink(struct vop_readlink_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct uio *uio = va->a_uio; struct proc *proc = NULL; struct thread *td = curthread; char buf[PATH_MAX]; struct sbuf sb; int error, locked; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); if (vn->v_type != VLNK) PFS_RETURN (EINVAL); KASSERT_PN_IS_LINK(pn); if (pn->pn_fill == NULL) PFS_RETURN (EIO); if (pvd->pvd_pid != NO_PID) { if ((proc = pfind(pvd->pvd_pid)) == NULL) PFS_RETURN (EIO); if (proc->p_flag & P_WEXIT) { PROC_UNLOCK(proc); PFS_RETURN (EIO); } _PHOLD(proc); PROC_UNLOCK(proc); } vhold(vn); locked = VOP_ISLOCKED(vn, td); VOP_UNLOCK(vn, 0, td); /* sbuf_new() can't fail with a static buffer */ sbuf_new(&sb, buf, sizeof buf, 0); error = pn_fill(td, proc, pn, &sb, NULL); if (proc != NULL) PRELE(proc); vn_lock(vn, locked | LK_RETRY, td); vdrop(vn); if (error) { sbuf_delete(&sb); PFS_RETURN (error); } sbuf_finish(&sb); error = uiomove_frombuf(sbuf_data(&sb), sbuf_len(&sb), uio); sbuf_delete(&sb); PFS_RETURN (error); }
/** * ntfs_usnjrnl_stamp - stamp the transaction log ($UsnJrnl) on an ntfs volume * @vol: ntfs volume on which to stamp the transaction log * * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return 0 * on success and errno on error. * * This function assumes that the transaction log has already been loaded and * consistency checked by a call to ntfs_vfsops.c::ntfs_usnjrnl_load(). */ errno_t ntfs_usnjrnl_stamp(ntfs_volume *vol) { ntfs_debug("Entering."); if (!NVolUsnJrnlStamped(vol)) { sle64 j_size, stamp; upl_t upl; upl_page_info_array_t pl; USN_HEADER *uh; ntfs_inode *max_ni; errno_t err; mtx_lock_spin(&vol->usnjrnl_j_ni->size_lock); j_size = vol->usnjrnl_j_ni->data_size; mtx_unlock_spin(&vol->usnjrnl_j_ni->size_lock); max_ni = vol->usnjrnl_max_ni; /* * FIXME: Next If statement always false because of * replacing vnode_get() with vhold() */ vhold(max_ni->vn); if (0) { ntfs_error(vol->mp, "Failed to get vnode for " "$UsnJrnl/$DATA/$Max."); return err; } sx_slock(&max_ni->lock); err = ntfs_page_map(max_ni, 0, &upl, &pl, (u8**)&uh, TRUE); if (err) { ntfs_error(vol->mp, "Failed to read from " "$UsnJrnl/$DATA/$Max attribute."); vdrop(max_ni->vn); return err; } stamp = ntfs_current_time(); ntfs_debug("Stamping transaction log ($UsnJrnl): old " "journal_id 0x%llx, old lowest_valid_usn " "0x%llx, new journal_id 0x%llx, new " "lowest_valid_usn 0x%llx.", (unsigned long long) sle64_to_cpu(uh->journal_id), (unsigned long long) sle64_to_cpu(uh->lowest_valid_usn), (unsigned long long)sle64_to_cpu(stamp), (unsigned long long)j_size); uh->lowest_valid_usn = cpu_to_sle64(j_size); uh->journal_id = stamp; ntfs_page_unmap(max_ni, upl, pl, TRUE); sx_sunlock(&max_ni->lock); vdrop(max_ni->vn); /* Set the flag so we do not have to do it again on remount. */ NVolSetUsnJrnlStamped(vol); // TODO: Should we mark any times on the base inode $UsnJrnl // for update here? } ntfs_debug("Done."); return 0; }
/* * This opens /dev/tty. Because multiple opens of /dev/tty only * generate a single open to the actual tty, the file modes are * locked to FREAD|FWRITE. */ static int cttyopen(struct dev_open_args *ap) { struct proc *p = curproc; struct vnode *ttyvp; int error; KKASSERT(p); retry: if ((ttyvp = cttyvp(p)) == NULL) return (ENXIO); if (ttyvp->v_flag & VCTTYISOPEN) return (0); /* * Messy interlock, don't let the vnode go away while we try to * lock it and check for race after we might have blocked. * * WARNING! The device open (devfs_spec_open()) temporarily * releases the vnode lock on ttyvp when issuing the * dev_dopen(), which means that the VCTTYISOPEN flag * can race during the VOP_OPEN(). * * If something does race we have to undo our potentially * extra open. */ vhold(ttyvp); vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY); if (ttyvp != cttyvp(p) || (ttyvp->v_flag & VCTTYISOPEN)) { kprintf("Warning: cttyopen: race-1 avoided\n"); vn_unlock(ttyvp); vdrop(ttyvp); goto retry; } error = VOP_OPEN(ttyvp, FREAD|FWRITE, ap->a_cred, NULL); /* * Race against ctty close or change. This case has been validated * and occurs every so often during synth builds. */ if (ttyvp != cttyvp(p) || (ttyvp->v_flag & VCTTYISOPEN)) { if (error == 0) VOP_CLOSE(ttyvp, FREAD|FWRITE, NULL); vn_unlock(ttyvp); vdrop(ttyvp); goto retry; } if (error == 0) vsetflags(ttyvp, VCTTYISOPEN); vn_unlock(ttyvp); vdrop(ttyvp); return(error); }
static int udf_write_logvol_dscr_seq(struct udf_strat_args *args) { union dscrptr *dscr = args->dscr; struct udf_mount *ump = args->ump; struct udf_node *udf_node = args->udf_node; struct long_ad *icb = args->icb; int waitfor = args->waitfor; uint32_t logsectornr, sectornr, dummy; int error, vpart; /* * we have to decide if we write it out sequentially or at its fixed * position by examining the partition it is (to be) written on. */ vpart = udf_rw16(udf_node->loc.loc.part_num); logsectornr = udf_rw32(icb->loc.lb_num); sectornr = 0; if (ump->vtop_tp[vpart] != UDF_VTOP_TYPE_VIRT) { error = udf_translate_vtop(ump, icb, &sectornr, &dummy); if (error) goto out; } /* add reference to the vnode to prevent recycling */ vhold(udf_node->vnode); if (waitfor) { DPRINTF(WRITE, ("udf_write_logvol_dscr: sync write\n")); error = udf_write_phys_dscr_sync(ump, udf_node, UDF_C_NODE, dscr, sectornr, logsectornr); } else { DPRINTF(WRITE, ("udf_write_logvol_dscr: no wait, async write\n")); error = udf_write_phys_dscr_async(ump, udf_node, UDF_C_NODE, dscr, sectornr, logsectornr, udf_wr_nodedscr_callback); /* will be UNLOCKED in call back */ return error; } holdrele(udf_node->vnode); out: udf_node->outstanding_nodedscr--; if (udf_node->outstanding_nodedscr == 0) { UDF_UNLOCK_NODE(udf_node, 0); wakeup(&udf_node->outstanding_nodedscr); } return error; }
void vnode_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { struct vnode *vp; struct mount *mp; vm_offset_t inc; VM_OBJECT_WLOCK(object); /* * First, recheck the object type to account for the race when * the vnode is reclaimed. */ if (object->type != OBJT_VNODE) { VM_OBJECT_WUNLOCK(object); return; } /* * Optimize for the case when writemappings is not going to * zero. */ inc = end - start; if (object->un_pager.vnp.writemappings != inc) { object->un_pager.vnp.writemappings -= inc; VM_OBJECT_WUNLOCK(object); return; } vp = object->handle; vhold(vp); VM_OBJECT_WUNLOCK(object); mp = NULL; vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * Decrement the object's writemappings, by swapping the start * and end arguments for vnode_pager_update_writecount(). If * there was not a race with vnode reclaimation, then the * vnode's v_writecount is decremented. */ vnode_pager_update_writecount(object, end, start); VOP_UNLOCK(vp, 0); vdrop(vp); if (mp != NULL) vn_finished_write(mp); }
/* * Filler function for proc/pid/self */ int procfs_doprocfile(PFS_FILL_ARGS) { char *fullpath; char *freepath; struct vnode *textvp; int error; freepath = NULL; PROC_LOCK(p); textvp = p->p_textvp; vhold(textvp); PROC_UNLOCK(p); error = vn_fullpath(td, textvp, &fullpath, &freepath); vdrop(textvp); if (error == 0) sbuf_printf(sb, "%s", fullpath); if (freepath != NULL) free(freepath, M_TEMP); return (error); }
static int ufs_lookup_upgrade_lock(struct vnode *vp) { int error; ASSERT_VOP_LOCKED(vp, __FUNCTION__); if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) return (0); error = 0; /* * Upgrade vnode lock, since getinoquota() * requires exclusive lock to modify inode. */ vhold(vp); vn_lock(vp, LK_UPGRADE | LK_RETRY); VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) error = ENOENT; vdropl(vp); return (error); }
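Callers of ufs_lookup_upgrade_lock() are expected to treat a non-zero return as "the vnode was doomed while the lock was being upgraded" and abort the lookup. A hedged usage sketch follows; ufs_lookup_check_quota() is a hypothetical caller, not a function from the tree.

/*
 * Sketch: make sure we hold the exclusive lock needed by
 * getinoquota() and bail out if the vnode went away meanwhile.
 */
static int
ufs_lookup_check_quota(struct vnode *vp)
{
        int error;

        error = ufs_lookup_upgrade_lock(vp);
        if (error != 0)
                return (error); /* vnode was doomed during the upgrade */
        /* vp is now exclusively locked; quota state may be modified. */
        return (0);
}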
/* * Read from a file */ static int pfs_read(struct vop_read_args *va) { struct vnode *vn = va->a_vp; struct pfs_vdata *pvd = vn->v_data; struct pfs_node *pn = pvd->pvd_pn; struct uio *uio = va->a_uio; struct proc *proc; struct sbuf *sb = NULL; int error, locked; unsigned int buflen, offset, resid; PFS_TRACE(("%s", pn->pn_name)); pfs_assert_not_owned(pn); if (vn->v_type != VREG) PFS_RETURN (EINVAL); KASSERT_PN_IS_FILE(pn); if (!(pn->pn_flags & PFS_RD)) PFS_RETURN (EBADF); if (pn->pn_fill == NULL) PFS_RETURN (EIO); /* * This is necessary because either process' privileges may * have changed since the open() call. */ if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc)) PFS_RETURN (EIO); if (proc != NULL) { _PHOLD(proc); PROC_UNLOCK(proc); } vhold(vn); locked = VOP_ISLOCKED(vn, curthread); VOP_UNLOCK(vn, 0, curthread); if (pn->pn_flags & PFS_RAWRD) { PFS_TRACE(("%lu resid", (unsigned long)uio->uio_resid)); error = pn_fill(curthread, proc, pn, NULL, uio); PFS_TRACE(("%lu resid", (unsigned long)uio->uio_resid)); goto ret; } /* beaucoup sanity checks so we don't ask for bogus allocation */ if (uio->uio_offset < 0 || uio->uio_resid < 0 || (offset = uio->uio_offset) != uio->uio_offset || (resid = uio->uio_resid) != uio->uio_resid || (buflen = offset + resid + 1) < offset || buflen > INT_MAX) { if (proc != NULL) PRELE(proc); error = EINVAL; goto ret; } if (buflen > MAXPHYS + 1) { error = EIO; goto ret; } sb = sbuf_new(sb, NULL, buflen, 0); if (sb == NULL) { error = EIO; goto ret; } error = pn_fill(curthread, proc, pn, sb, uio); if (error) { sbuf_delete(sb); goto ret; } sbuf_finish(sb); error = uiomove_frombuf(sbuf_data(sb), sbuf_len(sb), uio); sbuf_delete(sb); ret: vn_lock(vn, locked | LK_RETRY, curthread); vdrop(vn); if (proc != NULL) PRELE(proc); PFS_RETURN (error); }
/* * Allocates a new vnode for the node node or returns a new reference to * an existing one if the node had already a vnode referencing it. The * resulting locked vnode is returned in *vpp. * * Returns zero on success or an appropriate error code on failure. * * The caller must ensure that node cannot go away (usually by holding * the related directory entry). * * If dnode is non-NULL this routine avoids deadlocking against it but * can return EAGAIN. Caller must try again. The dnode lock will cycle * in this case, it remains locked on return in all cases. dnode must * be shared-locked. */ int tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *dnode, struct tmpfs_node *node, int lkflag, struct vnode **vpp) { int error = 0; struct vnode *vp; loop: /* * Interlocked extraction from node. This can race many things. * We have to get a soft reference on the vnode while we hold * the node locked, then acquire it properly and check for races. */ TMPFS_NODE_LOCK(node); if ((vp = node->tn_vnode) != NULL) { KKASSERT((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0); vhold(vp); TMPFS_NODE_UNLOCK(node); if (dnode) { /* * Special-case handling to avoid deadlocking against * dnode. This case has been validated and occurs * every so often during synth builds. */ if (vget(vp, (lkflag & ~LK_RETRY) | LK_NOWAIT | LK_EXCLUSIVE) != 0) { TMPFS_NODE_UNLOCK(dnode); if (vget(vp, (lkflag & ~LK_RETRY) | LK_SLEEPFAIL | LK_EXCLUSIVE) == 0) { vn_unlock(vp); } vdrop(vp); TMPFS_NODE_LOCK_SH(dnode); return EAGAIN; } } else { /* * Normal path */ if (vget(vp, lkflag | LK_EXCLUSIVE) != 0) { vdrop(vp); goto loop; } } if (node->tn_vnode != vp) { vput(vp); vdrop(vp); goto loop; } vdrop(vp); goto out; } /* vp is NULL */ /* * This should never happen. */ if (node->tn_vpstate & TMPFS_VNODE_DOOMED) { TMPFS_NODE_UNLOCK(node); error = ENOENT; goto out; } /* * Interlock against other calls to tmpfs_alloc_vp() trying to * allocate and assign a vp to node. */ if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) { node->tn_vpstate |= TMPFS_VNODE_WANT; error = tsleep(&node->tn_vpstate, PINTERLOCKED | PCATCH, "tmpfs_alloc_vp", 0); TMPFS_NODE_UNLOCK(node); if (error) return error; goto loop; } node->tn_vpstate |= TMPFS_VNODE_ALLOCATING; TMPFS_NODE_UNLOCK(node); /* * Allocate a new vnode (may block). The ALLOCATING flag should * prevent a race against someone else assigning node->tn_vnode. */ error = getnewvnode(VT_TMPFS, mp, &vp, VLKTIMEOUT, LK_CANRECURSE); if (error != 0) goto unlock; KKASSERT(node->tn_vnode == NULL); KKASSERT(vp != NULL); vp->v_data = node; vp->v_type = node->tn_type; /* Type-specific initialization. */ switch (node->tn_type) { case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VSOCK: break; case VREG: /* * VMIO is mandatory. Tmpfs also supports KVABIO * for its tmpfs_strategy(). */ vsetflags(vp, VKVABIO); vinitvmio(vp, node->tn_size, TMPFS_BLKSIZE, -1); break; case VLNK: break; case VFIFO: vp->v_ops = &mp->mnt_vn_fifo_ops; break; case VDIR: break; default: panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type); } unlock: TMPFS_NODE_LOCK(node); KKASSERT(node->tn_vpstate & TMPFS_VNODE_ALLOCATING); node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING; node->tn_vnode = vp; if (node->tn_vpstate & TMPFS_VNODE_WANT) { node->tn_vpstate &= ~TMPFS_VNODE_WANT; TMPFS_NODE_UNLOCK(node); wakeup(&node->tn_vpstate); } else { TMPFS_NODE_UNLOCK(node); } out: *vpp = vp; KKASSERT(IFF(error == 0, *vpp != NULL && vn_islocked(*vpp))); return error; }
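The header comment above documents the dnode deadlock contract: when a dnode is passed in, tmpfs_alloc_vp() may return EAGAIN and the caller simply retries. A hedged sketch of a caller honoring that contract; tmpfs_get_child_vp() and its retry loop are illustrative only.

/*
 * Sketch: resolve 'node' (a child of 'dnode') to a locked vnode,
 * retrying while tmpfs_alloc_vp() reports the EAGAIN
 * deadlock-avoidance case described in its header comment.
 */
static int
tmpfs_get_child_vp(struct mount *mp, struct tmpfs_node *dnode,
                   struct tmpfs_node *node, struct vnode **vpp)
{
        int error;

        do {
                error = tmpfs_alloc_vp(mp, dnode, node, LK_EXCLUSIVE, vpp);
        } while (error == EAGAIN);
        return (error);
}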
/* * Do a generic nlookup. Note that the passed nd is not nlookup_done()'d * on return, even if an error occurs. If no error occurs or NLC_CREATE * is flagged and ENOENT is returned, then the returned nl_nch is always * referenced and locked exclusively. * * WARNING: For any general error other than ENOENT w/NLC_CREATE, the * resulting nl_nch may or may not be locked and if locked * might be locked either shared or exclusive. * * Intermediate directory elements, including the current directory, require * execute (search) permission. nlookup does not examine the access * permissions on the returned element. * * If NLC_CREATE is set the last directory must allow node creation, * and an error code of 0 will be returned for a non-existent * target (not ENOENT). * * If NLC_RENAME_DST is set the last directory must allow node deletion, * plus the sticky check is made, and an error code of 0 will be returned * for a non-existent target (not ENOENT). * * If NLC_DELETE is set the last directory must allow node deletion, * plus the sticky check is made. * * If NLC_REFDVP is set nd->nl_dvp will be set to the directory vnode * of the returned entry. The vnode will be referenced, but not locked, * and will be released by nlookup_done() along with everything else. * * NOTE: As an optimization we attempt to obtain a shared namecache lock * on any intermediate elements. On success, the returned element * is ALWAYS locked exclusively. */ int nlookup(struct nlookupdata *nd) { globaldata_t gd = mycpu; struct nlcomponent nlc; struct nchandle nch; struct nchandle par; struct nchandle nctmp; struct mount *mp; struct vnode *hvp; /* hold to prevent recyclement */ int wasdotordotdot; char *ptr; char *nptr; int error; int len; int dflags; int hit = 1; int saveflag = nd->nl_flags & ~NLC_NCDIR; boolean_t doretry = FALSE; boolean_t inretry = FALSE; nlookup_start: #ifdef KTRACE if (KTRPOINT(nd->nl_td, KTR_NAMEI)) ktrnamei(nd->nl_td->td_lwp, nd->nl_path); #endif bzero(&nlc, sizeof(nlc)); /* * Setup for the loop. The current working namecache element is * always at least referenced. We lock it as required, but always * return a locked, resolved namecache entry. */ nd->nl_loopcnt = 0; if (nd->nl_dvp) { vrele(nd->nl_dvp); nd->nl_dvp = NULL; } ptr = nd->nl_path; /* * Loop on the path components. At the top of the loop nd->nl_nch * is ref'd and unlocked and represents our current position. */ for (;;) { /* * Make sure nl_nch is locked so we can access the vnode, resolution * state, etc. */ if ((nd->nl_flags & NLC_NCPISLOCKED) == 0) { nd->nl_flags |= NLC_NCPISLOCKED; cache_lock_maybe_shared(&nd->nl_nch, wantsexcllock(nd, ptr)); } /* * Check if the root directory should replace the current * directory. This is done at the start of a translation * or after a symbolic link has been found. In other cases * ptr will never be pointing at a '/'. */ if (*ptr == '/') { do { ++ptr; } while (*ptr == '/'); cache_unlock(&nd->nl_nch); cache_get_maybe_shared(&nd->nl_rootnch, &nch, wantsexcllock(nd, ptr)); if (nd->nl_flags & NLC_NCDIR) { cache_drop_ncdir(&nd->nl_nch); nd->nl_flags &= ~NLC_NCDIR; } else { cache_drop(&nd->nl_nch); } nd->nl_nch = nch; /* remains locked */ /* * Fast-track termination. There is no parent directory of * the root in the same mount from the point of view of * the caller so return EACCES if NLC_REFDVP is specified, * and EEXIST if NLC_CREATE is also specified. * e.g. 'rmdir /' or 'mkdir /' are not allowed. */ if (*ptr == 0) { if (nd->nl_flags & NLC_REFDVP) error = (nd->nl_flags & NLC_CREATE) ? 
EEXIST : EACCES; else error = 0; break; } continue; } /* * Pre-calculate next path component so we can check whether the * current component directory is the last directory in the path * or not. */ for (nptr = ptr; *nptr && *nptr != '/'; ++nptr) ; /* * Check directory search permissions (nd->nl_nch is locked & refd). * This will load dflags to obtain directory-special permissions to * be checked along with the last component. * * We only need to pass-in &dflags for the second-to-last component. * Optimize by passing-in NULL for any prior components, which may * allow the code to bypass the naccess() call. */ dflags = 0; if (*nptr == '/') error = naccess(&nd->nl_nch, NLC_EXEC, nd->nl_cred, NULL); else error = naccess(&nd->nl_nch, NLC_EXEC, nd->nl_cred, &dflags); if (error) { if (keeperror(nd, error)) break; error = 0; } /* * Extract the next (or last) path component. Path components are * limited to 255 characters. */ nlc.nlc_nameptr = ptr; nlc.nlc_namelen = nptr - ptr; ptr = nptr; if (nlc.nlc_namelen >= 256) { error = ENAMETOOLONG; break; } /* * Lookup the path component in the cache, creating an unresolved * entry if necessary. We have to handle "." and ".." as special * cases. * * When handling ".." we have to detect a traversal back through a * mount point. If we are at the root, ".." just returns the root. * * When handling "." or ".." we also have to recalculate dflags * since our dflags will be for some sub-directory instead of the * parent dir. * * This subsection returns a locked, refd 'nch' unless it errors out, * and an unlocked but still ref'd nd->nl_nch. * * The namecache topology is not allowed to be disconnected, so * encountering a NULL parent will generate EINVAL. This typically * occurs when a directory is removed out from under a process. * * WARNING! The unlocking of nd->nl_nch is sensitive code. */ KKASSERT(nd->nl_flags & NLC_NCPISLOCKED); if (nlc.nlc_namelen == 1 && nlc.nlc_nameptr[0] == '.') { cache_unlock(&nd->nl_nch); nd->nl_flags &= ~NLC_NCPISLOCKED; cache_get_maybe_shared(&nd->nl_nch, &nch, wantsexcllock(nd, ptr)); wasdotordotdot = 1; } else if (nlc.nlc_namelen == 2 && nlc.nlc_nameptr[0] == '.' && nlc.nlc_nameptr[1] == '.') { if (nd->nl_nch.mount == nd->nl_rootnch.mount && nd->nl_nch.ncp == nd->nl_rootnch.ncp ) { /* * ".." at the root returns the root */ cache_unlock(&nd->nl_nch); nd->nl_flags &= ~NLC_NCPISLOCKED; cache_get_maybe_shared(&nd->nl_nch, &nch, wantsexcllock(nd, ptr)); } else { /* * Locate the parent ncp. If we are at the root of a * filesystem mount we have to skip to the mounted-on * point in the underlying filesystem. * * Expect the parent to always be good since the * mountpoint doesn't go away. XXX hack. cache_get() * requires the ncp to already have a ref as a safety. * * However, a process which has been broken out of a chroot * will wind up with a NULL parent if it tries to '..' above * the real root, deal with the case. Note that this does * not protect us from a jail breakout, it just stops a panic * if the jail-broken process tries to '..' past the real * root. */ nctmp = nd->nl_nch; while (nctmp.ncp == nctmp.mount->mnt_ncmountpt.ncp) { nctmp = nctmp.mount->mnt_ncmounton; if (nctmp.ncp == NULL) break; } if (nctmp.ncp == NULL) { if (curthread->td_proc) { kprintf("vfs_nlookup: '..' 
traverse broke " "jail: pid %d (%s)\n", curthread->td_proc->p_pid, curthread->td_comm); } nctmp = nd->nl_rootnch; } else { nctmp.ncp = nctmp.ncp->nc_parent; } cache_hold(&nctmp); cache_unlock(&nd->nl_nch); nd->nl_flags &= ~NLC_NCPISLOCKED; cache_get_maybe_shared(&nctmp, &nch, wantsexcllock(nd, ptr)); cache_drop(&nctmp); /* NOTE: zero's nctmp */ } wasdotordotdot = 2; } else { /* * Must unlock nl_nch when traversing down the path. However, * the child ncp has not yet been found/created and the parent's * child list might be empty. Thus releasing the lock can * allow a race whereby the parent ncp's vnode is recycled. * This case can occur especially when maxvnodes is set very low. * * We need the parent's ncp to remain resolved for all normal * filesystem activities, so we vhold() the vp during the lookup * to prevent recyclement due to vnlru / maxvnodes. * * If we race an unlink or rename the ncp might be marked * DESTROYED after resolution, requiring a retry. */ if ((hvp = nd->nl_nch.ncp->nc_vp) != NULL) vhold(hvp); cache_unlock(&nd->nl_nch); nd->nl_flags &= ~NLC_NCPISLOCKED; error = cache_nlookup_maybe_shared(&nd->nl_nch, &nlc, wantsexcllock(nd, ptr), &nch); if (error == EWOULDBLOCK) { nch = cache_nlookup(&nd->nl_nch, &nlc); if (nch.ncp->nc_flag & NCF_UNRESOLVED) hit = 0; for (;;) { error = cache_resolve(&nch, nd->nl_cred); if (error != EAGAIN && (nch.ncp->nc_flag & NCF_DESTROYED) == 0) { if (error == ESTALE) { if (!inretry) error = ENOENT; doretry = TRUE; } break; } kprintf("[diagnostic] nlookup: relookup %*.*s\n", nch.ncp->nc_nlen, nch.ncp->nc_nlen, nch.ncp->nc_name); cache_put(&nch); nch = cache_nlookup(&nd->nl_nch, &nlc); } } if (hvp) vdrop(hvp); wasdotordotdot = 0; } /* * If the last component was "." or ".." our dflags no longer * represents the parent directory and we have to explicitly * look it up. * * Expect the parent to be good since nch is locked. */ if (wasdotordotdot && error == 0) { dflags = 0; if ((par.ncp = nch.ncp->nc_parent) != NULL) { par.mount = nch.mount; cache_hold(&par); cache_lock_maybe_shared(&par, wantsexcllock(nd, ptr)); error = naccess(&par, 0, nd->nl_cred, &dflags); cache_put(&par); if (error) { if (!keeperror(nd, error)) error = 0; } } } /* * [end of subsection] * * nch is locked and referenced. * nd->nl_nch is unlocked and referenced. * * nl_nch must be unlocked or we could chain lock to the root * if a resolve gets stuck (e.g. in NFS). */ KKASSERT((nd->nl_flags & NLC_NCPISLOCKED) == 0); /* * Resolve the namespace if necessary. The ncp returned by * cache_nlookup() is referenced and locked. * * XXX neither '.' nor '..' should return EAGAIN since they were * previously resolved and thus cannot be newly created ncp's. */ if (nch.ncp->nc_flag & NCF_UNRESOLVED) { hit = 0; error = cache_resolve(&nch, nd->nl_cred); if (error == ESTALE) { if (!inretry) error = ENOENT; doretry = TRUE; } KKASSERT(error != EAGAIN); } else { error = nch.ncp->nc_error; } /* * Early completion. ENOENT is not an error if this is the last * component and NLC_CREATE or NLC_RENAME (rename target) was * requested. Note that ncp->nc_error is left as ENOENT in that * case, which we check later on. * * Also handle invalid '.' or '..' components terminating a path * for a create/rename/delete. The standard requires this and pax * pretty stupidly depends on it. 
*/ if (islastelement(ptr)) { if (error == ENOENT && (nd->nl_flags & (NLC_CREATE | NLC_RENAME_DST)) ) { if (nd->nl_flags & NLC_NFS_RDONLY) { error = EROFS; } else { error = naccess(&nch, nd->nl_flags | dflags, nd->nl_cred, NULL); } } if (error == 0 && wasdotordotdot && (nd->nl_flags & (NLC_CREATE | NLC_DELETE | NLC_RENAME_SRC | NLC_RENAME_DST))) { /* * POSIX junk */ if (nd->nl_flags & NLC_CREATE) error = EEXIST; else if (nd->nl_flags & NLC_DELETE) error = (wasdotordotdot == 1) ? EINVAL : ENOTEMPTY; else error = EINVAL; } } /* * Early completion on error. */ if (error) { cache_put(&nch); break; } /* * If the element is a symlink and it is either not the last * element or it is the last element and we are allowed to * follow symlinks, resolve the symlink. */ if ((nch.ncp->nc_flag & NCF_ISSYMLINK) && (*ptr || (nd->nl_flags & NLC_FOLLOW)) ) { if (nd->nl_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; cache_put(&nch); break; } error = nreadsymlink(nd, &nch, &nlc); cache_put(&nch); if (error) break; /* * Concatenate trailing path elements onto the returned symlink. * Note that if the path component (ptr) is not exhausted, it * will being with a '/', so we do not have to add another one. * * The symlink may not be empty. */ len = strlen(ptr); if (nlc.nlc_namelen == 0 || nlc.nlc_namelen + len >= MAXPATHLEN) { error = nlc.nlc_namelen ? ENAMETOOLONG : ENOENT; objcache_put(namei_oc, nlc.nlc_nameptr); break; } bcopy(ptr, nlc.nlc_nameptr + nlc.nlc_namelen, len + 1); if (nd->nl_flags & NLC_HASBUF) objcache_put(namei_oc, nd->nl_path); nd->nl_path = nlc.nlc_nameptr; nd->nl_flags |= NLC_HASBUF; ptr = nd->nl_path; /* * Go back up to the top to resolve any initial '/'s in the * symlink. */ continue; } /* * If the element is a directory and we are crossing a mount point, * Locate the mount. */ while ((nch.ncp->nc_flag & NCF_ISMOUNTPT) && (nd->nl_flags & NLC_NOCROSSMOUNT) == 0 && (mp = cache_findmount(&nch)) != NULL ) { struct vnode *tdp; int vfs_do_busy = 0; /* * VFS must be busied before the namecache entry is locked, * but we don't want to waste time calling vfs_busy() if the * mount point is already resolved. */ again: cache_put(&nch); if (vfs_do_busy) { while (vfs_busy(mp, 0)) { if (mp->mnt_kern_flag & MNTK_UNMOUNT) { kprintf("nlookup: warning umount race avoided\n"); cache_dropmount(mp); error = EBUSY; vfs_do_busy = 0; goto double_break; } } } cache_get_maybe_shared(&mp->mnt_ncmountpt, &nch, wantsexcllock(nd, ptr)); if (nch.ncp->nc_flag & NCF_UNRESOLVED) { if (vfs_do_busy == 0) { vfs_do_busy = 1; goto again; } error = VFS_ROOT(mp, &tdp); vfs_unbusy(mp); vfs_do_busy = 0; if (keeperror(nd, error)) { cache_dropmount(mp); break; } if (error == 0) { cache_setvp(&nch, tdp); vput(tdp); } } if (vfs_do_busy) vfs_unbusy(mp); cache_dropmount(mp); } if (keeperror(nd, error)) { cache_put(&nch); double_break: break; } /* * Skip any slashes to get to the next element. If there * are any slashes at all the current element must be a * directory or, in the create case, intended to become a directory. * If it isn't we break without incrementing ptr and fall through * to the failure case below. */ while (*ptr == '/') { if ((nch.ncp->nc_flag & NCF_ISDIR) == 0 && !(nd->nl_flags & NLC_WILLBEDIR) ) { break; } ++ptr; } /* * Continuation case: additional elements and the current * element is a directory. 
*/ if (*ptr && (nch.ncp->nc_flag & NCF_ISDIR)) { if (nd->nl_flags & NLC_NCDIR) { cache_drop_ncdir(&nd->nl_nch); nd->nl_flags &= ~NLC_NCDIR; } else { cache_drop(&nd->nl_nch); } cache_unlock(&nch); KKASSERT((nd->nl_flags & NLC_NCPISLOCKED) == 0); nd->nl_nch = nch; continue; } /* * Failure case: additional elements and the current element * is not a directory */ if (*ptr) { cache_put(&nch); error = ENOTDIR; break; } /* * Successful lookup of last element. * * Check permissions if the target exists. If the target does not * exist directory permissions were already tested in the early * completion code above. * * nd->nl_flags will be adjusted on return with NLC_APPENDONLY * if the file is marked append-only, and NLC_STICKY if the directory * containing the file is sticky. */ if (nch.ncp->nc_vp && (nd->nl_flags & NLC_ALLCHKS)) { error = naccess(&nch, nd->nl_flags | dflags, nd->nl_cred, NULL); if (keeperror(nd, error)) { cache_put(&nch); break; } } /* * Termination: no more elements. * * If NLC_REFDVP is set acquire a referenced parent dvp. */ if (nd->nl_flags & NLC_REFDVP) { cache_lock(&nd->nl_nch); error = cache_vref(&nd->nl_nch, nd->nl_cred, &nd->nl_dvp); cache_unlock(&nd->nl_nch); if (keeperror(nd, error)) { kprintf("NLC_REFDVP: Cannot ref dvp of %p\n", nch.ncp); cache_put(&nch); break; } } if (nd->nl_flags & NLC_NCDIR) { cache_drop_ncdir(&nd->nl_nch); nd->nl_flags &= ~NLC_NCDIR; } else { cache_drop(&nd->nl_nch); } nd->nl_nch = nch; nd->nl_flags |= NLC_NCPISLOCKED; error = 0; break; } if (hit) ++gd->gd_nchstats->ncs_longhits; else ++gd->gd_nchstats->ncs_longmiss; if (nd->nl_flags & NLC_NCPISLOCKED) KKASSERT(cache_lockstatus(&nd->nl_nch) > 0); /* * Retry the whole thing if doretry flag is set, but only once. * autofs(5) may mount another filesystem under its root directory * while resolving a path. */ if (doretry && !inretry) { inretry = TRUE; nd->nl_flags &= NLC_NCDIR; nd->nl_flags |= saveflag; goto nlookup_start; } /* * NOTE: If NLC_CREATE was set the ncp may represent a negative hit * (ncp->nc_error will be ENOENT), but we will still return an error * code of 0. */ return(error); }
/* * vp is the current namei directory * ndp is the name to locate in that directory... */ static int fdesc_lookup(struct vop_lookup_args *ap) { struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; char *pname = cnp->cn_nameptr; struct thread *td = cnp->cn_thread; struct file *fp; struct fdesc_get_ino_args arg; cap_rights_t rights; int nlen = cnp->cn_namelen; u_int fd, fd1; int error; struct vnode *fvp; if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto bad; } if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; VREF(dvp); return (0); } if (VTOFDESC(dvp)->fd_type != Froot) { error = ENOTDIR; goto bad; } fd = 0; /* the only time a leading 0 is acceptable is if it's "0" */ if (*pname == '0' && nlen != 1) { error = ENOENT; goto bad; } while (nlen--) { if (*pname < '0' || *pname > '9') { error = ENOENT; goto bad; } fd1 = 10 * fd + *pname++ - '0'; if (fd1 < fd) { error = ENOENT; goto bad; } fd = fd1; } /* * No rights to check since 'fp' isn't actually used. */ if ((error = fget(td, fd, cap_rights_init(&rights), &fp)) != 0) goto bad; /* Check if we're looking up ourselves. */ if (VTOFDESC(dvp)->fd_ix == FD_DESC + fd) { /* * In case we're holding the last reference to the file, the dvp * will be re-acquired. */ vhold(dvp); VOP_UNLOCK(dvp, 0); fdrop(fp, td); /* Re-aquire the lock afterwards. */ vn_lock(dvp, LK_RETRY | LK_EXCLUSIVE); vdrop(dvp); fvp = dvp; if ((dvp->v_iflag & VI_DOOMED) != 0) error = ENOENT; } else { /* * Unlock our root node (dvp) when doing this, since we might * deadlock since the vnode might be locked by another thread * and the root vnode lock will be obtained afterwards (in case * we're looking up the fd of the root vnode), which will be the * opposite lock order. Vhold the root vnode first so we don't * lose it. */ arg.ftype = Fdesc; arg.fd_fd = fd; arg.ix = FD_DESC + fd; arg.fp = fp; arg.td = td; error = vn_vget_ino_gen(dvp, fdesc_get_ino_alloc, &arg, LK_EXCLUSIVE, &fvp); } if (error) goto bad; *vpp = fvp; return (0); bad: *vpp = NULL; return (error); }
/* * This is very similar to vmntvnodescan() but it only scans the * vnodes on the syncer list. VFS's which support faster VFS_SYNC * operations use the VISDIRTY flag on the vnode to ensure that vnodes * with dirty inodes are added to the syncer in addition to vnodes * with dirty buffers, and can use this function instead of vmntvnodescan(). * * This is important when a system has millions of vnodes. */ int vsyncscan( struct mount *mp, int vmsc_flags, int (*slowfunc)(struct mount *mp, struct vnode *vp, void *data), void *data ) { struct syncer_ctx *ctx; struct synclist *slp; struct vnode *vp; int b; int i; int lkflags; if (vmsc_flags & VMSC_NOWAIT) lkflags = LK_NOWAIT; else lkflags = 0; /* * Syncer list context. This API requires a dedicated syncer thread. * (MNTK_THR_SYNC). */ KKASSERT(mp->mnt_kern_flag & MNTK_THR_SYNC); ctx = mp->mnt_syncer_ctx; lwkt_gettoken(&ctx->sc_token); /* * Setup for loop. Allow races against the syncer thread but * require that the syncer thread not be lazy if we were told * not to be lazy. */ b = ctx->syncer_delayno & ctx->syncer_mask; i = b; if ((vmsc_flags & VMSC_NOWAIT) == 0) ++ctx->syncer_forced; do { slp = &ctx->syncer_workitem_pending[i]; while ((vp = LIST_FIRST(slp)) != NULL) { KKASSERT(vp->v_mount == mp); if (vmsc_flags & VMSC_GETVP) { if (vget(vp, LK_EXCLUSIVE | lkflags) == 0) { slowfunc(mp, vp, data); vput(vp); } } else if (vmsc_flags & VMSC_GETVX) { vx_get(vp); slowfunc(mp, vp, data); vx_put(vp); } else { vhold(vp); slowfunc(mp, vp, data); vdrop(vp); } if (LIST_FIRST(slp) == vp) vn_syncer_add(vp, -(i + syncdelay)); } i = (i + 1) & ctx->syncer_mask; } while (i != b); if ((vmsc_flags & VMSC_NOWAIT) == 0) --ctx->syncer_forced; lwkt_reltoken(&ctx->sc_token); return(0); }
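vsyncscan() hands each vnode on the syncer list to the supplied slowfunc with the protection selected by vmsc_flags (vget, vx_get, or a bare vhold/vdrop). A hedged example of wiring it up from a filesystem's sync routine follows; myfs_sync_scanner() and myfs_sync() are illustrative names, and the sketch assumes the mount runs the dedicated syncer thread (MNTK_THR_SYNC) as required above.

/*
 * Sketch: per-vnode callback; vsyncscan() has already applied the
 * requested vget/vhold protection before calling us.
 */
static int
myfs_sync_scanner(struct mount *mp, struct vnode *vp, void *data)
{
        int waitfor = *(int *)data;

        return (VOP_FSYNC(vp, waitfor, 0));
}

/* Sketch: typical call from a VFS_SYNC implementation. */
static int
myfs_sync(struct mount *mp, int waitfor)
{
        int flags;

        flags = VMSC_GETVP;
        if (waitfor == MNT_NOWAIT)
                flags |= VMSC_NOWAIT;
        return (vsyncscan(mp, flags, myfs_sync_scanner, &waitfor));
}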
/* * Get the vnode associated with the given inode, allocating the vnode if * necessary. The vnode will be returned exclusively locked. * * The caller must lock the inode (shared or exclusive). * * Great care must be taken to avoid deadlocks and vnode acquisition/reclaim * races. */ struct vnode * hammer2_igetv(hammer2_inode_t *ip, int *errorp) { hammer2_inode_data_t *ipdata; hammer2_pfsmount_t *pmp; struct vnode *vp; ccms_state_t ostate; pmp = ip->pmp; KKASSERT(pmp != NULL); *errorp = 0; ipdata = &ip->chain->data->ipdata; for (;;) { /* * Attempt to reuse an existing vnode assignment. It is * possible to race a reclaim so the vget() may fail. The * inode must be unlocked during the vget() to avoid a * deadlock against a reclaim. */ vp = ip->vp; if (vp) { /* * Inode must be unlocked during the vget() to avoid * possible deadlocks, but leave the ip ref intact. * * vnode is held to prevent destruction during the * vget(). The vget() can still fail if we lost * a reclaim race on the vnode. */ vhold(vp); ostate = hammer2_inode_lock_temp_release(ip); if (vget(vp, LK_EXCLUSIVE)) { vdrop(vp); hammer2_inode_lock_temp_restore(ip, ostate); continue; } hammer2_inode_lock_temp_restore(ip, ostate); vdrop(vp); /* vp still locked and ref from vget */ if (ip->vp != vp) { kprintf("hammer2: igetv race %p/%p\n", ip->vp, vp); vput(vp); continue; } *errorp = 0; break; } /* * No vnode exists, allocate a new vnode. Beware of * allocation races. This function will return an * exclusively locked and referenced vnode. */ *errorp = getnewvnode(VT_HAMMER2, pmp->mp, &vp, 0, 0); if (*errorp) { kprintf("hammer2: igetv getnewvnode failed %d\n", *errorp); vp = NULL; break; } /* * Lock the inode and check for an allocation race. */ ostate = hammer2_inode_lock_upgrade(ip); if (ip->vp != NULL) { vp->v_type = VBAD; vx_put(vp); hammer2_inode_lock_downgrade(ip, ostate); continue; } switch (ipdata->type) { case HAMMER2_OBJTYPE_DIRECTORY: vp->v_type = VDIR; break; case HAMMER2_OBJTYPE_REGFILE: vp->v_type = VREG; vinitvmio(vp, ipdata->size, HAMMER2_LBUFSIZE, (int)ipdata->size & HAMMER2_LBUFMASK); break; case HAMMER2_OBJTYPE_SOFTLINK: /* * XXX for now we are using the generic file_read * and file_write code so we need a buffer cache * association. */ vp->v_type = VLNK; vinitvmio(vp, ipdata->size, HAMMER2_LBUFSIZE, (int)ipdata->size & HAMMER2_LBUFMASK); break; case HAMMER2_OBJTYPE_CDEV: vp->v_type = VCHR; /* fall through */ case HAMMER2_OBJTYPE_BDEV: vp->v_ops = &pmp->mp->mnt_vn_spec_ops; if (ipdata->type != HAMMER2_OBJTYPE_CDEV) vp->v_type = VBLK; addaliasu(vp, ipdata->rmajor, ipdata->rminor); break; case HAMMER2_OBJTYPE_FIFO: vp->v_type = VFIFO; vp->v_ops = &pmp->mp->mnt_vn_fifo_ops; break; default: panic("hammer2: unhandled objtype %d", ipdata->type); break; } if (ip == pmp->iroot) vsetflags(vp, VROOT); vp->v_data = ip; ip->vp = vp; hammer2_inode_ref(ip); /* vp association */ hammer2_inode_lock_downgrade(ip, ostate); break; } /* * Return non-NULL vp and *errorp == 0, or NULL vp and *errorp != 0. */ if (hammer2_debug & 0x0002) { kprintf("igetv vp %p refs 0x%08x aux 0x%08x\n", vp, vp->v_refcnt, vp->v_auxrefs); } return (vp); }
/* * Convert a vnode to its component name */ static int pfs_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct pfs_vdata *pvd = vp->v_data; struct pfs_node *pd = pvd->pvd_pn; struct pfs_node *pn; struct mount *mp; char *buf = ap->a_buf; int *buflen = ap->a_buflen; char pidbuf[PFS_NAMELEN]; pid_t pid = pvd->pvd_pid; int len, i, error, locked; i = *buflen; error = 0; pfs_lock(pd); if (vp->v_type == VDIR && pd->pn_type == pfstype_root) { *dvp = vp; vhold(*dvp); pfs_unlock(pd); PFS_RETURN (0); } else if (vp->v_type == VDIR && pd->pn_type == pfstype_procdir) { len = snprintf(pidbuf, sizeof(pidbuf), "%d", pid); i -= len; if (i < 0) { error = ENOMEM; goto failed; } bcopy(pidbuf, buf + i, len); } else { len = strlen(pd->pn_name); i -= len; if (i < 0) { error = ENOMEM; goto failed; } bcopy(pd->pn_name, buf + i, len); } pn = pd->pn_parent; pfs_unlock(pd); mp = vp->v_mount; error = vfs_busy(mp, 0); if (error) return (error); /* * vp is held by caller. */ locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); error = pfs_vncache_alloc(mp, dvp, pn, pid); if (error) { vn_lock(vp, locked | LK_RETRY); vfs_unbusy(mp); PFS_RETURN(error); } *buflen = i; vhold(*dvp); vput(*dvp); vn_lock(vp, locked | LK_RETRY); vfs_unbusy(mp); PFS_RETURN (0); failed: pfs_unlock(pd); PFS_RETURN(error); }
static int tmpfs_lookup(struct vop_cachedlookup_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; int error; struct tmpfs_dirent *de; struct tmpfs_node *dnode; dnode = VP_TO_TMPFS_DIR(dvp); *vpp = NULLVP; /* Check accessibility of requested node as a first step. */ error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread); if (error != 0) goto out; /* We cannot be requesting the parent directory of the root node. */ MPASS(IMPLIES(dnode->tn_type == VDIR && dnode->tn_dir.tn_parent == dnode, !(cnp->cn_flags & ISDOTDOT))); TMPFS_ASSERT_LOCKED(dnode); if (dnode->tn_dir.tn_parent == NULL) { error = ENOENT; goto out; } if (cnp->cn_flags & ISDOTDOT) { int ltype = 0; ltype = VOP_ISLOCKED(dvp); vhold(dvp); VOP_UNLOCK(dvp, 0); /* Allocate a new vnode on the matching entry. */ error = tmpfs_alloc_vp(dvp->v_mount, dnode->tn_dir.tn_parent, cnp->cn_lkflags, vpp); vn_lock(dvp, ltype | LK_RETRY); vdrop(dvp); } else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { VREF(dvp); *vpp = dvp; error = 0; } else { de = tmpfs_dir_lookup(dnode, NULL, cnp); if (de != NULL && de->td_node == NULL) cnp->cn_flags |= ISWHITEOUT; if (de == NULL || de->td_node == NULL) { /* The entry was not found in the directory. * This is OK if we are creating or renaming an * entry and are working on the last component of * the path name. */ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == CREATE || \ cnp->cn_nameiop == RENAME || (cnp->cn_nameiop == DELETE && cnp->cn_flags & DOWHITEOUT && cnp->cn_flags & ISWHITEOUT))) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, cnp->cn_thread); if (error != 0) goto out; /* Keep the component name in the buffer for * future uses. */ cnp->cn_flags |= SAVENAME; error = EJUSTRETURN; } else error = ENOENT; } else { struct tmpfs_node *tnode; /* The entry was found, so get its associated * tmpfs_node. */ tnode = de->td_node; /* If we are not at the last path component and * found a non-directory or non-link entry (which * may itself be pointing to a directory), raise * an error. */ if ((tnode->tn_type != VDIR && tnode->tn_type != VLNK) && !(cnp->cn_flags & ISLASTCN)) { error = ENOTDIR; goto out; } /* If we are deleting or renaming the entry, keep * track of its tmpfs_dirent so that it can be * easily deleted later. */ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, cnp->cn_thread); if (error != 0) goto out; /* Allocate a new vnode on the matching entry. */ error = tmpfs_alloc_vp(dvp->v_mount, tnode, cnp->cn_lkflags, vpp); if (error != 0) goto out; if ((dnode->tn_mode & S_ISTXT) && VOP_ACCESS(dvp, VADMIN, cnp->cn_cred, cnp->cn_thread) && VOP_ACCESS(*vpp, VADMIN, cnp->cn_cred, cnp->cn_thread)) { error = EPERM; vput(*vpp); *vpp = NULL; goto out; } cnp->cn_flags |= SAVENAME; } else { error = tmpfs_alloc_vp(dvp->v_mount, tnode, cnp->cn_lkflags, vpp); } } } /* Store the result of this lookup in the cache. Avoid this if the * request was for creation, as it does not improve timings on * emprical tests. */ if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE) cache_enter(dvp, *vpp, cnp); out: /* If there were no errors, *vpp cannot be null and it must be * locked. */ MPASS(IFF(error == 0, *vpp != NULLVP && VOP_ISLOCKED(*vpp))); return error; }
int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { vm_prot_t prot; long ahead, behind; int alloc_req, era, faultcount, nera, reqpage, result; boolean_t growstack, is_first_object_locked, wired; int map_generation; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ_MAX]; int hardfault; struct faultstate fs; struct vnode *vp; int locked, error; hardfault = 0; growstack = TRUE; PCPU_INC(cnt.v_vm_faults); fs.vp = NULL; faultcount = reqpage = 0; RetryFault:; /* * Find the backing store object and offset into it to begin the * search. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { if (growstack && result == KERN_INVALID_ADDRESS && map != kernel_map) { result = vm_map_growstack(curproc, vaddr); if (result != KERN_SUCCESS) return (KERN_FAILURE); growstack = FALSE; goto RetryFault; } return (result); } map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { panic("vm_fault: fault on nofault entry, addr: %lx", (u_long)vaddr); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. This must be done after * obtaining the vnode lock in order to avoid possible deadlocks. */ VM_OBJECT_WLOCK(fs.first_object); vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.lookup_still_valid = TRUE; if (wired) fault_type = prot | (fault_type & VM_PROT_COPY); fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { /* * If the object is dead, we stop here */ if (fs.object->flags & OBJ_DEAD) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { /* * check for page-based copy on write. * We check fs.object == fs.first_object so * as to ensure the legacy COW mechanism is * used when the page in question is part of * a shadow object. Otherwise, vm_page_cowfault() * removes the page from the backing object, * which is not what we want. */ vm_page_lock(fs.m); if ((fs.m->cow) && (fault_type & VM_PROT_WRITE) && (fs.object == fs.first_object)) { vm_page_cowfault(fs.m); unlock_and_deallocate(&fs); goto RetryFault; } /* * Wait/Retry if the page is busy. We have to do this * if the page is busy via either VPO_BUSY or * vm_page_t->busy because the vm_pager may be using * vm_page_t->busy for pageouts ( and even pageins if * it is the vnode pager ), and we could end up trying * to pagein and pageout the same page simultaneously. * * We can theoretically allow the busy case on a read * fault if the page is marked valid, but since such * pages are typically already pmap'd, putting that * special case in might be more effort then it is * worth. We cannot under any circumstances mess * around with a vm_page_t->busy page except, perhaps, * to pmap it. */ if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. 
*/ vm_page_aflag_set(fs.m, PGA_REFERENCED); vm_page_unlock(fs.m); if (fs.object != fs.first_object) { if (!VM_OBJECT_TRYWLOCK( fs.first_object)) { VM_OBJECT_WUNLOCK(fs.object); VM_OBJECT_WLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.object); } vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); vm_object_pip_wakeup(fs.first_object); VM_OBJECT_WUNLOCK(fs.first_object); fs.first_m = NULL; } unlock_map(&fs); if (fs.m == vm_page_lookup(fs.object, fs.pindex)) { vm_page_sleep_if_busy(fs.m, TRUE, "vmpfw"); } vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); PCPU_INC(cnt.v_intrans); vm_object_deallocate(fs.first_object); goto RetryFault; } vm_page_remque(fs.m); vm_page_unlock(fs.m); /* * Mark page busy for other processes, and the * pagedaemon. If it still isn't completely valid * (readable), jump to readrest, else break-out ( we * found the page ). */ vm_page_busy(fs.m); if (fs.m->valid != VM_PAGE_BITS_ALL) goto readrest; break; } /* * Page is not resident, If this is the search termination * or the pager might contain the page, allocate a new page. */ if (TRYPAGER || fs.object == fs.first_object) { if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At * worst, the P_KILLED might be not observed * there, and allocation can fail, causing * restart and new reading of the p_flag. */ fs.m = NULL; if (!vm_page_count_severe() || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 if ((fs.object->flags & OBJ_COLORED) == 0) { fs.object->flags |= OBJ_COLORED; fs.object->pg_color = atop(vaddr) - fs.pindex; } #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs.object->type != OBJT_VNODE && fs.object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; fs.m = vm_page_alloc(fs.object, fs.pindex, alloc_req); } if (fs.m == NULL) { unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; } else if (fs.m->valid == VM_PAGE_BITS_ALL) break; } readrest: /* * We have found a valid page or we have allocated a new page. * The page thus may not be valid or may not be entirely * valid. * * Attempt to fault-in the page if there is a chance that the * pager has it, and potentially fault in additional pages * at the same time. */ if (TRYPAGER) { int rv; u_char behavior = vm_map_entry_behavior(fs.entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { behind = 0; ahead = atop(fs.entry->end - vaddr) - 1; if (ahead > VM_FAULT_READ_AHEAD_MAX) ahead = VM_FAULT_READ_AHEAD_MAX; if (fs.pindex == fs.entry->next_read) vm_fault_cache_behind(&fs, VM_FAULT_READ_MAX); } else { /* * If this is a sequential page fault, then * arithmetically increase the number of pages * in the read-ahead window. Otherwise, reset * the read-ahead window to its smallest size. 
*/ behind = atop(vaddr - fs.entry->start); if (behind > VM_FAULT_READ_BEHIND) behind = VM_FAULT_READ_BEHIND; ahead = atop(fs.entry->end - vaddr) - 1; era = fs.entry->read_ahead; if (fs.pindex == fs.entry->next_read) { nera = era + behind; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; behind = 0; if (ahead > nera) ahead = nera; if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_cache_behind(&fs, VM_FAULT_CACHE_BEHIND); } else if (ahead > VM_FAULT_READ_AHEAD_MIN) ahead = VM_FAULT_READ_AHEAD_MIN; if (era != ahead) fs.entry->read_ahead = ahead; } /* * Call the pager to retrieve the data, if any, after * releasing the lock on the map. We hold a ref on * fs.object and the pages are VPO_BUSY'd. */ unlock_map(&fs); if (fs.object->type == OBJT_VNODE) { vp = fs.object->handle; if (vp == fs.vp) goto vnode_locked; else if (fs.vp != NULL) { vput(fs.vp); fs.vp = NULL; } locked = VOP_ISLOCKED(vp); if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* Do not sleep for vnode lock while fs.m is busy */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error != 0) { vhold(vp); release_page(&fs); unlock_and_deallocate(&fs); error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); vdrop(vp); fs.vp = vp; KASSERT(error == 0, ("vm_fault: vget failed")); goto RetryFault; } fs.vp = vp; } vnode_locked: KASSERT(fs.vp == NULL || !fs.map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * now we find out if any other pages should be paged * in at this time this routine checks to see if the * pages surrounding this fault reside in the same * object as the page for this fault. If they do, * then they are faulted in also into the object. The * array "marray" returned contains an array of * vm_page_t structs where one of them is the * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. * * fs.m plus the additional pages are VPO_BUSY'd. */ faultcount = vm_fault_additional_pages( fs.m, behind, ahead, marray, &reqpage); rv = faultcount ? vm_pager_get_pages(fs.object, marray, faultcount, reqpage) : VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. Leave it busy while we play * with it. */ /* * Relookup in case pager changed page. Pager * is responsible for disposition of old page * if moved. */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (!fs.m) { unlock_and_deallocate(&fs); goto RetryFault; } hardfault++; break; /* break to PAGE HAS BEEN FOUND */ } /* * Remove the bogus page (which does not exist at this * object/offset); before doing so, we must get back * our object lock to preserve our invariant. * * Also wake up any other process that may want to bring * in this page. * * If this is the top-level object, we must leave the * busy page to prevent another process from rushing * past us, and inserting the page in that object at * the same time that we are. */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * Data outside the range of the pager or an I/O error */ /* * XXX - the check for kernel_map is a kludge to work * around having the machine panic on a kernel space * fault w/ I/O error. */ if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); return ((rv == VM_PAGER_ERROR) ? 
KERN_FAILURE : KERN_PROTECTION_FAILURE); } if (fs.object != fs.first_object) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! */ } } /* * We get here if the object has default pager (or unwiring) * or the pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; VM_OBJECT_WLOCK(fs.object); } fs.first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs.m->flags & PG_ZERO) == 0) { pmap_zero_page(fs.m); } else { PCPU_INC(cnt.v_ozfod); } PCPU_INC(cnt.v_zfod); fs.m->valid = VM_PAGE_BITS_ALL; break; /* break to PAGE HAS BEEN FOUND */ } else { KASSERT(fs.object != next_object, ("object loop %p", next_object)); VM_OBJECT_WLOCK(next_object); vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = next_object; } } KASSERT((fs.m->oflags & VPO_BUSY) != 0, ("vm_fault: not busy after main loop")); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { /* * This allows pages to be virtually copied from a * backing_object into the first_object, where the * backing object has no other refs to it, and cannot * gain any more refs. Instead of a bcopy, we just * move the page from the backing object to the * first object. Note that we must mark the page * dirty in the first object so that it will go out * to swap when needed. */ is_first_object_locked = FALSE; if ( /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * No one else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ ((fs.object->type == OBJT_DEFAULT) || (fs.object->type == OBJT_SWAP)) && (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) && /* * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { /* * get rid of the unnecessary page */ vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); /* * grab the page and put it into the * process'es object. The page is * automatically made dirty. */ vm_page_lock(fs.m); vm_page_rename(fs.m, fs.first_object, fs.first_pindex); vm_page_unlock(fs.m); vm_page_busy(fs.m); fs.first_m = fs.m; fs.m = NULL; PCPU_INC(cnt.v_cow_optim); } else { /* * Oh, well, lets copy it. 
*/ pmap_copy_page(fs.m, fs.first_m); fs.first_m->valid = VM_PAGE_BITS_ALL; if (wired && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) { vm_page_lock(fs.first_m); vm_page_wire(fs.first_m); vm_page_unlock(fs.first_m); vm_page_lock(fs.m); vm_page_unwire(fs.m, FALSE); vm_page_unlock(fs.m); } /* * We no longer need the old page or object. */ release_page(&fs); } /* * fs.object != fs.first_object due to above * conditional */ vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); /* * Only use the new page below... */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; if (!is_first_object_locked) VM_OBJECT_WLOCK(fs.object); PCPU_INC(cnt.v_cow_faults); curthread->td_cow++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } fs.lookup_still_valid = TRUE; if (fs.map->timestamp != map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); /* * If we don't need the page any longer, put it on the inactive * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) goto RetryFault; return (result); } if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } } /* * If the page was filled by a pager, update the map entry's * last read offset. Since the pager does not return the * actual set of pages that it read, this update is based on * the requested set. Typically, the requested and actual * sets are the same. * * XXX The following assignment modifies the map * without holding a write lock on it. */ if (hardfault) fs.entry->next_read = fs.pindex + faultcount - reqpage; if ((prot & VM_PROT_WRITE) != 0 || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_object_set_writeable_dirty(fs.object); /* * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if (fs.entry->eflags & MAP_ENTRY_NOSYNC) { if (fs.m->dirty == 0) fs.m->oflags |= VPO_NOSYNC; } else { fs.m->oflags &= ~VPO_NOSYNC; } /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also tell the backing pager, if any, that it should remove * any swap backing since the page is now dirty. 
*/ if (((fault_type & VM_PROT_WRITE) != 0 && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_page_dirty(fs.m); vm_pager_page_unswapped(fs.m); } } /* * Page had better still be busy */ KASSERT(fs.m->oflags & VPO_BUSY, ("vm_fault: page %p not busy!", fs.m)); /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ KASSERT(fs.m->valid == VM_PAGE_BITS_ALL, ("vm_fault: page %p partially invalid", fs.m)); VM_OBJECT_WUNLOCK(fs.object); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired); if ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 && wired == 0) vm_fault_prefault(fs.map->pmap, vaddr, fs.entry); VM_OBJECT_WLOCK(fs.object); vm_page_lock(fs.m); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if (fault_flags & VM_FAULT_CHANGE_WIRING) { if (wired) vm_page_wire(fs.m); else vm_page_unwire(fs.m, 1); } else vm_page_activate(fs.m); if (m_hold != NULL) { *m_hold = fs.m; vm_page_hold(fs.m); } vm_page_unlock(fs.m); vm_page_wakeup(fs.m); /* * Unlock everything, and return */ unlock_and_deallocate(&fs); if (hardfault) { PCPU_INC(cnt.v_io_faults); curthread->td_ru.ru_majflt++; } else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); }
/* * We have to carry on the locking protocol on the null layer vnodes * as we progress through the tree. We also have to enforce read-only * if this layer is mounted read-only. */ static int null_lookup(struct vop_lookup_args *ap) { struct componentname *cnp = ap->a_cnp; struct vnode *dvp = ap->a_dvp; int flags = cnp->cn_flags; struct vnode *vp, *ldvp, *lvp; struct mount *mp; int error; mp = dvp->v_mount; if ((flags & ISLASTCN) != 0 && (mp->mnt_flag & MNT_RDONLY) != 0 && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); /* * Although it is possible to call null_bypass(), we'll do * a direct call to reduce overhead */ ldvp = NULLVPTOLOWERVP(dvp); vp = lvp = NULL; KASSERT((ldvp->v_vflag & VV_ROOT) == 0 || ((dvp->v_vflag & VV_ROOT) != 0 && (flags & ISDOTDOT) == 0), ("ldvp %p fl %#x dvp %p fl %#x flags %#x", ldvp, ldvp->v_vflag, dvp, dvp->v_vflag, flags)); /* * Hold ldvp. The reference on it, owned by dvp, is lost in * case of dvp reclamation, and we need ldvp to move our lock * from ldvp to dvp. */ vhold(ldvp); error = VOP_LOOKUP(ldvp, &lvp, cnp); /* * VOP_LOOKUP() on lower vnode may unlock ldvp, which allows * dvp to be reclaimed due to shared v_vnlock. Check for the * doomed state and return error. */ if ((error == 0 || error == EJUSTRETURN) && (dvp->v_iflag & VI_DOOMED) != 0) { error = ENOENT; if (lvp != NULL) vput(lvp); /* * If vgone() reclaimed dvp before curthread * relocked ldvp, the locks of dvp and ldvp are no * longer shared. In this case, relock of ldvp in * lower fs VOP_LOOKUP() does not restore the locking * state of dvp. Compensate for this by unlocking * ldvp and locking dvp, which is also correct if the * locks are still shared. */ VOP_UNLOCK(ldvp, 0); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); } vdrop(ldvp); if (error == EJUSTRETURN && (flags & ISLASTCN) != 0 && (mp->mnt_flag & MNT_RDONLY) != 0 && (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME)) error = EROFS; if ((error == 0 || error == EJUSTRETURN) && lvp != NULL) { if (ldvp == lvp) { *ap->a_vpp = dvp; VREF(dvp); vrele(lvp); } else { error = null_nodeget(mp, lvp, &vp); if (error == 0) *ap->a_vpp = vp; } } return (error); }