/*
 * Filler function for proc/pid/self
 */
int
procfs_doprocfile(PFS_FILL_ARGS)
{
    char *fullpath;
    char *freepath;
    struct vnode *textvp;
    int error;

    freepath = NULL;
    PROC_LOCK(p);
    textvp = p->p_textvp;
    vhold(textvp);
    PROC_UNLOCK(p);
    error = vn_fullpath(td, textvp, &fullpath, &freepath);
    vdrop(textvp);
    if (error == 0)
        sbuf_printf(sb, "%s", fullpath);
    if (freepath != NULL)
        free(freepath, M_TEMP);
    return (error);
}
/* ARGSUSED */
static int
null_getwritemount(struct vop_getwritemount_args *ap)
{
    struct null_node *xp;
    struct vnode *lowervp;
    struct vnode *vp;

    vp = ap->a_vp;
    VI_LOCK(vp);
    xp = VTONULL(vp);
    if (xp && (lowervp = xp->null_lowervp)) {
        VI_LOCK_FLAGS(lowervp, MTX_DUPOK);
        VI_UNLOCK(vp);
        vholdl(lowervp);
        VI_UNLOCK(lowervp);
        VOP_GETWRITEMOUNT(lowervp, ap->a_mpp);
        vdrop(lowervp);
    } else {
        VI_UNLOCK(vp);
        *(ap->a_mpp) = NULL;
    }
    return (0);
}
/* FILL */
vfill()
{
    int msg;
    int i;

    switch (object) {
    case BOTTLE:
        if (liq() != 0)
            msg = 105;
        else if (liqloc(loc) == 0)
            msg = 106;
        else {
            prop[BOTTLE] = cond[loc] & WATOIL;
            i = liq();
            if (toting(BOTTLE))
                place[i] = -1;
            msg = (i == OIL ? 108 : 107);
        }
        break;
    case VASE:
        if (liqloc(loc) == 0) {
            msg = 144;
            break;
        }
        if (!toting(VASE)) {
            msg = 29;
            break;
        }
        rspeak(145);
        vdrop();
        return;
    default:
        msg = 29;
    }
    rspeak(msg);
}
/*
 * Purge the cache of dead entries
 *
 * This is extremely inefficient due to the fact that vgone() not only
 * indirectly modifies the vnode cache, but may also sleep.  We can
 * neither hold pfs_vncache_mutex across a vgone() call, nor make any
 * assumptions about the state of the cache after vgone() returns.  In
 * consequence, we must start over after every vgone() call, and keep
 * trying until we manage to traverse the entire cache.
 *
 * The only way to improve this situation is to change the data structure
 * used to implement the cache.
 */
static void
pfs_purge_locked(struct pfs_node *pn)
{
    struct pfs_vdata *pvd;
    struct vnode *vnp;

    mtx_assert(&pfs_vncache_mutex, MA_OWNED);
    pvd = pfs_vncache;
    while (pvd != NULL) {
        if (pvd->pvd_dead || (pn != NULL && pvd->pvd_pn == pn)) {
            vnp = pvd->pvd_vnode;
            vhold(vnp);
            mtx_unlock(&pfs_vncache_mutex);
            VOP_LOCK(vnp, LK_EXCLUSIVE);
            vgone(vnp);
            VOP_UNLOCK(vnp, 0);
            mtx_lock(&pfs_vncache_mutex);
            vdrop(vnp);
            pvd = pfs_vncache;
        } else {
            pvd = pvd->pvd_next;
        }
    }
}
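/*
 * Illustrative userland sketch (not part of the kernel sources above): the
 * same "drop the lock for the blocking teardown, then restart the scan from
 * the head" discipline that pfs_purge_locked() uses, expressed with a
 * pthread mutex and a singly-linked list.  All names here (struct entry,
 * purge_dead, teardown) are hypothetical.  One difference: this sketch
 * unlinks the entry itself before dropping the lock, whereas in the kernel
 * code it is vgone() that ultimately removes the cache entry.
 */
#include <pthread.h>
#include <stdlib.h>

struct entry {
    struct entry *next;
    int dead;
};

static struct entry *head;
static pthread_mutex_t list_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for vgone(): assume it may sleep, so list_mtx cannot be held. */
static void
teardown(struct entry *e)
{
    (void)e;
}

static void
purge_dead(void)
{
    struct entry *e, **prevp;

    pthread_mutex_lock(&list_mtx);
    e = head;
    while (e != NULL) {
        if (e->dead) {
            /* Unlink while the lock is still held. */
            for (prevp = &head; *prevp != e; prevp = &(*prevp)->next)
                continue;
            *prevp = e->next;
            pthread_mutex_unlock(&list_mtx);
            teardown(e);    /* may block */
            free(e);
            pthread_mutex_lock(&list_mtx);
            /*
             * The list may have changed while we slept; no assumption
             * survives the unlock, so start over from the head.
             */
            e = head;
        } else {
            e = e->next;
        }
    }
    pthread_mutex_unlock(&list_mtx);
}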
int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { vm_prot_t prot; long ahead, behind; int alloc_req, era, faultcount, nera, reqpage, result; boolean_t growstack, is_first_object_locked, wired; int map_generation; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ_MAX]; int hardfault; struct faultstate fs; struct vnode *vp; int locked, error; hardfault = 0; growstack = TRUE; PCPU_INC(cnt.v_vm_faults); fs.vp = NULL; faultcount = reqpage = 0; RetryFault:; /* * Find the backing store object and offset into it to begin the * search. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { if (growstack && result == KERN_INVALID_ADDRESS && map != kernel_map) { result = vm_map_growstack(curproc, vaddr); if (result != KERN_SUCCESS) return (KERN_FAILURE); growstack = FALSE; goto RetryFault; } return (result); } map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { panic("vm_fault: fault on nofault entry, addr: %lx", (u_long)vaddr); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. This must be done after * obtaining the vnode lock in order to avoid possible deadlocks. */ VM_OBJECT_WLOCK(fs.first_object); vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.lookup_still_valid = TRUE; if (wired) fault_type = prot | (fault_type & VM_PROT_COPY); fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { /* * If the object is dead, we stop here */ if (fs.object->flags & OBJ_DEAD) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { /* * check for page-based copy on write. * We check fs.object == fs.first_object so * as to ensure the legacy COW mechanism is * used when the page in question is part of * a shadow object. Otherwise, vm_page_cowfault() * removes the page from the backing object, * which is not what we want. */ vm_page_lock(fs.m); if ((fs.m->cow) && (fault_type & VM_PROT_WRITE) && (fs.object == fs.first_object)) { vm_page_cowfault(fs.m); unlock_and_deallocate(&fs); goto RetryFault; } /* * Wait/Retry if the page is busy. We have to do this * if the page is busy via either VPO_BUSY or * vm_page_t->busy because the vm_pager may be using * vm_page_t->busy for pageouts ( and even pageins if * it is the vnode pager ), and we could end up trying * to pagein and pageout the same page simultaneously. * * We can theoretically allow the busy case on a read * fault if the page is marked valid, but since such * pages are typically already pmap'd, putting that * special case in might be more effort then it is * worth. We cannot under any circumstances mess * around with a vm_page_t->busy page except, perhaps, * to pmap it. */ if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. 
*/ vm_page_aflag_set(fs.m, PGA_REFERENCED); vm_page_unlock(fs.m); if (fs.object != fs.first_object) { if (!VM_OBJECT_TRYWLOCK( fs.first_object)) { VM_OBJECT_WUNLOCK(fs.object); VM_OBJECT_WLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.object); } vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); vm_object_pip_wakeup(fs.first_object); VM_OBJECT_WUNLOCK(fs.first_object); fs.first_m = NULL; } unlock_map(&fs); if (fs.m == vm_page_lookup(fs.object, fs.pindex)) { vm_page_sleep_if_busy(fs.m, TRUE, "vmpfw"); } vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); PCPU_INC(cnt.v_intrans); vm_object_deallocate(fs.first_object); goto RetryFault; } vm_page_remque(fs.m); vm_page_unlock(fs.m); /* * Mark page busy for other processes, and the * pagedaemon. If it still isn't completely valid * (readable), jump to readrest, else break-out ( we * found the page ). */ vm_page_busy(fs.m); if (fs.m->valid != VM_PAGE_BITS_ALL) goto readrest; break; } /* * Page is not resident, If this is the search termination * or the pager might contain the page, allocate a new page. */ if (TRYPAGER || fs.object == fs.first_object) { if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At * worst, the P_KILLED might be not observed * there, and allocation can fail, causing * restart and new reading of the p_flag. */ fs.m = NULL; if (!vm_page_count_severe() || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 if ((fs.object->flags & OBJ_COLORED) == 0) { fs.object->flags |= OBJ_COLORED; fs.object->pg_color = atop(vaddr) - fs.pindex; } #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs.object->type != OBJT_VNODE && fs.object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; fs.m = vm_page_alloc(fs.object, fs.pindex, alloc_req); } if (fs.m == NULL) { unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; } else if (fs.m->valid == VM_PAGE_BITS_ALL) break; } readrest: /* * We have found a valid page or we have allocated a new page. * The page thus may not be valid or may not be entirely * valid. * * Attempt to fault-in the page if there is a chance that the * pager has it, and potentially fault in additional pages * at the same time. */ if (TRYPAGER) { int rv; u_char behavior = vm_map_entry_behavior(fs.entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { behind = 0; ahead = atop(fs.entry->end - vaddr) - 1; if (ahead > VM_FAULT_READ_AHEAD_MAX) ahead = VM_FAULT_READ_AHEAD_MAX; if (fs.pindex == fs.entry->next_read) vm_fault_cache_behind(&fs, VM_FAULT_READ_MAX); } else { /* * If this is a sequential page fault, then * arithmetically increase the number of pages * in the read-ahead window. Otherwise, reset * the read-ahead window to its smallest size. 
*/ behind = atop(vaddr - fs.entry->start); if (behind > VM_FAULT_READ_BEHIND) behind = VM_FAULT_READ_BEHIND; ahead = atop(fs.entry->end - vaddr) - 1; era = fs.entry->read_ahead; if (fs.pindex == fs.entry->next_read) { nera = era + behind; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; behind = 0; if (ahead > nera) ahead = nera; if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_cache_behind(&fs, VM_FAULT_CACHE_BEHIND); } else if (ahead > VM_FAULT_READ_AHEAD_MIN) ahead = VM_FAULT_READ_AHEAD_MIN; if (era != ahead) fs.entry->read_ahead = ahead; } /* * Call the pager to retrieve the data, if any, after * releasing the lock on the map. We hold a ref on * fs.object and the pages are VPO_BUSY'd. */ unlock_map(&fs); if (fs.object->type == OBJT_VNODE) { vp = fs.object->handle; if (vp == fs.vp) goto vnode_locked; else if (fs.vp != NULL) { vput(fs.vp); fs.vp = NULL; } locked = VOP_ISLOCKED(vp); if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* Do not sleep for vnode lock while fs.m is busy */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error != 0) { vhold(vp); release_page(&fs); unlock_and_deallocate(&fs); error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); vdrop(vp); fs.vp = vp; KASSERT(error == 0, ("vm_fault: vget failed")); goto RetryFault; } fs.vp = vp; } vnode_locked: KASSERT(fs.vp == NULL || !fs.map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * now we find out if any other pages should be paged * in at this time this routine checks to see if the * pages surrounding this fault reside in the same * object as the page for this fault. If they do, * then they are faulted in also into the object. The * array "marray" returned contains an array of * vm_page_t structs where one of them is the * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. * * fs.m plus the additional pages are VPO_BUSY'd. */ faultcount = vm_fault_additional_pages( fs.m, behind, ahead, marray, &reqpage); rv = faultcount ? vm_pager_get_pages(fs.object, marray, faultcount, reqpage) : VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. Leave it busy while we play * with it. */ /* * Relookup in case pager changed page. Pager * is responsible for disposition of old page * if moved. */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (!fs.m) { unlock_and_deallocate(&fs); goto RetryFault; } hardfault++; break; /* break to PAGE HAS BEEN FOUND */ } /* * Remove the bogus page (which does not exist at this * object/offset); before doing so, we must get back * our object lock to preserve our invariant. * * Also wake up any other process that may want to bring * in this page. * * If this is the top-level object, we must leave the * busy page to prevent another process from rushing * past us, and inserting the page in that object at * the same time that we are. */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * Data outside the range of the pager or an I/O error */ /* * XXX - the check for kernel_map is a kludge to work * around having the machine panic on a kernel space * fault w/ I/O error. */ if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); return ((rv == VM_PAGER_ERROR) ? 
KERN_FAILURE : KERN_PROTECTION_FAILURE); } if (fs.object != fs.first_object) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! */ } } /* * We get here if the object has default pager (or unwiring) * or the pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; VM_OBJECT_WLOCK(fs.object); } fs.first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs.m->flags & PG_ZERO) == 0) { pmap_zero_page(fs.m); } else { PCPU_INC(cnt.v_ozfod); } PCPU_INC(cnt.v_zfod); fs.m->valid = VM_PAGE_BITS_ALL; break; /* break to PAGE HAS BEEN FOUND */ } else { KASSERT(fs.object != next_object, ("object loop %p", next_object)); VM_OBJECT_WLOCK(next_object); vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = next_object; } } KASSERT((fs.m->oflags & VPO_BUSY) != 0, ("vm_fault: not busy after main loop")); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { /* * This allows pages to be virtually copied from a * backing_object into the first_object, where the * backing object has no other refs to it, and cannot * gain any more refs. Instead of a bcopy, we just * move the page from the backing object to the * first object. Note that we must mark the page * dirty in the first object so that it will go out * to swap when needed. */ is_first_object_locked = FALSE; if ( /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * No one else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ ((fs.object->type == OBJT_DEFAULT) || (fs.object->type == OBJT_SWAP)) && (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) && /* * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { /* * get rid of the unnecessary page */ vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); /* * grab the page and put it into the * process'es object. The page is * automatically made dirty. */ vm_page_lock(fs.m); vm_page_rename(fs.m, fs.first_object, fs.first_pindex); vm_page_unlock(fs.m); vm_page_busy(fs.m); fs.first_m = fs.m; fs.m = NULL; PCPU_INC(cnt.v_cow_optim); } else { /* * Oh, well, lets copy it. 
*/ pmap_copy_page(fs.m, fs.first_m); fs.first_m->valid = VM_PAGE_BITS_ALL; if (wired && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) { vm_page_lock(fs.first_m); vm_page_wire(fs.first_m); vm_page_unlock(fs.first_m); vm_page_lock(fs.m); vm_page_unwire(fs.m, FALSE); vm_page_unlock(fs.m); } /* * We no longer need the old page or object. */ release_page(&fs); } /* * fs.object != fs.first_object due to above * conditional */ vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); /* * Only use the new page below... */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; if (!is_first_object_locked) VM_OBJECT_WLOCK(fs.object); PCPU_INC(cnt.v_cow_faults); curthread->td_cow++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } fs.lookup_still_valid = TRUE; if (fs.map->timestamp != map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); /* * If we don't need the page any longer, put it on the inactive * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) goto RetryFault; return (result); } if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } } /* * If the page was filled by a pager, update the map entry's * last read offset. Since the pager does not return the * actual set of pages that it read, this update is based on * the requested set. Typically, the requested and actual * sets are the same. * * XXX The following assignment modifies the map * without holding a write lock on it. */ if (hardfault) fs.entry->next_read = fs.pindex + faultcount - reqpage; if ((prot & VM_PROT_WRITE) != 0 || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_object_set_writeable_dirty(fs.object); /* * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if (fs.entry->eflags & MAP_ENTRY_NOSYNC) { if (fs.m->dirty == 0) fs.m->oflags |= VPO_NOSYNC; } else { fs.m->oflags &= ~VPO_NOSYNC; } /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also tell the backing pager, if any, that it should remove * any swap backing since the page is now dirty. 
*/ if (((fault_type & VM_PROT_WRITE) != 0 && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_page_dirty(fs.m); vm_pager_page_unswapped(fs.m); } } /* * Page had better still be busy */ KASSERT(fs.m->oflags & VPO_BUSY, ("vm_fault: page %p not busy!", fs.m)); /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ KASSERT(fs.m->valid == VM_PAGE_BITS_ALL, ("vm_fault: page %p partially invalid", fs.m)); VM_OBJECT_WUNLOCK(fs.object); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired); if ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 && wired == 0) vm_fault_prefault(fs.map->pmap, vaddr, fs.entry); VM_OBJECT_WLOCK(fs.object); vm_page_lock(fs.m); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if (fault_flags & VM_FAULT_CHANGE_WIRING) { if (wired) vm_page_wire(fs.m); else vm_page_unwire(fs.m, 1); } else vm_page_activate(fs.m); if (m_hold != NULL) { *m_hold = fs.m; vm_page_hold(fs.m); } vm_page_unlock(fs.m); vm_page_wakeup(fs.m); /* * Unlock everything, and return */ unlock_and_deallocate(&fs); if (hardfault) { PCPU_INC(cnt.v_io_faults); curthread->td_ru.ru_majflt++; } else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); }
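/*
 * Small userland demonstration related to vm_fault_hold() above: the
 * td_ru.ru_minflt / ru_majflt counters it bumps surface through
 * getrusage(2), and first touches of freshly mapped anonymous memory
 * typically take the zero-fill path (the cnt.v_zfod counter above) and show
 * up as minor faults.  This is an illustrative sketch, not kernel code;
 * MAP_ANON is assumed to be available on the target platform.
 */
#include <sys/mman.h>
#include <sys/resource.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct rusage before, after;
    size_t pagesz = (size_t)sysconf(_SC_PAGESIZE);
    size_t npages = 64;
    volatile char *p;
    size_t i;

    p = mmap(NULL, npages * pagesz, PROT_READ | PROT_WRITE,
        MAP_ANON | MAP_PRIVATE, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return (1);
    }

    getrusage(RUSAGE_SELF, &before);
    for (i = 0; i < npages; i++)
        p[i * pagesz] = 1;      /* first touch => zero-fill fault */
    getrusage(RUSAGE_SELF, &after);

    printf("minor faults: %ld, major faults: %ld\n",
        after.ru_minflt - before.ru_minflt,
        after.ru_majflt - before.ru_majflt);
    return (0);
}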
/*
 * We need to process our own vnode lock and then clear the
 * interlock flag as it applies only to our vnode, not the
 * vnodes below us on the stack.
 */
static int
null_lock(struct vop_lock1_args *ap)
{
    struct vnode *vp = ap->a_vp;
    int flags = ap->a_flags;
    struct null_node *nn;
    struct vnode *lvp;
    int error;

    if ((flags & LK_INTERLOCK) == 0) {
        VI_LOCK(vp);
        ap->a_flags = flags |= LK_INTERLOCK;
    }
    nn = VTONULL(vp);
    /*
     * If we're still active we must ask the lower layer to
     * lock as ffs has special lock considerations in its
     * vop lock.
     */
    if (nn != NULL && (lvp = NULLVPTOLOWERVP(vp)) != NULL) {
        VI_LOCK_FLAGS(lvp, MTX_DUPOK);
        VI_UNLOCK(vp);
        /*
         * We have to hold the vnode here to solve a potential
         * reclaim race.  If we're forcibly vgone'd while we
         * still have refs, a thread could be sleeping inside
         * the lowervp's vop_lock routine.  When we vgone we will
         * drop our last ref to the lowervp, which would allow it
         * to be reclaimed.  The lowervp could then be recycled,
         * in which case it is not legal to be sleeping in its VOP.
         * We prevent it from being recycled by holding the vnode
         * here.
         */
        vholdl(lvp);
        error = VOP_LOCK(lvp, flags);

        /*
         * We might have slept to get the lock and someone might have
         * cleaned our vnode already, switching the vnode lock from the
         * one in lowervp to v_lock in our own vnode structure.  Handle
         * this case by reacquiring the correct lock in the requested
         * mode.
         */
        if (VTONULL(vp) == NULL && error == 0) {
            ap->a_flags &= ~(LK_TYPE_MASK | LK_INTERLOCK);
            switch (flags & LK_TYPE_MASK) {
            case LK_SHARED:
                ap->a_flags |= LK_SHARED;
                break;
            case LK_UPGRADE:
            case LK_EXCLUSIVE:
                ap->a_flags |= LK_EXCLUSIVE;
                break;
            default:
                panic("Unsupported lock request %d\n",
                    ap->a_flags);
            }
            VOP_UNLOCK(lvp, 0);
            error = vop_stdlock(ap);
        }
        vdrop(lvp);
    } else
        error = vop_stdlock(ap);
    return (error);
}
/* * Get the vnode associated with the given inode, allocating the vnode if * necessary. The vnode will be returned exclusively locked. * * The caller must lock the inode (shared or exclusive). * * Great care must be taken to avoid deadlocks and vnode acquisition/reclaim * races. */ struct vnode * hammer2_igetv(hammer2_inode_t *ip, int *errorp) { hammer2_inode_data_t *ipdata; hammer2_pfsmount_t *pmp; struct vnode *vp; ccms_state_t ostate; pmp = ip->pmp; KKASSERT(pmp != NULL); *errorp = 0; ipdata = &ip->chain->data->ipdata; for (;;) { /* * Attempt to reuse an existing vnode assignment. It is * possible to race a reclaim so the vget() may fail. The * inode must be unlocked during the vget() to avoid a * deadlock against a reclaim. */ vp = ip->vp; if (vp) { /* * Inode must be unlocked during the vget() to avoid * possible deadlocks, but leave the ip ref intact. * * vnode is held to prevent destruction during the * vget(). The vget() can still fail if we lost * a reclaim race on the vnode. */ vhold(vp); ostate = hammer2_inode_lock_temp_release(ip); if (vget(vp, LK_EXCLUSIVE)) { vdrop(vp); hammer2_inode_lock_temp_restore(ip, ostate); continue; } hammer2_inode_lock_temp_restore(ip, ostate); vdrop(vp); /* vp still locked and ref from vget */ if (ip->vp != vp) { kprintf("hammer2: igetv race %p/%p\n", ip->vp, vp); vput(vp); continue; } *errorp = 0; break; } /* * No vnode exists, allocate a new vnode. Beware of * allocation races. This function will return an * exclusively locked and referenced vnode. */ *errorp = getnewvnode(VT_HAMMER2, pmp->mp, &vp, 0, 0); if (*errorp) { kprintf("hammer2: igetv getnewvnode failed %d\n", *errorp); vp = NULL; break; } /* * Lock the inode and check for an allocation race. */ ostate = hammer2_inode_lock_upgrade(ip); if (ip->vp != NULL) { vp->v_type = VBAD; vx_put(vp); hammer2_inode_lock_downgrade(ip, ostate); continue; } switch (ipdata->type) { case HAMMER2_OBJTYPE_DIRECTORY: vp->v_type = VDIR; break; case HAMMER2_OBJTYPE_REGFILE: vp->v_type = VREG; vinitvmio(vp, ipdata->size, HAMMER2_LBUFSIZE, (int)ipdata->size & HAMMER2_LBUFMASK); break; case HAMMER2_OBJTYPE_SOFTLINK: /* * XXX for now we are using the generic file_read * and file_write code so we need a buffer cache * association. */ vp->v_type = VLNK; vinitvmio(vp, ipdata->size, HAMMER2_LBUFSIZE, (int)ipdata->size & HAMMER2_LBUFMASK); break; case HAMMER2_OBJTYPE_CDEV: vp->v_type = VCHR; /* fall through */ case HAMMER2_OBJTYPE_BDEV: vp->v_ops = &pmp->mp->mnt_vn_spec_ops; if (ipdata->type != HAMMER2_OBJTYPE_CDEV) vp->v_type = VBLK; addaliasu(vp, ipdata->rmajor, ipdata->rminor); break; case HAMMER2_OBJTYPE_FIFO: vp->v_type = VFIFO; vp->v_ops = &pmp->mp->mnt_vn_fifo_ops; break; default: panic("hammer2: unhandled objtype %d", ipdata->type); break; } if (ip == pmp->iroot) vsetflags(vp, VROOT); vp->v_data = ip; ip->vp = vp; hammer2_inode_ref(ip); /* vp association */ hammer2_inode_lock_downgrade(ip, ostate); break; } /* * Return non-NULL vp and *errorp == 0, or NULL vp and *errorp != 0. */ if (hammer2_debug & 0x0002) { kprintf("igetv vp %p refs 0x%08x aux 0x%08x\n", vp, vp->v_refcnt, vp->v_auxrefs); } return (vp); }
/*
 * Read from a file
 */
static int
pfs_read(struct vop_read_args *va)
{
    struct vnode *vn = va->a_vp;
    struct pfs_vdata *pvd = vn->v_data;
    struct pfs_node *pn = pvd->pvd_pn;
    struct uio *uio = va->a_uio;
    struct proc *proc;
    struct sbuf *sb = NULL;
    int error, locked;
    unsigned int buflen, offset, resid;

    PFS_TRACE(("%s", pn->pn_name));
    pfs_assert_not_owned(pn);

    if (vn->v_type != VREG)
        PFS_RETURN (EINVAL);
    KASSERT_PN_IS_FILE(pn);

    if (!(pn->pn_flags & PFS_RD))
        PFS_RETURN (EBADF);

    if (pn->pn_fill == NULL)
        PFS_RETURN (EIO);

    /*
     * This is necessary because either process' privileges may
     * have changed since the open() call.
     */
    if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc))
        PFS_RETURN (EIO);
    if (proc != NULL) {
        _PHOLD(proc);
        PROC_UNLOCK(proc);
    }

    vhold(vn);
    locked = VOP_ISLOCKED(vn, curthread);
    VOP_UNLOCK(vn, 0, curthread);

    if (pn->pn_flags & PFS_RAWRD) {
        PFS_TRACE(("%lu resid", (unsigned long)uio->uio_resid));
        error = pn_fill(curthread, proc, pn, NULL, uio);
        PFS_TRACE(("%lu resid", (unsigned long)uio->uio_resid));
        goto ret;
    }

    /* beaucoup sanity checks so we don't ask for bogus allocation */
    if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
        (offset = uio->uio_offset) != uio->uio_offset ||
        (resid = uio->uio_resid) != uio->uio_resid ||
        (buflen = offset + resid + 1) < offset || buflen > INT_MAX) {
        if (proc != NULL)
            PRELE(proc);
        error = EINVAL;
        goto ret;
    }
    if (buflen > MAXPHYS + 1) {
        error = EIO;
        goto ret;
    }

    sb = sbuf_new(sb, NULL, buflen, 0);
    if (sb == NULL) {
        error = EIO;
        goto ret;
    }

    error = pn_fill(curthread, proc, pn, sb, uio);

    if (error) {
        sbuf_delete(sb);
        goto ret;
    }

    sbuf_finish(sb);
    error = uiomove_frombuf(sbuf_data(sb), sbuf_len(sb), uio);
    sbuf_delete(sb);
ret:
    vn_lock(vn, locked | LK_RETRY, curthread);
    vdrop(vn);
    if (proc != NULL)
        PRELE(proc);
    PFS_RETURN (error);
}
/* THROW etc. */
vthrow()
{
    int msg;
    int i;

    if (toting(ROD2) && object == ROD && !toting(ROD))
        object = ROD2;
    if (!toting(object)) {
        actspk(verb);
        return;
    }
    /* treasure to troll */
    if (at(TROLL) && object >= 50 && object < MAXOBJ) {
        rspeak(159);
        drop(object, 0);
        move(TROLL, 0);
        move((TROLL + MAXOBJ), 0);
        drop(TROLL2, 117);
        drop((TROLL2 + MAXOBJ), 122);
        juggle(CHASM);
        return;
    }
    /* feed the bears... */
    if (object == FOOD && here(BEAR)) {
        object = BEAR;
        vfeed();
        return;
    }
    /* if not axe, same as drop... */
    if (object != AXE) {
        vdrop();
        return;
    }
    /* AXE is THROWN */
    /* at a dwarf... */
    if ((i = dcheck()) != 0) {
        msg = 48;
        if (pct(33)) {
            dseen[i] = dloc[i] = 0;
            msg = 47;
            ++dkill;
            if (dkill == 1)
                msg = 149;
        }
    }
    /* at a dragon... */
    else if (at(DRAGON) && prop[DRAGON] == 0)
        msg = 152;
    /* at the troll... */
    else if (at(TROLL))
        msg = 158;
    /* at the bear... */
    else if (here(BEAR) && prop[BEAR] == 0) {
        rspeak(164);
        drop(AXE, loc);
        fixed[AXE] = -1;
        prop[AXE] = 1;
        juggle(BEAR);
        return;
    }
    /* otherwise it is an attack */
    else {
        verb = KILL;
        object = 0;
        itverb();
        return;
    }
    /* handle the left over axe... */
    rspeak(msg);
    drop(AXE, loc);
    describe();
}
/* Routine to process a transitive verb */
trverb()
{
    switch (verb) {
    case CALM:
    case WALK:
    case QUIT:
    case SCORE:
    case FOO:
    case BRIEF:
    case SUSPEND:
    case HOURS:
    case LOG:
        actspk(verb);
        break;
    case TAKE:
        vtake();
        break;
    case DROP:
        vdrop();
        break;
    case OPEN:
    case LOCK:
        vopen();
        break;
    case SAY:
        vsay();
        break;
    case NOTHING:
        rspeak(54);
        break;
    case ON:
        von();
        break;
    case OFF:
        voff();
        break;
    case WAVE:
        vwave();
        break;
    case KILL:
        vkill();
        break;
    case POUR:
        vpour();
        break;
    case EAT:
        veat();
        break;
    case DRINK:
        vdrink();
        break;
    case RUB:
        if (object != LAMP)
            rspeak(76);
        else
            actspk(RUB);
        break;
    case THROW:
        vthrow();
        break;
    case FEED:
        vfeed();
        break;
    case FIND:
    case INVENTORY:
        vfind();
        break;
    case FILL:
        vfill();
        break;
    case READ:
        vread();
        break;
    case BLAST:
        vblast();
        break;
    case BREAK:
        vbreak();
        break;
    case WAKE:
        vwake();
        break;
    default:
        printf("This verb is not implemented yet.\n");
    }
}
/*
 * vp is the current namei directory
 * ndp is the name to locate in that directory...
 */
static int
fdesc_lookup(struct vop_lookup_args *ap)
{
    struct vnode **vpp = ap->a_vpp;
    struct vnode *dvp = ap->a_dvp;
    struct componentname *cnp = ap->a_cnp;
    char *pname = cnp->cn_nameptr;
    struct thread *td = cnp->cn_thread;
    struct file *fp;
    struct fdesc_get_ino_args arg;
    cap_rights_t rights;
    int nlen = cnp->cn_namelen;
    u_int fd, fd1;
    int error;
    struct vnode *fvp;

    if ((cnp->cn_flags & ISLASTCN) &&
        (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
        error = EROFS;
        goto bad;
    }

    if (cnp->cn_namelen == 1 && *pname == '.') {
        *vpp = dvp;
        VREF(dvp);
        return (0);
    }

    if (VTOFDESC(dvp)->fd_type != Froot) {
        error = ENOTDIR;
        goto bad;
    }

    fd = 0;
    /* the only time a leading 0 is acceptable is if it's "0" */
    if (*pname == '0' && nlen != 1) {
        error = ENOENT;
        goto bad;
    }
    while (nlen--) {
        if (*pname < '0' || *pname > '9') {
            error = ENOENT;
            goto bad;
        }
        fd1 = 10 * fd + *pname++ - '0';
        if (fd1 < fd) {
            error = ENOENT;
            goto bad;
        }
        fd = fd1;
    }

    /*
     * No rights to check since 'fp' isn't actually used.
     */
    if ((error = fget(td, fd, cap_rights_init(&rights), &fp)) != 0)
        goto bad;

    /* Check if we're looking up ourselves. */
    if (VTOFDESC(dvp)->fd_ix == FD_DESC + fd) {
        /*
         * In case we're holding the last reference to the file,
         * the dvp will be re-acquired.
         */
        vhold(dvp);
        VOP_UNLOCK(dvp, 0);
        fdrop(fp, td);

        /* Re-acquire the lock afterwards. */
        vn_lock(dvp, LK_RETRY | LK_EXCLUSIVE);
        vdrop(dvp);
        fvp = dvp;
        if ((dvp->v_iflag & VI_DOOMED) != 0)
            error = ENOENT;
    } else {
        /*
         * Unlock our root node (dvp) when doing this, since we might
         * deadlock: the vnode might be locked by another thread, and
         * the root vnode lock would be obtained afterwards (in case
         * we're looking up the fd of the root vnode), which is the
         * opposite lock order.  Vhold the root vnode first so we
         * don't lose it.
         */
        arg.ftype = Fdesc;
        arg.fd_fd = fd;
        arg.ix = FD_DESC + fd;
        arg.fp = fp;
        arg.td = td;
        error = vn_vget_ino_gen(dvp, fdesc_get_ino_alloc, &arg,
            LK_EXCLUSIVE, &fvp);
    }

    if (error)
        goto bad;
    *vpp = fvp;
    return (0);

bad:
    *vpp = NULL;
    return (error);
}
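/*
 * Standalone sketch of the digit parsing done by fdesc_lookup() above:
 * convert a name like "12" into a descriptor number, rejecting anything that
 * is not plain decimal, any leading zero other than the string "0" itself,
 * and any value that would overflow.  The overflow test here is written as
 * an explicit pre-check, a stricter variant of the wrap-around comparison
 * (fd1 < fd) used above.  parse_fd and its caller are hypothetical names;
 * this is not the kernel function itself.
 */
#include <limits.h>
#include <stdio.h>

static int
parse_fd(const char *name, unsigned int *fdp)
{
    unsigned int fd = 0;

    if (name[0] == '\0')
        return (-1);
    /* The only acceptable leading zero is the string "0" itself. */
    if (name[0] == '0' && name[1] != '\0')
        return (-1);
    for (; *name != '\0'; name++) {
        if (*name < '0' || *name > '9')
            return (-1);
        unsigned int digit = (unsigned int)(*name - '0');
        if (fd > (UINT_MAX - digit) / 10)
            return (-1);        /* would overflow */
        fd = fd * 10 + digit;
    }
    *fdp = fd;
    return (0);
}

int
main(void)
{
    const char *names[] = { "0", "42", "007", "9x", "4294967296" };
    unsigned int fd;

    for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++)
        printf("%-12s -> %s\n", names[i],
            parse_fd(names[i], &fd) == 0 ? "ok" : "rejected");
    return (0);
}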
/*
 * This is very similar to vmntvnodescan() but it only scans the
 * vnodes on the syncer list.  VFS's which support faster VFS_SYNC
 * operations use the VISDIRTY flag on the vnode to ensure that vnodes
 * with dirty inodes are added to the syncer in addition to vnodes
 * with dirty buffers, and can use this function instead of
 * vmntvnodescan().
 *
 * This is important when a system has millions of vnodes.
 */
int
vsyncscan(struct mount *mp, int vmsc_flags,
    int (*slowfunc)(struct mount *mp, struct vnode *vp, void *data),
    void *data)
{
    struct syncer_ctx *ctx;
    struct synclist *slp;
    struct vnode *vp;
    int b;
    int i;
    int lkflags;

    if (vmsc_flags & VMSC_NOWAIT)
        lkflags = LK_NOWAIT;
    else
        lkflags = 0;

    /*
     * Syncer list context.  This API requires a dedicated syncer thread
     * (MNTK_THR_SYNC).
     */
    KKASSERT(mp->mnt_kern_flag & MNTK_THR_SYNC);
    ctx = mp->mnt_syncer_ctx;
    lwkt_gettoken(&ctx->sc_token);

    /*
     * Setup for loop.  Allow races against the syncer thread but
     * require that the syncer thread not be lazy if we were told
     * not to be lazy.
     */
    b = ctx->syncer_delayno & ctx->syncer_mask;
    i = b;
    if ((vmsc_flags & VMSC_NOWAIT) == 0)
        ++ctx->syncer_forced;

    do {
        slp = &ctx->syncer_workitem_pending[i];

        while ((vp = LIST_FIRST(slp)) != NULL) {
            KKASSERT(vp->v_mount == mp);
            if (vmsc_flags & VMSC_GETVP) {
                if (vget(vp, LK_EXCLUSIVE | lkflags) == 0) {
                    slowfunc(mp, vp, data);
                    vput(vp);
                }
            } else if (vmsc_flags & VMSC_GETVX) {
                vx_get(vp);
                slowfunc(mp, vp, data);
                vx_put(vp);
            } else {
                vhold(vp);
                slowfunc(mp, vp, data);
                vdrop(vp);
            }
            if (LIST_FIRST(slp) == vp)
                vn_syncer_add(vp, -(i + syncdelay));
        }
        i = (i + 1) & ctx->syncer_mask;
    } while (i != b);

    if ((vmsc_flags & VMSC_NOWAIT) == 0)
        --ctx->syncer_forced;
    lwkt_reltoken(&ctx->sc_token);
    return (0);
}
static int tmpfs_lookup(struct vop_cachedlookup_args *v) { struct vnode *dvp = v->a_dvp; struct vnode **vpp = v->a_vpp; struct componentname *cnp = v->a_cnp; int error; struct tmpfs_dirent *de; struct tmpfs_node *dnode; dnode = VP_TO_TMPFS_DIR(dvp); *vpp = NULLVP; /* Check accessibility of requested node as a first step. */ error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread); if (error != 0) goto out; /* We cannot be requesting the parent directory of the root node. */ MPASS(IMPLIES(dnode->tn_type == VDIR && dnode->tn_dir.tn_parent == dnode, !(cnp->cn_flags & ISDOTDOT))); TMPFS_ASSERT_LOCKED(dnode); if (dnode->tn_dir.tn_parent == NULL) { error = ENOENT; goto out; } if (cnp->cn_flags & ISDOTDOT) { int ltype = 0; ltype = VOP_ISLOCKED(dvp); vhold(dvp); VOP_UNLOCK(dvp, 0); /* Allocate a new vnode on the matching entry. */ error = tmpfs_alloc_vp(dvp->v_mount, dnode->tn_dir.tn_parent, cnp->cn_lkflags, vpp); vn_lock(dvp, ltype | LK_RETRY); vdrop(dvp); } else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { VREF(dvp); *vpp = dvp; error = 0; } else { de = tmpfs_dir_lookup(dnode, NULL, cnp); if (de != NULL && de->td_node == NULL) cnp->cn_flags |= ISWHITEOUT; if (de == NULL || de->td_node == NULL) { /* The entry was not found in the directory. * This is OK if we are creating or renaming an * entry and are working on the last component of * the path name. */ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == CREATE || \ cnp->cn_nameiop == RENAME || (cnp->cn_nameiop == DELETE && cnp->cn_flags & DOWHITEOUT && cnp->cn_flags & ISWHITEOUT))) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, cnp->cn_thread); if (error != 0) goto out; /* Keep the component name in the buffer for * future uses. */ cnp->cn_flags |= SAVENAME; error = EJUSTRETURN; } else error = ENOENT; } else { struct tmpfs_node *tnode; /* The entry was found, so get its associated * tmpfs_node. */ tnode = de->td_node; /* If we are not at the last path component and * found a non-directory or non-link entry (which * may itself be pointing to a directory), raise * an error. */ if ((tnode->tn_type != VDIR && tnode->tn_type != VLNK) && !(cnp->cn_flags & ISLASTCN)) { error = ENOTDIR; goto out; } /* If we are deleting or renaming the entry, keep * track of its tmpfs_dirent so that it can be * easily deleted later. */ if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, cnp->cn_thread); if (error != 0) goto out; /* Allocate a new vnode on the matching entry. */ error = tmpfs_alloc_vp(dvp->v_mount, tnode, cnp->cn_lkflags, vpp); if (error != 0) goto out; if ((dnode->tn_mode & S_ISTXT) && VOP_ACCESS(dvp, VADMIN, cnp->cn_cred, cnp->cn_thread) && VOP_ACCESS(*vpp, VADMIN, cnp->cn_cred, cnp->cn_thread)) { error = EPERM; vput(*vpp); *vpp = NULL; goto out; } cnp->cn_flags |= SAVENAME; } else { error = tmpfs_alloc_vp(dvp->v_mount, tnode, cnp->cn_lkflags, vpp); } } } /* Store the result of this lookup in the cache. Avoid this if the * request was for creation, as it does not improve timings on * emprical tests. */ if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE) cache_enter(dvp, *vpp, cnp); out: /* If there were no errors, *vpp cannot be null and it must be * locked. */ MPASS(IFF(error == 0, *vpp != NULLVP && VOP_ISLOCKED(*vpp))); return error; }
/* * Do a generic nlookup. Note that the passed nd is not nlookup_done()'d * on return, even if an error occurs. If no error occurs or NLC_CREATE * is flagged and ENOENT is returned, then the returned nl_nch is always * referenced and locked exclusively. * * WARNING: For any general error other than ENOENT w/NLC_CREATE, the * the resulting nl_nch may or may not be locked and if locked * might be locked either shared or exclusive. * * Intermediate directory elements, including the current directory, require * execute (search) permission. nlookup does not examine the access * permissions on the returned element. * * If NLC_CREATE is set the last directory must allow node creation, * and an error code of 0 will be returned for a non-existant * target (not ENOENT). * * If NLC_RENAME_DST is set the last directory mut allow node deletion, * plus the sticky check is made, and an error code of 0 will be returned * for a non-existant target (not ENOENT). * * If NLC_DELETE is set the last directory mut allow node deletion, * plus the sticky check is made. * * If NLC_REFDVP is set nd->nl_dvp will be set to the directory vnode * of the returned entry. The vnode will be referenced, but not locked, * and will be released by nlookup_done() along with everything else. * * NOTE: As an optimization we attempt to obtain a shared namecache lock * on any intermediate elements. On success, the returned element * is ALWAYS locked exclusively. */ int nlookup(struct nlookupdata *nd) { globaldata_t gd = mycpu; struct nlcomponent nlc; struct nchandle nch; struct nchandle par; struct nchandle nctmp; struct mount *mp; struct vnode *hvp; /* hold to prevent recyclement */ int wasdotordotdot; char *ptr; char *nptr; int error; int len; int dflags; int hit = 1; int saveflag = nd->nl_flags & ~NLC_NCDIR; boolean_t doretry = FALSE; boolean_t inretry = FALSE; nlookup_start: #ifdef KTRACE if (KTRPOINT(nd->nl_td, KTR_NAMEI)) ktrnamei(nd->nl_td->td_lwp, nd->nl_path); #endif bzero(&nlc, sizeof(nlc)); /* * Setup for the loop. The current working namecache element is * always at least referenced. We lock it as required, but always * return a locked, resolved namecache entry. */ nd->nl_loopcnt = 0; if (nd->nl_dvp) { vrele(nd->nl_dvp); nd->nl_dvp = NULL; } ptr = nd->nl_path; /* * Loop on the path components. At the top of the loop nd->nl_nch * is ref'd and unlocked and represents our current position. */ for (;;) { /* * Make sure nl_nch is locked so we can access the vnode, resolution * state, etc. */ if ((nd->nl_flags & NLC_NCPISLOCKED) == 0) { nd->nl_flags |= NLC_NCPISLOCKED; cache_lock_maybe_shared(&nd->nl_nch, wantsexcllock(nd, ptr)); } /* * Check if the root directory should replace the current * directory. This is done at the start of a translation * or after a symbolic link has been found. In other cases * ptr will never be pointing at a '/'. */ if (*ptr == '/') { do { ++ptr; } while (*ptr == '/'); cache_unlock(&nd->nl_nch); cache_get_maybe_shared(&nd->nl_rootnch, &nch, wantsexcllock(nd, ptr)); if (nd->nl_flags & NLC_NCDIR) { cache_drop_ncdir(&nd->nl_nch); nd->nl_flags &= ~NLC_NCDIR; } else { cache_drop(&nd->nl_nch); } nd->nl_nch = nch; /* remains locked */ /* * Fast-track termination. There is no parent directory of * the root in the same mount from the point of view of * the caller so return EACCES if NLC_REFDVP is specified, * and EEXIST if NLC_CREATE is also specified. * e.g. 'rmdir /' or 'mkdir /' are not allowed. */ if (*ptr == 0) { if (nd->nl_flags & NLC_REFDVP) error = (nd->nl_flags & NLC_CREATE) ? 
EEXIST : EACCES; else error = 0; break; } continue; } /* * Pre-calculate next path component so we can check whether the * current component directory is the last directory in the path * or not. */ for (nptr = ptr; *nptr && *nptr != '/'; ++nptr) ; /* * Check directory search permissions (nd->nl_nch is locked & refd). * This will load dflags to obtain directory-special permissions to * be checked along with the last component. * * We only need to pass-in &dflags for the second-to-last component. * Optimize by passing-in NULL for any prior components, which may * allow the code to bypass the naccess() call. */ dflags = 0; if (*nptr == '/') error = naccess(&nd->nl_nch, NLC_EXEC, nd->nl_cred, NULL); else error = naccess(&nd->nl_nch, NLC_EXEC, nd->nl_cred, &dflags); if (error) { if (keeperror(nd, error)) break; error = 0; } /* * Extract the next (or last) path component. Path components are * limited to 255 characters. */ nlc.nlc_nameptr = ptr; nlc.nlc_namelen = nptr - ptr; ptr = nptr; if (nlc.nlc_namelen >= 256) { error = ENAMETOOLONG; break; } /* * Lookup the path component in the cache, creating an unresolved * entry if necessary. We have to handle "." and ".." as special * cases. * * When handling ".." we have to detect a traversal back through a * mount point. If we are at the root, ".." just returns the root. * * When handling "." or ".." we also have to recalculate dflags * since our dflags will be for some sub-directory instead of the * parent dir. * * This subsection returns a locked, refd 'nch' unless it errors out, * and an unlocked but still ref'd nd->nl_nch. * * The namecache topology is not allowed to be disconnected, so * encountering a NULL parent will generate EINVAL. This typically * occurs when a directory is removed out from under a process. * * WARNING! The unlocking of nd->nl_nch is sensitive code. */ KKASSERT(nd->nl_flags & NLC_NCPISLOCKED); if (nlc.nlc_namelen == 1 && nlc.nlc_nameptr[0] == '.') { cache_unlock(&nd->nl_nch); nd->nl_flags &= ~NLC_NCPISLOCKED; cache_get_maybe_shared(&nd->nl_nch, &nch, wantsexcllock(nd, ptr)); wasdotordotdot = 1; } else if (nlc.nlc_namelen == 2 && nlc.nlc_nameptr[0] == '.' && nlc.nlc_nameptr[1] == '.') { if (nd->nl_nch.mount == nd->nl_rootnch.mount && nd->nl_nch.ncp == nd->nl_rootnch.ncp ) { /* * ".." at the root returns the root */ cache_unlock(&nd->nl_nch); nd->nl_flags &= ~NLC_NCPISLOCKED; cache_get_maybe_shared(&nd->nl_nch, &nch, wantsexcllock(nd, ptr)); } else { /* * Locate the parent ncp. If we are at the root of a * filesystem mount we have to skip to the mounted-on * point in the underlying filesystem. * * Expect the parent to always be good since the * mountpoint doesn't go away. XXX hack. cache_get() * requires the ncp to already have a ref as a safety. * * However, a process which has been broken out of a chroot * will wind up with a NULL parent if it tries to '..' above * the real root, deal with the case. Note that this does * not protect us from a jail breakout, it just stops a panic * if the jail-broken process tries to '..' past the real * root. */ nctmp = nd->nl_nch; while (nctmp.ncp == nctmp.mount->mnt_ncmountpt.ncp) { nctmp = nctmp.mount->mnt_ncmounton; if (nctmp.ncp == NULL) break; } if (nctmp.ncp == NULL) { if (curthread->td_proc) { kprintf("vfs_nlookup: '..' 
traverse broke " "jail: pid %d (%s)\n", curthread->td_proc->p_pid, curthread->td_comm); } nctmp = nd->nl_rootnch; } else { nctmp.ncp = nctmp.ncp->nc_parent; } cache_hold(&nctmp); cache_unlock(&nd->nl_nch); nd->nl_flags &= ~NLC_NCPISLOCKED; cache_get_maybe_shared(&nctmp, &nch, wantsexcllock(nd, ptr)); cache_drop(&nctmp); /* NOTE: zero's nctmp */ } wasdotordotdot = 2; } else { /* * Must unlock nl_nch when traversing down the path. However, * the child ncp has not yet been found/created and the parent's * child list might be empty. Thus releasing the lock can * allow a race whereby the parent ncp's vnode is recycled. * This case can occur especially when maxvnodes is set very low. * * We need the parent's ncp to remain resolved for all normal * filesystem activities, so we vhold() the vp during the lookup * to prevent recyclement due to vnlru / maxvnodes. * * If we race an unlink or rename the ncp might be marked * DESTROYED after resolution, requiring a retry. */ if ((hvp = nd->nl_nch.ncp->nc_vp) != NULL) vhold(hvp); cache_unlock(&nd->nl_nch); nd->nl_flags &= ~NLC_NCPISLOCKED; error = cache_nlookup_maybe_shared(&nd->nl_nch, &nlc, wantsexcllock(nd, ptr), &nch); if (error == EWOULDBLOCK) { nch = cache_nlookup(&nd->nl_nch, &nlc); if (nch.ncp->nc_flag & NCF_UNRESOLVED) hit = 0; for (;;) { error = cache_resolve(&nch, nd->nl_cred); if (error != EAGAIN && (nch.ncp->nc_flag & NCF_DESTROYED) == 0) { if (error == ESTALE) { if (!inretry) error = ENOENT; doretry = TRUE; } break; } kprintf("[diagnostic] nlookup: relookup %*.*s\n", nch.ncp->nc_nlen, nch.ncp->nc_nlen, nch.ncp->nc_name); cache_put(&nch); nch = cache_nlookup(&nd->nl_nch, &nlc); } } if (hvp) vdrop(hvp); wasdotordotdot = 0; } /* * If the last component was "." or ".." our dflags no longer * represents the parent directory and we have to explicitly * look it up. * * Expect the parent to be good since nch is locked. */ if (wasdotordotdot && error == 0) { dflags = 0; if ((par.ncp = nch.ncp->nc_parent) != NULL) { par.mount = nch.mount; cache_hold(&par); cache_lock_maybe_shared(&par, wantsexcllock(nd, ptr)); error = naccess(&par, 0, nd->nl_cred, &dflags); cache_put(&par); if (error) { if (!keeperror(nd, error)) error = 0; } } } /* * [end of subsection] * * nch is locked and referenced. * nd->nl_nch is unlocked and referenced. * * nl_nch must be unlocked or we could chain lock to the root * if a resolve gets stuck (e.g. in NFS). */ KKASSERT((nd->nl_flags & NLC_NCPISLOCKED) == 0); /* * Resolve the namespace if necessary. The ncp returned by * cache_nlookup() is referenced and locked. * * XXX neither '.' nor '..' should return EAGAIN since they were * previously resolved and thus cannot be newly created ncp's. */ if (nch.ncp->nc_flag & NCF_UNRESOLVED) { hit = 0; error = cache_resolve(&nch, nd->nl_cred); if (error == ESTALE) { if (!inretry) error = ENOENT; doretry = TRUE; } KKASSERT(error != EAGAIN); } else { error = nch.ncp->nc_error; } /* * Early completion. ENOENT is not an error if this is the last * component and NLC_CREATE or NLC_RENAME (rename target) was * requested. Note that ncp->nc_error is left as ENOENT in that * case, which we check later on. * * Also handle invalid '.' or '..' components terminating a path * for a create/rename/delete. The standard requires this and pax * pretty stupidly depends on it. 
*/ if (islastelement(ptr)) { if (error == ENOENT && (nd->nl_flags & (NLC_CREATE | NLC_RENAME_DST)) ) { if (nd->nl_flags & NLC_NFS_RDONLY) { error = EROFS; } else { error = naccess(&nch, nd->nl_flags | dflags, nd->nl_cred, NULL); } } if (error == 0 && wasdotordotdot && (nd->nl_flags & (NLC_CREATE | NLC_DELETE | NLC_RENAME_SRC | NLC_RENAME_DST))) { /* * POSIX junk */ if (nd->nl_flags & NLC_CREATE) error = EEXIST; else if (nd->nl_flags & NLC_DELETE) error = (wasdotordotdot == 1) ? EINVAL : ENOTEMPTY; else error = EINVAL; } } /* * Early completion on error. */ if (error) { cache_put(&nch); break; } /* * If the element is a symlink and it is either not the last * element or it is the last element and we are allowed to * follow symlinks, resolve the symlink. */ if ((nch.ncp->nc_flag & NCF_ISSYMLINK) && (*ptr || (nd->nl_flags & NLC_FOLLOW)) ) { if (nd->nl_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; cache_put(&nch); break; } error = nreadsymlink(nd, &nch, &nlc); cache_put(&nch); if (error) break; /* * Concatenate trailing path elements onto the returned symlink. * Note that if the path component (ptr) is not exhausted, it * will being with a '/', so we do not have to add another one. * * The symlink may not be empty. */ len = strlen(ptr); if (nlc.nlc_namelen == 0 || nlc.nlc_namelen + len >= MAXPATHLEN) { error = nlc.nlc_namelen ? ENAMETOOLONG : ENOENT; objcache_put(namei_oc, nlc.nlc_nameptr); break; } bcopy(ptr, nlc.nlc_nameptr + nlc.nlc_namelen, len + 1); if (nd->nl_flags & NLC_HASBUF) objcache_put(namei_oc, nd->nl_path); nd->nl_path = nlc.nlc_nameptr; nd->nl_flags |= NLC_HASBUF; ptr = nd->nl_path; /* * Go back up to the top to resolve any initial '/'s in the * symlink. */ continue; } /* * If the element is a directory and we are crossing a mount point, * Locate the mount. */ while ((nch.ncp->nc_flag & NCF_ISMOUNTPT) && (nd->nl_flags & NLC_NOCROSSMOUNT) == 0 && (mp = cache_findmount(&nch)) != NULL ) { struct vnode *tdp; int vfs_do_busy = 0; /* * VFS must be busied before the namecache entry is locked, * but we don't want to waste time calling vfs_busy() if the * mount point is already resolved. */ again: cache_put(&nch); if (vfs_do_busy) { while (vfs_busy(mp, 0)) { if (mp->mnt_kern_flag & MNTK_UNMOUNT) { kprintf("nlookup: warning umount race avoided\n"); cache_dropmount(mp); error = EBUSY; vfs_do_busy = 0; goto double_break; } } } cache_get_maybe_shared(&mp->mnt_ncmountpt, &nch, wantsexcllock(nd, ptr)); if (nch.ncp->nc_flag & NCF_UNRESOLVED) { if (vfs_do_busy == 0) { vfs_do_busy = 1; goto again; } error = VFS_ROOT(mp, &tdp); vfs_unbusy(mp); vfs_do_busy = 0; if (keeperror(nd, error)) { cache_dropmount(mp); break; } if (error == 0) { cache_setvp(&nch, tdp); vput(tdp); } } if (vfs_do_busy) vfs_unbusy(mp); cache_dropmount(mp); } if (keeperror(nd, error)) { cache_put(&nch); double_break: break; } /* * Skip any slashes to get to the next element. If there * are any slashes at all the current element must be a * directory or, in the create case, intended to become a directory. * If it isn't we break without incrementing ptr and fall through * to the failure case below. */ while (*ptr == '/') { if ((nch.ncp->nc_flag & NCF_ISDIR) == 0 && !(nd->nl_flags & NLC_WILLBEDIR) ) { break; } ++ptr; } /* * Continuation case: additional elements and the current * element is a directory. 
*/ if (*ptr && (nch.ncp->nc_flag & NCF_ISDIR)) { if (nd->nl_flags & NLC_NCDIR) { cache_drop_ncdir(&nd->nl_nch); nd->nl_flags &= ~NLC_NCDIR; } else { cache_drop(&nd->nl_nch); } cache_unlock(&nch); KKASSERT((nd->nl_flags & NLC_NCPISLOCKED) == 0); nd->nl_nch = nch; continue; } /* * Failure case: additional elements and the current element * is not a directory */ if (*ptr) { cache_put(&nch); error = ENOTDIR; break; } /* * Successful lookup of last element. * * Check permissions if the target exists. If the target does not * exist directory permissions were already tested in the early * completion code above. * * nd->nl_flags will be adjusted on return with NLC_APPENDONLY * if the file is marked append-only, and NLC_STICKY if the directory * containing the file is sticky. */ if (nch.ncp->nc_vp && (nd->nl_flags & NLC_ALLCHKS)) { error = naccess(&nch, nd->nl_flags | dflags, nd->nl_cred, NULL); if (keeperror(nd, error)) { cache_put(&nch); break; } } /* * Termination: no more elements. * * If NLC_REFDVP is set acquire a referenced parent dvp. */ if (nd->nl_flags & NLC_REFDVP) { cache_lock(&nd->nl_nch); error = cache_vref(&nd->nl_nch, nd->nl_cred, &nd->nl_dvp); cache_unlock(&nd->nl_nch); if (keeperror(nd, error)) { kprintf("NLC_REFDVP: Cannot ref dvp of %p\n", nch.ncp); cache_put(&nch); break; } } if (nd->nl_flags & NLC_NCDIR) { cache_drop_ncdir(&nd->nl_nch); nd->nl_flags &= ~NLC_NCDIR; } else { cache_drop(&nd->nl_nch); } nd->nl_nch = nch; nd->nl_flags |= NLC_NCPISLOCKED; error = 0; break; } if (hit) ++gd->gd_nchstats->ncs_longhits; else ++gd->gd_nchstats->ncs_longmiss; if (nd->nl_flags & NLC_NCPISLOCKED) KKASSERT(cache_lockstatus(&nd->nl_nch) > 0); /* * Retry the whole thing if doretry flag is set, but only once. * autofs(5) may mount another filesystem under its root directory * while resolving a path. */ if (doretry && !inretry) { inretry = TRUE; nd->nl_flags &= NLC_NCDIR; nd->nl_flags |= saveflag; goto nlookup_start; } /* * NOTE: If NLC_CREATE was set the ncp may represent a negative hit * (ncp->nc_error will be ENOENT), but we will still return an error * code of 0. */ return(error); }
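/*
 * Minimal userland sketch of the component scan at the heart of nlookup()
 * above: runs of '/' separators are skipped, each component is limited to
 * 255 characters, and "." / ".." are recognized as special cases.  It only
 * prints what it finds; none of the namecache locking, mount-point
 * crossing, or symlink handling done by the real function is attempted
 * here.  walk_components is a hypothetical name.
 */
#include <stdio.h>

static int
walk_components(const char *path)
{
    const char *ptr = path, *nptr;
    size_t len;

    while (*ptr != '\0') {
        /* Collapse any run of '/' separators. */
        while (*ptr == '/')
            ptr++;
        if (*ptr == '\0')
            break;
        for (nptr = ptr; *nptr != '\0' && *nptr != '/'; nptr++)
            continue;
        len = (size_t)(nptr - ptr);
        if (len >= 256)
            return (-1);        /* ENAMETOOLONG in the real code */
        if (len == 1 && ptr[0] == '.')
            printf("  .   (stay in the current directory)\n");
        else if (len == 2 && ptr[0] == '.' && ptr[1] == '.')
            printf("  ..  (go to the parent, clamped at the root)\n");
        else
            printf("  component \"%.*s\"\n", (int)len, ptr);
        ptr = nptr;
    }
    return (0);
}

int
main(void)
{
    walk_components("/usr//local/./bin/../sbin");
    return (0);
}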
/* * Allocates a new vnode for the node node or returns a new reference to * an existing one if the node had already a vnode referencing it. The * resulting locked vnode is returned in *vpp. * * Returns zero on success or an appropriate error code on failure. * * The caller must ensure that node cannot go away (usually by holding * the related directory entry). * * If dnode is non-NULL this routine avoids deadlocking against it but * can return EAGAIN. Caller must try again. The dnode lock will cycle * in this case, it remains locked on return in all cases. dnode must * be shared-locked. */ int tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *dnode, struct tmpfs_node *node, int lkflag, struct vnode **vpp) { int error = 0; struct vnode *vp; loop: /* * Interlocked extraction from node. This can race many things. * We have to get a soft reference on the vnode while we hold * the node locked, then acquire it properly and check for races. */ TMPFS_NODE_LOCK(node); if ((vp = node->tn_vnode) != NULL) { KKASSERT((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0); vhold(vp); TMPFS_NODE_UNLOCK(node); if (dnode) { /* * Special-case handling to avoid deadlocking against * dnode. This case has been validated and occurs * every so often during synth builds. */ if (vget(vp, (lkflag & ~LK_RETRY) | LK_NOWAIT | LK_EXCLUSIVE) != 0) { TMPFS_NODE_UNLOCK(dnode); if (vget(vp, (lkflag & ~LK_RETRY) | LK_SLEEPFAIL | LK_EXCLUSIVE) == 0) { vn_unlock(vp); } vdrop(vp); TMPFS_NODE_LOCK_SH(dnode); return EAGAIN; } } else { /* * Normal path */ if (vget(vp, lkflag | LK_EXCLUSIVE) != 0) { vdrop(vp); goto loop; } } if (node->tn_vnode != vp) { vput(vp); vdrop(vp); goto loop; } vdrop(vp); goto out; } /* vp is NULL */ /* * This should never happen. */ if (node->tn_vpstate & TMPFS_VNODE_DOOMED) { TMPFS_NODE_UNLOCK(node); error = ENOENT; goto out; } /* * Interlock against other calls to tmpfs_alloc_vp() trying to * allocate and assign a vp to node. */ if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) { node->tn_vpstate |= TMPFS_VNODE_WANT; error = tsleep(&node->tn_vpstate, PINTERLOCKED | PCATCH, "tmpfs_alloc_vp", 0); TMPFS_NODE_UNLOCK(node); if (error) return error; goto loop; } node->tn_vpstate |= TMPFS_VNODE_ALLOCATING; TMPFS_NODE_UNLOCK(node); /* * Allocate a new vnode (may block). The ALLOCATING flag should * prevent a race against someone else assigning node->tn_vnode. */ error = getnewvnode(VT_TMPFS, mp, &vp, VLKTIMEOUT, LK_CANRECURSE); if (error != 0) goto unlock; KKASSERT(node->tn_vnode == NULL); KKASSERT(vp != NULL); vp->v_data = node; vp->v_type = node->tn_type; /* Type-specific initialization. */ switch (node->tn_type) { case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VSOCK: break; case VREG: /* * VMIO is mandatory. Tmpfs also supports KVABIO * for its tmpfs_strategy(). */ vsetflags(vp, VKVABIO); vinitvmio(vp, node->tn_size, TMPFS_BLKSIZE, -1); break; case VLNK: break; case VFIFO: vp->v_ops = &mp->mnt_vn_fifo_ops; break; case VDIR: break; default: panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type); } unlock: TMPFS_NODE_LOCK(node); KKASSERT(node->tn_vpstate & TMPFS_VNODE_ALLOCATING); node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING; node->tn_vnode = vp; if (node->tn_vpstate & TMPFS_VNODE_WANT) { node->tn_vpstate &= ~TMPFS_VNODE_WANT; TMPFS_NODE_UNLOCK(node); wakeup(&node->tn_vpstate); } else { TMPFS_NODE_UNLOCK(node); } out: *vpp = vp; KKASSERT(IFF(error == 0, *vpp != NULL && vn_islocked(*vpp))); return error; }
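/*
 * Userland analogue (not kernel code) of the TMPFS_VNODE_ALLOCATING /
 * TMPFS_VNODE_WANT interlock in tmpfs_alloc_vp() above, using a pthread
 * mutex and condition variable in place of tsleep()/wakeup().  Exactly one
 * thread performs the possibly blocking allocation with the node lock
 * dropped; everyone else waits and then re-checks.  The structure and the
 * names (struct node, node_get_vnode, allocate_vnode) are hypothetical, and
 * the caller is assumed to have initialized the mutex and condvar.
 */
#include <pthread.h>
#include <stdlib.h>

struct node {
    pthread_mutex_t lock;
    pthread_cond_t cv;
    int allocating;             /* cf. TMPFS_VNODE_ALLOCATING */
    void *vnode;                /* cf. tn_vnode */
};

/* Stand-in for getnewvnode(): may block, so the node lock is not held. */
static void *
allocate_vnode(void)
{
    return (malloc(1));
}

static void *
node_get_vnode(struct node *n)
{
    void *vp;

    pthread_mutex_lock(&n->lock);
    for (;;) {
        if (n->vnode != NULL) {
            vp = n->vnode;
            break;
        }
        if (n->allocating) {
            /* Someone else is already allocating; wait and recheck. */
            pthread_cond_wait(&n->cv, &n->lock);
            continue;
        }
        /* We won the race: allocate with the node lock dropped. */
        n->allocating = 1;
        pthread_mutex_unlock(&n->lock);
        vp = allocate_vnode();
        pthread_mutex_lock(&n->lock);
        n->allocating = 0;
        n->vnode = vp;
        pthread_cond_broadcast(&n->cv);
        break;
    }
    pthread_mutex_unlock(&n->lock);
    return (vp);
}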
/*
 * We have to carry on the locking protocol on the null layer vnodes
 * as we progress through the tree.  We also have to enforce read-only
 * if this layer is mounted read-only.
 */
static int
null_lookup(struct vop_lookup_args *ap)
{
    struct componentname *cnp = ap->a_cnp;
    struct vnode *dvp = ap->a_dvp;
    int flags = cnp->cn_flags;
    struct vnode *vp, *ldvp, *lvp;
    struct mount *mp;
    int error;

    mp = dvp->v_mount;
    if ((flags & ISLASTCN) != 0 && (mp->mnt_flag & MNT_RDONLY) != 0 &&
        (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
        return (EROFS);
    /*
     * Although it is possible to call null_bypass(), we'll do
     * a direct call to reduce overhead
     */
    ldvp = NULLVPTOLOWERVP(dvp);
    vp = lvp = NULL;
    KASSERT((ldvp->v_vflag & VV_ROOT) == 0 ||
        ((dvp->v_vflag & VV_ROOT) != 0 && (flags & ISDOTDOT) == 0),
        ("ldvp %p fl %#x dvp %p fl %#x flags %#x", ldvp, ldvp->v_vflag,
        dvp, dvp->v_vflag, flags));

    /*
     * Hold ldvp.  The reference on it, owned by dvp, is lost in
     * case of dvp reclamation, and we need ldvp to move our lock
     * from ldvp to dvp.
     */
    vhold(ldvp);

    error = VOP_LOOKUP(ldvp, &lvp, cnp);

    /*
     * VOP_LOOKUP() on lower vnode may unlock ldvp, which allows
     * dvp to be reclaimed due to shared v_vnlock.  Check for the
     * doomed state and return error.
     */
    if ((error == 0 || error == EJUSTRETURN) &&
        (dvp->v_iflag & VI_DOOMED) != 0) {
        error = ENOENT;
        if (lvp != NULL)
            vput(lvp);

        /*
         * If vgone() reclaimed dvp before curthread
         * relocked ldvp, the locks of dvp and ldvp are no
         * longer shared.  In this case, relock of ldvp in
         * lower fs VOP_LOOKUP() does not restore the locking
         * state of dvp.  Compensate for this by unlocking
         * ldvp and locking dvp, which is also correct if the
         * locks are still shared.
         */
        VOP_UNLOCK(ldvp, 0);
        vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
    }
    vdrop(ldvp);

    if (error == EJUSTRETURN && (flags & ISLASTCN) != 0 &&
        (mp->mnt_flag & MNT_RDONLY) != 0 &&
        (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME))
        error = EROFS;

    if ((error == 0 || error == EJUSTRETURN) && lvp != NULL) {
        if (ldvp == lvp) {
            *ap->a_vpp = dvp;
            VREF(dvp);
            vrele(lvp);
        } else {
            error = null_nodeget(mp, lvp, &vp);
            if (error == 0)
                *ap->a_vpp = vp;
        }
    }
    return (error);
}