int
ttm_tt_swapin(struct ttm_tt *ttm)
{
	vm_object_t obj;
	vm_page_t from_page, to_page;
	int i, ret, rv;

	obj = ttm->swap_storage;
	VM_OBJECT_LOCK(obj);
	vm_object_pip_add(obj, 1);
	for (i = 0; i < ttm->num_pages; ++i) {
		from_page = vm_page_grab(obj, i,
		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
		if (from_page->valid != VM_PAGE_BITS_ALL) {
			if (vm_pager_has_page(obj, i)) {
				rv = vm_pager_get_page(obj, &from_page, 1);
				if (rv != VM_PAGER_OK) {
					vm_page_free(from_page);
					ret = -EIO;
					goto err_ret;
				}
			} else {
				vm_page_zero_invalid(from_page, TRUE);
			}
		}
		to_page = ttm->pages[i];
		if (unlikely(to_page == NULL)) {
			ret = -ENOMEM;
			vm_page_wakeup(from_page);
			goto err_ret;
		}
		pmap_copy_page(VM_PAGE_TO_PHYS(from_page),
		    VM_PAGE_TO_PHYS(to_page));
		vm_page_wakeup(from_page);
	}
	vm_object_pip_wakeup(obj);
	VM_OBJECT_UNLOCK(obj);

	if (!(ttm->page_flags & TTM_PAGE_FLAG_PERSISTENT_SWAP))
		vm_object_deallocate(obj);
	ttm->swap_storage = NULL;
	ttm->page_flags &= ~TTM_PAGE_FLAG_SWAPPED;
	return (0);

err_ret:
	vm_object_pip_wakeup(obj);
	VM_OBJECT_UNLOCK(obj);
	return (ret);
}
/* --------------------------------------------------------------------- */

static int
tmpfs_nocacheread(vm_object_t tobj, vm_pindex_t idx, vm_offset_t offset,
    size_t tlen, struct uio *uio)
{
	vm_page_t m;
	int error;

	VM_OBJECT_LOCK(tobj);
	vm_object_pip_add(tobj, 1);
	m = vm_page_grab(tobj, idx, VM_ALLOC_WIRED | VM_ALLOC_ZERO |
	    VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
	if (m->valid != VM_PAGE_BITS_ALL) {
		if (vm_pager_has_page(tobj, idx, NULL, NULL)) {
			error = vm_pager_get_pages(tobj, &m, 1, 0);
			if (error != 0) {
				printf("tmpfs get pages from pager error [read]\n");
				goto out;
			}
		} else
			vm_page_zero_invalid(m, TRUE);
	}
	VM_OBJECT_UNLOCK(tobj);
	error = uiomove_fromphys(&m, offset, tlen, uio);
	VM_OBJECT_LOCK(tobj);
out:
	vm_page_lock(m);
	vm_page_unwire(m, TRUE);
	vm_page_unlock(m);
	vm_page_wakeup(m);
	vm_object_pip_subtract(tobj, 1);
	VM_OBJECT_UNLOCK(tobj);

	return (error);
}
/* --------------------------------------------------------------------- */

static int
tmpfs_nocacheread(vm_object_t tobj, vm_pindex_t idx, vm_offset_t offset,
    size_t tlen, struct uio *uio)
{
	vm_page_t m;
	int error, rv;

	VM_OBJECT_LOCK(tobj);
	m = vm_page_grab(tobj, idx, VM_ALLOC_WIRED | VM_ALLOC_NORMAL |
	    VM_ALLOC_RETRY);
	if (m->valid != VM_PAGE_BITS_ALL) {
		if (vm_pager_has_page(tobj, idx, NULL, NULL)) {
			rv = vm_pager_get_pages(tobj, &m, 1, 0);
			if (rv != VM_PAGER_OK) {
				vm_page_lock(m);
				vm_page_free(m);
				vm_page_unlock(m);
				VM_OBJECT_UNLOCK(tobj);
				return (EIO);
			}
		} else
			vm_page_zero_invalid(m, TRUE);
	}
	VM_OBJECT_UNLOCK(tobj);
	error = uiomove_fromphys(&m, offset, tlen, uio);
	VM_OBJECT_LOCK(tobj);
	vm_page_lock(m);
	vm_page_unwire(m, TRUE);
	vm_page_unlock(m);
	vm_page_wakeup(m);
	VM_OBJECT_UNLOCK(tobj);

	return (error);
}
vm_page_t
shmem_read_mapping_page(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;
	int rv;

	VM_OBJECT_LOCK_ASSERT_OWNED(object);
	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
	if (m->valid != VM_PAGE_BITS_ALL) {
		if (vm_pager_has_page(object, pindex)) {
			rv = vm_pager_get_page(object, &m, 1);
			m = vm_page_lookup(object, pindex);
			if (m == NULL)
				return ERR_PTR(-ENOMEM);
			if (rv != VM_PAGER_OK) {
				vm_page_free(m);
				return ERR_PTR(-ENOMEM);
			}
		} else {
			pmap_zero_page(VM_PAGE_TO_PHYS(m));
			m->valid = VM_PAGE_BITS_ALL;
			m->dirty = 0;
		}
	}
	vm_page_wire(m);
	vm_page_wakeup(m);
	return (m);
}
/*
 * Release a page busied for a getpages operation.  The page may have become
 * wired (typically due to being used by the buffer cache) or otherwise been
 * soft-busied and cannot be freed in that case.  A held page can still be
 * freed.
 */
void
vnode_pager_freepage(vm_page_t m)
{
	if (m->busy || m->wire_count || (m->flags & PG_NEED_COMMIT)) {
		vm_page_activate(m);
		vm_page_wakeup(m);
	} else {
		vm_page_free(m);
	}
}
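/*
 * Hedged usage sketch (not part of the original source): a getpages
 * implementation that hits an I/O error typically hands every page except
 * the caller-owned requested page to vnode_pager_freepage(), which either
 * frees the page or, when it cannot be freed, re-activates and wakes it.
 * The helper and its parameter names below are illustrative only.
 */
static void
example_getpages_error_cleanup(vm_page_t *pages, int npages, int reqpage)
{
	int i;

	for (i = 0; i < npages; i++) {
		if (i != reqpage)
			vnode_pager_freepage(pages[i]);
	}
}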
static inline void
release_page(struct faultstate *fs)
{
	vm_page_wakeup(fs->m);
	vm_page_lock(fs->m);
	vm_page_deactivate(fs->m);
	vm_page_unlock(fs->m);
	fs->m = NULL;
}
/* XXX */
void
cdev_pager_free_page(vm_object_t object, vm_page_t m)
{
	if (object->type == OBJT_MGTDEVICE) {
		KKASSERT((m->flags & PG_FICTITIOUS) != 0);
		pmap_page_protect(m, VM_PROT_NONE);
		vm_page_remove(m);
		vm_page_wakeup(m);
	} else if (object->type == OBJT_DEVICE) {
		TAILQ_REMOVE(&object->un_pager.devp.devp_pglist, m, pageq);
		dev_pager_putfake(m);
	}
}
int
ttm_tt_swapout(struct ttm_tt *ttm, vm_object_t persistent_swap_storage)
{
	vm_object_t obj;
	vm_page_t from_page, to_page;
	int i;

	BUG_ON(ttm->state != tt_unbound && ttm->state != tt_unpopulated);
	BUG_ON(ttm->caching_state != tt_cached);

	if (!persistent_swap_storage) {
		obj = swap_pager_alloc(NULL, IDX_TO_OFF(ttm->num_pages),
		    VM_PROT_DEFAULT, 0);
		if (obj == NULL) {
			pr_err("Failed allocating swap storage\n");
			return (-ENOMEM);
		}
	} else
		obj = persistent_swap_storage;

	VM_OBJECT_LOCK(obj);
	vm_object_pip_add(obj, 1);
	for (i = 0; i < ttm->num_pages; ++i) {
		from_page = ttm->pages[i];
		if (unlikely(from_page == NULL))
			continue;
		to_page = vm_page_grab(obj, i,
		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
		pmap_copy_page(VM_PAGE_TO_PHYS(from_page),
		    VM_PAGE_TO_PHYS(to_page));
		to_page->valid = VM_PAGE_BITS_ALL;
		vm_page_dirty(to_page);
		vm_page_wakeup(to_page);
	}
	vm_object_pip_wakeup(obj);
	VM_OBJECT_UNLOCK(obj);

	ttm->bdev->driver->ttm_tt_unpopulate(ttm);
	ttm->swap_storage = obj;
	ttm->page_flags |= TTM_PAGE_FLAG_SWAPPED;
	if (persistent_swap_storage)
		ttm->page_flags |= TTM_PAGE_FLAG_PERSISTENT_SWAP;

	return 0;
}
int
ttm_tt_swapout(struct ttm_tt *ttm, vm_object_t persistent_swap_storage)
{
	vm_object_t obj;
	vm_page_t from_page, to_page;
	int i;

	MPASS(ttm->state == tt_unbound || ttm->state == tt_unpopulated);
	MPASS(ttm->caching_state == tt_cached);

	if (persistent_swap_storage == NULL) {
		obj = vm_pager_allocate(OBJT_SWAP, NULL,
		    IDX_TO_OFF(ttm->num_pages), VM_PROT_DEFAULT, 0,
		    curthread->td_ucred);
		if (obj == NULL) {
			printf("[TTM] Failed allocating swap storage\n");
			return (-ENOMEM);
		}
	} else
		obj = persistent_swap_storage;

	VM_OBJECT_WLOCK(obj);
	vm_object_pip_add(obj, 1);
	for (i = 0; i < ttm->num_pages; ++i) {
		from_page = ttm->pages[i];
		if (unlikely(from_page == NULL))
			continue;
		to_page = vm_page_grab(obj, i, VM_ALLOC_RETRY);
		pmap_copy_page(from_page, to_page);
		vm_page_dirty(to_page);
		to_page->valid = VM_PAGE_BITS_ALL;
		vm_page_wakeup(to_page);
	}
	vm_object_pip_wakeup(obj);
	VM_OBJECT_WUNLOCK(obj);

	ttm->bdev->driver->ttm_tt_unpopulate(ttm);
	ttm->swap_storage = obj;
	ttm->page_flags |= TTM_PAGE_FLAG_SWAPPED;
	if (persistent_swap_storage != NULL)
		ttm->page_flags |= TTM_PAGE_FLAG_PERSISTENT_SWAP;

	return (0);
}
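/*
 * Hedged usage sketch (not part of the original source): ttm_tt_swapout()
 * above leaves TTM_PAGE_FLAG_SWAPPED set and ttm_tt_swapin() clears it
 * again, so a caller about to repopulate or bind a ttm would normally key
 * the swapin off that flag.  The helper name below is hypothetical.
 */
static int
ttm_tt_swapin_if_needed(struct ttm_tt *ttm)	/* hypothetical helper */
{
	int ret;

	if (ttm->page_flags & TTM_PAGE_FLAG_SWAPPED) {
		ret = ttm_tt_swapin(ttm);	/* version shown earlier */
		if (ret != 0)
			return (ret);		/* negative errno on failure */
	}
	return (0);
}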
/*
 * Fill as many pages as vm_fault has allocated for us.
 */
static int
phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
{
	int i;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	for (i = 0; i < count; i++) {
		if (m[i]->valid == 0) {
			if ((m[i]->flags & PG_ZERO) == 0)
				pmap_zero_page(m[i]);
			m[i]->valid = VM_PAGE_BITS_ALL;
		}
		KASSERT(m[i]->valid == VM_PAGE_BITS_ALL,
		    ("phys_pager_getpages: partially valid page %p", m[i]));
		KASSERT(m[i]->dirty == 0,
		    ("phys_pager_getpages: dirty page %p", m[i]));
		/* The requested page must remain busy, the others not. */
		if (i == reqpage)
			vm_page_flash(m[i]);
		else
			vm_page_wakeup(m[i]);
	}
	return (VM_PAGER_OK);
}
/*
 * Vnode op for VM getpages.
 * Wish list: get rid of the multiple I/O routines.
 *
 * nwfs_getpages(struct vnode *a_vp, vm_page_t *a_m, int a_count,
 *		 int a_reqpage, vm_ooffset_t a_offset)
 */
int
nwfs_getpages(struct vop_getpages_args *ap)
{
#ifndef NWFS_RWCACHE
	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
	    ap->a_reqpage, ap->a_seqaccess);
#else
	int i, error, npages;
	size_t nextoff, toff;
	size_t count;
	size_t size;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct thread *td = curthread;	/* XXX */
	struct ucred *cred;
	struct nwmount *nmp;
	struct nwnode *np;
	vm_page_t *pages;

	KKASSERT(td->td_proc);
	cred = td->td_proc->p_ucred;

	vp = ap->a_vp;
	np = VTONW(vp);
	nmp = VFSTONWFS(vp->v_mount);
	pages = ap->a_m;
	count = (size_t)ap->a_count;

	if (vp->v_object == NULL) {
		kprintf("nwfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}

	bp = getpbuf_kva(&nwfs_pbuf_freecnt);
	npages = btoc(count);
	kva = (vm_offset_t)bp->b_data;
	pmap_qenter(kva, pages, npages);

	iov.iov_base = (caddr_t)kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = td;

	error = ncp_read(NWFSTOCONN(nmp), &np->n_fh, &uio, cred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &nwfs_pbuf_freecnt);

	if (error && (uio.uio_resid == count)) {
		kprintf("nwfs_getpages: error %d\n", error);
		for (i = 0; i < npages; i++) {
			if (ap->a_reqpage != i)
				vnode_pager_freepage(pages[i]);
		}
		return VM_PAGER_ERROR;
	}

	size = count - uio.uio_resid;

	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;

		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		m->flags &= ~PG_ZERO;

		/*
		 * NOTE: The pmap dirty bit should have already been cleared.
		 *	 We do not clear it here.
		 */
		if (nextoff <= size) {
			m->valid = VM_PAGE_BITS_ALL;
			m->dirty = 0;
		} else {
			int nvalid = ((size + DEV_BSIZE - 1) - toff) &
			    ~(DEV_BSIZE - 1);
			vm_page_set_validclean(m, 0, nvalid);
		}

		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Result:
			 * empirically, deactivating the pages appears to be
			 * best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error) {
				if (m->flags & PG_REFERENCED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vnode_pager_freepage(m);
			}
		}
	}
	return 0;
#endif /* NWFS_RWCACHE */
}
int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { vm_prot_t prot; long ahead, behind; int alloc_req, era, faultcount, nera, reqpage, result; boolean_t growstack, is_first_object_locked, wired; int map_generation; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ_MAX]; int hardfault; struct faultstate fs; struct vnode *vp; int locked, error; hardfault = 0; growstack = TRUE; PCPU_INC(cnt.v_vm_faults); fs.vp = NULL; faultcount = reqpage = 0; RetryFault:; /* * Find the backing store object and offset into it to begin the * search. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { if (growstack && result == KERN_INVALID_ADDRESS && map != kernel_map) { result = vm_map_growstack(curproc, vaddr); if (result != KERN_SUCCESS) return (KERN_FAILURE); growstack = FALSE; goto RetryFault; } return (result); } map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { panic("vm_fault: fault on nofault entry, addr: %lx", (u_long)vaddr); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. This must be done after * obtaining the vnode lock in order to avoid possible deadlocks. */ VM_OBJECT_WLOCK(fs.first_object); vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.lookup_still_valid = TRUE; if (wired) fault_type = prot | (fault_type & VM_PROT_COPY); fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { /* * If the object is dead, we stop here */ if (fs.object->flags & OBJ_DEAD) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { /* * check for page-based copy on write. * We check fs.object == fs.first_object so * as to ensure the legacy COW mechanism is * used when the page in question is part of * a shadow object. Otherwise, vm_page_cowfault() * removes the page from the backing object, * which is not what we want. */ vm_page_lock(fs.m); if ((fs.m->cow) && (fault_type & VM_PROT_WRITE) && (fs.object == fs.first_object)) { vm_page_cowfault(fs.m); unlock_and_deallocate(&fs); goto RetryFault; } /* * Wait/Retry if the page is busy. We have to do this * if the page is busy via either VPO_BUSY or * vm_page_t->busy because the vm_pager may be using * vm_page_t->busy for pageouts ( and even pageins if * it is the vnode pager ), and we could end up trying * to pagein and pageout the same page simultaneously. * * We can theoretically allow the busy case on a read * fault if the page is marked valid, but since such * pages are typically already pmap'd, putting that * special case in might be more effort then it is * worth. We cannot under any circumstances mess * around with a vm_page_t->busy page except, perhaps, * to pmap it. */ if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. 
*/ vm_page_aflag_set(fs.m, PGA_REFERENCED); vm_page_unlock(fs.m); if (fs.object != fs.first_object) { if (!VM_OBJECT_TRYWLOCK( fs.first_object)) { VM_OBJECT_WUNLOCK(fs.object); VM_OBJECT_WLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.object); } vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); vm_object_pip_wakeup(fs.first_object); VM_OBJECT_WUNLOCK(fs.first_object); fs.first_m = NULL; } unlock_map(&fs); if (fs.m == vm_page_lookup(fs.object, fs.pindex)) { vm_page_sleep_if_busy(fs.m, TRUE, "vmpfw"); } vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); PCPU_INC(cnt.v_intrans); vm_object_deallocate(fs.first_object); goto RetryFault; } vm_page_remque(fs.m); vm_page_unlock(fs.m); /* * Mark page busy for other processes, and the * pagedaemon. If it still isn't completely valid * (readable), jump to readrest, else break-out ( we * found the page ). */ vm_page_busy(fs.m); if (fs.m->valid != VM_PAGE_BITS_ALL) goto readrest; break; } /* * Page is not resident, If this is the search termination * or the pager might contain the page, allocate a new page. */ if (TRYPAGER || fs.object == fs.first_object) { if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At * worst, the P_KILLED might be not observed * there, and allocation can fail, causing * restart and new reading of the p_flag. */ fs.m = NULL; if (!vm_page_count_severe() || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 if ((fs.object->flags & OBJ_COLORED) == 0) { fs.object->flags |= OBJ_COLORED; fs.object->pg_color = atop(vaddr) - fs.pindex; } #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs.object->type != OBJT_VNODE && fs.object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; fs.m = vm_page_alloc(fs.object, fs.pindex, alloc_req); } if (fs.m == NULL) { unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; } else if (fs.m->valid == VM_PAGE_BITS_ALL) break; } readrest: /* * We have found a valid page or we have allocated a new page. * The page thus may not be valid or may not be entirely * valid. * * Attempt to fault-in the page if there is a chance that the * pager has it, and potentially fault in additional pages * at the same time. */ if (TRYPAGER) { int rv; u_char behavior = vm_map_entry_behavior(fs.entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { behind = 0; ahead = atop(fs.entry->end - vaddr) - 1; if (ahead > VM_FAULT_READ_AHEAD_MAX) ahead = VM_FAULT_READ_AHEAD_MAX; if (fs.pindex == fs.entry->next_read) vm_fault_cache_behind(&fs, VM_FAULT_READ_MAX); } else { /* * If this is a sequential page fault, then * arithmetically increase the number of pages * in the read-ahead window. Otherwise, reset * the read-ahead window to its smallest size. 
*/ behind = atop(vaddr - fs.entry->start); if (behind > VM_FAULT_READ_BEHIND) behind = VM_FAULT_READ_BEHIND; ahead = atop(fs.entry->end - vaddr) - 1; era = fs.entry->read_ahead; if (fs.pindex == fs.entry->next_read) { nera = era + behind; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; behind = 0; if (ahead > nera) ahead = nera; if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_cache_behind(&fs, VM_FAULT_CACHE_BEHIND); } else if (ahead > VM_FAULT_READ_AHEAD_MIN) ahead = VM_FAULT_READ_AHEAD_MIN; if (era != ahead) fs.entry->read_ahead = ahead; } /* * Call the pager to retrieve the data, if any, after * releasing the lock on the map. We hold a ref on * fs.object and the pages are VPO_BUSY'd. */ unlock_map(&fs); if (fs.object->type == OBJT_VNODE) { vp = fs.object->handle; if (vp == fs.vp) goto vnode_locked; else if (fs.vp != NULL) { vput(fs.vp); fs.vp = NULL; } locked = VOP_ISLOCKED(vp); if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* Do not sleep for vnode lock while fs.m is busy */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error != 0) { vhold(vp); release_page(&fs); unlock_and_deallocate(&fs); error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); vdrop(vp); fs.vp = vp; KASSERT(error == 0, ("vm_fault: vget failed")); goto RetryFault; } fs.vp = vp; } vnode_locked: KASSERT(fs.vp == NULL || !fs.map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * now we find out if any other pages should be paged * in at this time this routine checks to see if the * pages surrounding this fault reside in the same * object as the page for this fault. If they do, * then they are faulted in also into the object. The * array "marray" returned contains an array of * vm_page_t structs where one of them is the * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. * * fs.m plus the additional pages are VPO_BUSY'd. */ faultcount = vm_fault_additional_pages( fs.m, behind, ahead, marray, &reqpage); rv = faultcount ? vm_pager_get_pages(fs.object, marray, faultcount, reqpage) : VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. Leave it busy while we play * with it. */ /* * Relookup in case pager changed page. Pager * is responsible for disposition of old page * if moved. */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (!fs.m) { unlock_and_deallocate(&fs); goto RetryFault; } hardfault++; break; /* break to PAGE HAS BEEN FOUND */ } /* * Remove the bogus page (which does not exist at this * object/offset); before doing so, we must get back * our object lock to preserve our invariant. * * Also wake up any other process that may want to bring * in this page. * * If this is the top-level object, we must leave the * busy page to prevent another process from rushing * past us, and inserting the page in that object at * the same time that we are. */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * Data outside the range of the pager or an I/O error */ /* * XXX - the check for kernel_map is a kludge to work * around having the machine panic on a kernel space * fault w/ I/O error. */ if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); return ((rv == VM_PAGER_ERROR) ? 
KERN_FAILURE : KERN_PROTECTION_FAILURE); } if (fs.object != fs.first_object) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! */ } } /* * We get here if the object has default pager (or unwiring) * or the pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; VM_OBJECT_WLOCK(fs.object); } fs.first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs.m->flags & PG_ZERO) == 0) { pmap_zero_page(fs.m); } else { PCPU_INC(cnt.v_ozfod); } PCPU_INC(cnt.v_zfod); fs.m->valid = VM_PAGE_BITS_ALL; break; /* break to PAGE HAS BEEN FOUND */ } else { KASSERT(fs.object != next_object, ("object loop %p", next_object)); VM_OBJECT_WLOCK(next_object); vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = next_object; } } KASSERT((fs.m->oflags & VPO_BUSY) != 0, ("vm_fault: not busy after main loop")); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { /* * This allows pages to be virtually copied from a * backing_object into the first_object, where the * backing object has no other refs to it, and cannot * gain any more refs. Instead of a bcopy, we just * move the page from the backing object to the * first object. Note that we must mark the page * dirty in the first object so that it will go out * to swap when needed. */ is_first_object_locked = FALSE; if ( /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * No one else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ ((fs.object->type == OBJT_DEFAULT) || (fs.object->type == OBJT_SWAP)) && (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) && /* * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { /* * get rid of the unnecessary page */ vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); /* * grab the page and put it into the * process'es object. The page is * automatically made dirty. */ vm_page_lock(fs.m); vm_page_rename(fs.m, fs.first_object, fs.first_pindex); vm_page_unlock(fs.m); vm_page_busy(fs.m); fs.first_m = fs.m; fs.m = NULL; PCPU_INC(cnt.v_cow_optim); } else { /* * Oh, well, lets copy it. 
*/ pmap_copy_page(fs.m, fs.first_m); fs.first_m->valid = VM_PAGE_BITS_ALL; if (wired && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) { vm_page_lock(fs.first_m); vm_page_wire(fs.first_m); vm_page_unlock(fs.first_m); vm_page_lock(fs.m); vm_page_unwire(fs.m, FALSE); vm_page_unlock(fs.m); } /* * We no longer need the old page or object. */ release_page(&fs); } /* * fs.object != fs.first_object due to above * conditional */ vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); /* * Only use the new page below... */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; if (!is_first_object_locked) VM_OBJECT_WLOCK(fs.object); PCPU_INC(cnt.v_cow_faults); curthread->td_cow++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } fs.lookup_still_valid = TRUE; if (fs.map->timestamp != map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); /* * If we don't need the page any longer, put it on the inactive * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) goto RetryFault; return (result); } if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } } /* * If the page was filled by a pager, update the map entry's * last read offset. Since the pager does not return the * actual set of pages that it read, this update is based on * the requested set. Typically, the requested and actual * sets are the same. * * XXX The following assignment modifies the map * without holding a write lock on it. */ if (hardfault) fs.entry->next_read = fs.pindex + faultcount - reqpage; if ((prot & VM_PROT_WRITE) != 0 || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_object_set_writeable_dirty(fs.object); /* * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if (fs.entry->eflags & MAP_ENTRY_NOSYNC) { if (fs.m->dirty == 0) fs.m->oflags |= VPO_NOSYNC; } else { fs.m->oflags &= ~VPO_NOSYNC; } /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also tell the backing pager, if any, that it should remove * any swap backing since the page is now dirty. 
*/ if (((fault_type & VM_PROT_WRITE) != 0 && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_page_dirty(fs.m); vm_pager_page_unswapped(fs.m); } } /* * Page had better still be busy */ KASSERT(fs.m->oflags & VPO_BUSY, ("vm_fault: page %p not busy!", fs.m)); /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ KASSERT(fs.m->valid == VM_PAGE_BITS_ALL, ("vm_fault: page %p partially invalid", fs.m)); VM_OBJECT_WUNLOCK(fs.object); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired); if ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 && wired == 0) vm_fault_prefault(fs.map->pmap, vaddr, fs.entry); VM_OBJECT_WLOCK(fs.object); vm_page_lock(fs.m); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if (fault_flags & VM_FAULT_CHANGE_WIRING) { if (wired) vm_page_wire(fs.m); else vm_page_unwire(fs.m, 1); } else vm_page_activate(fs.m); if (m_hold != NULL) { *m_hold = fs.m; vm_page_hold(fs.m); } vm_page_unlock(fs.m); vm_page_wakeup(fs.m); /* * Unlock everything, and return */ unlock_and_deallocate(&fs); if (hardfault) { PCPU_INC(cnt.v_io_faults); curthread->td_ru.ru_majflt++; } else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); }
/* * Routine: * vm_fault_copy_entry * Function: * Create new shadow object backing dst_entry with private copy of * all underlying pages. When src_entry is equal to dst_entry, * function implements COW for wired-down map entry. Otherwise, * it forks wired entry into dst_map. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, vm_map_entry_t dst_entry, vm_map_entry_t src_entry, vm_ooffset_t *fork_charge) { vm_object_t backing_object, dst_object, object, src_object; vm_pindex_t dst_pindex, pindex, src_pindex; vm_prot_t access, prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; boolean_t src_readonly, upgrade; #ifdef lint src_map++; #endif /* lint */ upgrade = src_entry == dst_entry; src_object = src_entry->object.vm_object; src_pindex = OFF_TO_IDX(src_entry->offset); src_readonly = (src_entry->protection & VM_PROT_WRITE) == 0; /* * Create the top-level object for the destination entry. (Doesn't * actually shadow anything - we copy the pages directly.) */ dst_object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(dst_entry->end - dst_entry->start)); #if VM_NRESERVLEVEL > 0 dst_object->flags |= OBJ_COLORED; dst_object->pg_color = atop(dst_entry->start); #endif VM_OBJECT_WLOCK(dst_object); KASSERT(upgrade || dst_entry->object.vm_object == NULL, ("vm_fault_copy_entry: vm_object not NULL")); dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; dst_object->charge = dst_entry->end - dst_entry->start; if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, ("vm_fault_copy_entry: leaked swp charge")); dst_object->cred = curthread->td_ucred; crhold(dst_object->cred); *fork_charge += dst_object->charge; } else { dst_object->cred = dst_entry->cred; dst_entry->cred = NULL; } access = prot = dst_entry->protection; /* * If not an upgrade, then enter the mappings in the pmap as * read and/or execute accesses. Otherwise, enter them as * write accesses. * * A writeable large page mapping is only created if all of * the constituent small page mappings are modified. Marking * PTEs as modified on inception allows promotion to happen * without taking potentially large number of soft faults. */ if (!upgrade) access &= ~VM_PROT_WRITE; /* * Loop through all of the virtual pages within the entry's * range, copying each page from the source object to the * destination object. Since the source is wired, those pages * must exist. In contrast, the destination is pageable. * Since the destination object does share any backing storage * with the source object, all of its pages must be dirtied, * regardless of whether they can be written. */ for (vaddr = dst_entry->start, dst_pindex = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_pindex++) { /* * Allocate a page in the destination object. */ do { dst_m = vm_page_alloc(dst_object, dst_pindex, VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_OBJECT_WUNLOCK(dst_object); VM_WAIT; VM_OBJECT_WLOCK(dst_object); } } while (dst_m == NULL); /* * Find the page in the source object, and copy it in. * (Because the source is wired down, the page will be in * memory.) */ VM_OBJECT_WLOCK(src_object); object = src_object; pindex = src_pindex + dst_pindex; while ((src_m = vm_page_lookup(object, pindex)) == NULL && src_readonly && (backing_object = object->backing_object) != NULL) { /* * Allow fallback to backing objects if we are reading. 
*/ VM_OBJECT_WLOCK(backing_object); pindex += OFF_TO_IDX(object->backing_object_offset); VM_OBJECT_WUNLOCK(object); object = backing_object; } if (src_m == NULL) panic("vm_fault_copy_wired: page missing"); pmap_copy_page(src_m, dst_m); VM_OBJECT_WUNLOCK(object); dst_m->valid = VM_PAGE_BITS_ALL; dst_m->dirty = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(dst_object); /* * Enter it in the pmap. If a wired, copy-on-write * mapping is being replaced by a write-enabled * mapping, then wire that new mapping. */ pmap_enter(dst_map->pmap, vaddr, access, dst_m, prot, upgrade); /* * Mark it no longer busy, and put it on the active list. */ VM_OBJECT_WLOCK(dst_object); if (upgrade) { vm_page_lock(src_m); vm_page_unwire(src_m, 0); vm_page_unlock(src_m); vm_page_lock(dst_m); vm_page_wire(dst_m); vm_page_unlock(dst_m); } else { vm_page_lock(dst_m); vm_page_activate(dst_m); vm_page_unlock(dst_m); } vm_page_wakeup(dst_m); } VM_OBJECT_WUNLOCK(dst_object); if (upgrade) { dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY); vm_object_deallocate(src_object); } }
/*
 * A VFS can call this function to try to dispose of a read request
 * directly from the VM system, pretty much bypassing almost all VFS
 * overhead except for atime updates.
 *
 * If 0 is returned some or all of the uio was handled.  The caller must
 * check the uio and handle the remainder.
 *
 * The caller must fail on a non-zero error.
 */
int
vop_helper_read_shortcut(struct vop_read_args *ap)
{
	struct vnode *vp;
	struct uio *uio;
	struct lwbuf *lwb;
	struct lwbuf lwb_cache;
	vm_object_t obj;
	vm_page_t m;
	int offset;
	int n;
	int error;

	vp = ap->a_vp;
	uio = ap->a_uio;

	/*
	 * We can't short-cut if there is no VM object or this is a special
	 * UIO_NOCOPY read (typically from VOP_STRATEGY()).  We also can't
	 * do this if we cannot extract the filesize from the vnode.
	 */
	if (vm_read_shortcut_enable == 0)
		return(0);
	if (vp->v_object == NULL || uio->uio_segflg == UIO_NOCOPY)
		return(0);
	if (vp->v_filesize == NOOFFSET)
		return(0);
	if (uio->uio_resid == 0)
		return(0);

	/*
	 * Iterate the uio on a page-by-page basis.
	 *
	 * XXX can we leave the object held shared during the uiomove()?
	 */
	++vm_read_shortcut_count;
	obj = vp->v_object;
	vm_object_hold_shared(obj);

	error = 0;
	while (uio->uio_resid && error == 0) {
		offset = (int)uio->uio_offset & PAGE_MASK;
		n = PAGE_SIZE - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (vp->v_filesize < uio->uio_offset)
			break;
		if (uio->uio_offset + n > vp->v_filesize)
			n = vp->v_filesize - uio->uio_offset;
		if (n == 0)
			break;	/* hit EOF */

		m = vm_page_lookup_busy_try(obj, OFF_TO_IDX(uio->uio_offset),
		    FALSE, &error);
		if (error || m == NULL) {
			++vm_read_shortcut_failed;
			error = 0;
			break;
		}
		if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
			++vm_read_shortcut_failed;
			vm_page_wakeup(m);
			break;
		}
		lwb = lwbuf_alloc(m, &lwb_cache);

		/*
		 * Use a no-fault uiomove() to avoid deadlocking against
		 * our VM object (which could livelock on the same object
		 * due to shared-vs-exclusive), or deadlocking against
		 * our busied page.  Returns EFAULT on any fault which
		 * winds up diving a vnode.
		 */
		error = uiomove_nofault((char *)lwbuf_kva(lwb) + offset,
		    n, uio);

		vm_page_flag_set(m, PG_REFERENCED);
		lwbuf_free(lwb);
		vm_page_wakeup(m);
	}
	vm_object_drop(obj);

	/*
	 * Ignore EFAULT since we used uiomove_nofault(); it causes the
	 * caller to fall back to the normal code path for this case.
	 */
	if (error == EFAULT)
		error = 0;

	return (error);
}
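/*
 * Hedged caller sketch (not part of the original source): per the contract
 * documented above, a filesystem's VOP_READ can try the shortcut first and
 * fall back to its normal buffered path for whatever part of the uio
 * remains.  "myfs_read" and "myfs_read_uncached" are hypothetical names.
 */
static int
myfs_read(struct vop_read_args *ap)
{
	int error;

	error = vop_helper_read_shortcut(ap);
	if (error)
		return (error);		/* caller must fail on non-zero error */
	if (ap->a_uio->uio_resid == 0)
		return (0);		/* the shortcut handled everything */
	return (myfs_read_uncached(ap));	/* hypothetical fallback path */
}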
static int
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
	vm_object_t object;
	vm_page_t m, ma[1];
	vm_pindex_t idx, nobjsize;
	vm_ooffset_t delta;
	int base, rv;

	object = shmfd->shm_object;
	VM_OBJECT_LOCK(object);
	if (length == shmfd->shm_size) {
		VM_OBJECT_UNLOCK(object);
		return (0);
	}
	nobjsize = OFF_TO_IDX(length + PAGE_MASK);

	/* Are we shrinking?  If so, trim the end. */
	if (length < shmfd->shm_size) {
		/*
		 * Disallow any requests to shrink the size if this
		 * object is mapped into the kernel.
		 */
		if (shmfd->shm_kmappings > 0) {
			VM_OBJECT_UNLOCK(object);
			return (EBUSY);
		}

		/*
		 * Zero the truncated part of the last page.
		 */
		base = length & PAGE_MASK;
		if (base != 0) {
			idx = OFF_TO_IDX(length);
retry:
			m = vm_page_lookup(object, idx);
			if (m != NULL) {
				if ((m->oflags & VPO_BUSY) != 0 ||
				    m->busy != 0) {
					vm_page_sleep(m, "shmtrc");
					goto retry;
				}
			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
				m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL);
				if (m == NULL) {
					VM_OBJECT_UNLOCK(object);
					VM_WAIT;
					VM_OBJECT_LOCK(object);
					goto retry;
				} else if (m->valid != VM_PAGE_BITS_ALL) {
					ma[0] = m;
					rv = vm_pager_get_pages(object, ma,
					    1, 0);
					m = vm_page_lookup(object, idx);
				} else
					/* A cached page was reactivated. */
					rv = VM_PAGER_OK;
				vm_page_lock(m);
				if (rv == VM_PAGER_OK) {
					vm_page_deactivate(m);
					vm_page_unlock(m);
					vm_page_wakeup(m);
				} else {
					vm_page_free(m);
					vm_page_unlock(m);
					VM_OBJECT_UNLOCK(object);
					return (EIO);
				}
			}
			if (m != NULL) {
				pmap_zero_page_area(m, base, PAGE_SIZE - base);
				KASSERT(m->valid == VM_PAGE_BITS_ALL,
				    ("shm_dotruncate: page %p is invalid", m));
				vm_page_dirty(m);
				vm_pager_page_unswapped(m);
			}
		}
		delta = ptoa(object->size - nobjsize);

		/* Toss in memory pages. */
		if (nobjsize < object->size)
			vm_object_page_remove(object, nobjsize, object->size,
			    0);

		/* Toss pages from swap. */
		if (object->type == OBJT_SWAP)
			swap_pager_freespace(object, nobjsize, delta);

		/* Free the swap accounted for shm */
		swap_release_by_cred(delta, object->cred);
		object->charge -= delta;
	} else {
		/* Attempt to reserve the swap */
		delta = ptoa(nobjsize - object->size);
		if (!swap_reserve_by_cred(delta, object->cred)) {
			VM_OBJECT_UNLOCK(object);
			return (ENOMEM);
		}
		object->charge += delta;
	}
	shmfd->shm_size = length;
	mtx_lock(&shm_timestamp_lock);
	vfs_timestamp(&shmfd->shm_ctime);
	shmfd->shm_mtime = shmfd->shm_ctime;
	mtx_unlock(&shm_timestamp_lock);
	object->size = nobjsize;
	VM_OBJECT_UNLOCK(object);
	return (0);
}
static int
tmpfs_mappedread(vm_object_t vobj, vm_object_t tobj, size_t len,
    struct uio *uio)
{
	struct sf_buf *sf;
	vm_pindex_t idx;
	vm_page_t m;
	vm_offset_t offset;
	off_t addr;
	size_t tlen;
	char *ma;
	int error;

	addr = uio->uio_offset;
	idx = OFF_TO_IDX(addr);
	offset = addr & PAGE_MASK;
	tlen = MIN(PAGE_SIZE - offset, len);

	if ((vobj == NULL) ||
	    (vobj->resident_page_count == 0 && vobj->cache == NULL))
		goto nocache;

	VM_OBJECT_LOCK(vobj);
lookupvpg:
	if (((m = vm_page_lookup(vobj, idx)) != NULL) &&
	    vm_page_is_valid(m, offset, tlen)) {
		if ((m->oflags & VPO_BUSY) != 0) {
			/*
			 * Reference the page before unlocking and sleeping so
			 * that the page daemon is less likely to reclaim it.
			 */
			vm_page_reference(m);
			vm_page_sleep(m, "tmfsmr");
			goto lookupvpg;
		}
		vm_page_busy(m);
		VM_OBJECT_UNLOCK(vobj);
		error = uiomove_fromphys(&m, offset, tlen, uio);
		VM_OBJECT_LOCK(vobj);
		vm_page_wakeup(m);
		VM_OBJECT_UNLOCK(vobj);
		return (error);
	} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
		KASSERT(offset == 0,
		    ("unexpected offset in tmpfs_mappedread for sendfile"));
		if ((m->oflags & VPO_BUSY) != 0) {
			/*
			 * Reference the page before unlocking and sleeping so
			 * that the page daemon is less likely to reclaim it.
			 */
			vm_page_reference(m);
			vm_page_sleep(m, "tmfsmr");
			goto lookupvpg;
		}
		vm_page_busy(m);
		VM_OBJECT_UNLOCK(vobj);
		sched_pin();
		sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
		ma = (char *)sf_buf_kva(sf);
		error = tmpfs_nocacheread_buf(tobj, idx, 0, tlen, ma);
		if (error == 0) {
			if (tlen != PAGE_SIZE)
				bzero(ma + tlen, PAGE_SIZE - tlen);
			uio->uio_offset += tlen;
			uio->uio_resid -= tlen;
		}
		sf_buf_free(sf);
		sched_unpin();
		VM_OBJECT_LOCK(vobj);
		if (error == 0)
			m->valid = VM_PAGE_BITS_ALL;
		vm_page_wakeup(m);
		VM_OBJECT_UNLOCK(vobj);
		return (error);
	}
	VM_OBJECT_UNLOCK(vobj);
nocache:
	error = tmpfs_nocacheread(tobj, idx, offset, tlen, uio);
	return (error);
}
static int
tmpfs_mappedwrite(vm_object_t vobj, vm_object_t tobj, size_t len,
    struct uio *uio)
{
	vm_pindex_t idx;
	vm_page_t vpg, tpg;
	vm_offset_t offset;
	off_t addr;
	size_t tlen;
	int error, rv;

	error = 0;

	addr = uio->uio_offset;
	idx = OFF_TO_IDX(addr);
	offset = addr & PAGE_MASK;
	tlen = MIN(PAGE_SIZE - offset, len);

	if ((vobj == NULL) ||
	    (vobj->resident_page_count == 0 && vobj->cache == NULL)) {
		vpg = NULL;
		goto nocache;
	}

	VM_OBJECT_LOCK(vobj);
lookupvpg:
	if (((vpg = vm_page_lookup(vobj, idx)) != NULL) &&
	    vm_page_is_valid(vpg, offset, tlen)) {
		if ((vpg->oflags & VPO_BUSY) != 0) {
			/*
			 * Reference the page before unlocking and sleeping so
			 * that the page daemon is less likely to reclaim it.
			 */
			vm_page_reference(vpg);
			vm_page_sleep(vpg, "tmfsmw");
			goto lookupvpg;
		}
		vm_page_busy(vpg);
		vm_page_undirty(vpg);
		VM_OBJECT_UNLOCK(vobj);
		error = uiomove_fromphys(&vpg, offset, tlen, uio);
	} else {
		if (__predict_false(vobj->cache != NULL))
			vm_page_cache_free(vobj, idx, idx + 1);
		VM_OBJECT_UNLOCK(vobj);
		vpg = NULL;
	}
nocache:
	VM_OBJECT_LOCK(tobj);
	tpg = vm_page_grab(tobj, idx, VM_ALLOC_WIRED | VM_ALLOC_NORMAL |
	    VM_ALLOC_RETRY);
	if (tpg->valid != VM_PAGE_BITS_ALL) {
		if (vm_pager_has_page(tobj, idx, NULL, NULL)) {
			rv = vm_pager_get_pages(tobj, &tpg, 1, 0);
			if (rv != VM_PAGER_OK) {
				vm_page_lock(tpg);
				vm_page_free(tpg);
				vm_page_unlock(tpg);
				error = EIO;
				goto out;
			}
		} else
			vm_page_zero_invalid(tpg, TRUE);
	}
	VM_OBJECT_UNLOCK(tobj);
	if (vpg == NULL)
		error = uiomove_fromphys(&tpg, offset, tlen, uio);
	else {
		KASSERT(vpg->valid == VM_PAGE_BITS_ALL,
		    ("parts of vpg invalid"));
		pmap_copy_page(vpg, tpg);
	}
	VM_OBJECT_LOCK(tobj);
	if (error == 0) {
		KASSERT(tpg->valid == VM_PAGE_BITS_ALL,
		    ("parts of tpg invalid"));
		vm_page_dirty(tpg);
	}
	vm_page_lock(tpg);
	vm_page_unwire(tpg, TRUE);
	vm_page_unlock(tpg);
	vm_page_wakeup(tpg);
out:
	VM_OBJECT_UNLOCK(tobj);
	if (vpg != NULL) {
		VM_OBJECT_LOCK(vobj);
		vm_page_wakeup(vpg);
		VM_OBJECT_UNLOCK(vobj);
	}
	return (error);
}
/* * vm_contig_pg_alloc: * * Allocate contiguous pages from the VM. This function does not * map the allocated pages into the kernel map, otherwise it is * impossible to make large allocations (i.e. >2G). * * Malloc()'s data structures have been used for collection of * statistics and for allocations of less than a page. */ static int vm_contig_pg_alloc(unsigned long size, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, unsigned long boundary, int mflags) { int i, q, start, pass; vm_offset_t phys; vm_page_t pga = vm_page_array; vm_page_t m; int pqtype; size = round_page(size); if (size == 0) panic("vm_contig_pg_alloc: size must not be 0"); if ((alignment & (alignment - 1)) != 0) panic("vm_contig_pg_alloc: alignment must be a power of 2"); if ((boundary & (boundary - 1)) != 0) panic("vm_contig_pg_alloc: boundary must be a power of 2"); /* * See if we can get the pages from the contiguous page reserve * alist. The returned pages will be allocated and wired but not * busied. */ m = vm_page_alloc_contig(low, high, alignment, boundary, size); if (m) return (m - &pga[0]); /* * Three passes (0, 1, 2). Each pass scans the VM page list for * free or cached pages. After each pass if the entire scan failed * we attempt to flush inactive pages and reset the start index back * to 0. For passes 1 and 2 we also attempt to flush active pages. */ start = 0; for (pass = 0; pass < 3; pass++) { /* * Find first page in array that is free, within range, * aligned, and such that the boundary won't be crossed. */ again: for (i = start; i < vmstats.v_page_count; i++) { m = &pga[i]; phys = VM_PAGE_TO_PHYS(m); pqtype = m->queue - m->pc; if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) && (phys >= low) && (phys < high) && ((phys & (alignment - 1)) == 0) && (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0) && m->busy == 0 && m->wire_count == 0 && m->hold_count == 0 && (m->flags & (PG_BUSY | PG_NEED_COMMIT)) == 0) { break; } } /* * If we cannot find the page in the given range, or we have * crossed the boundary, call the vm_contig_pg_clean() function * for flushing out the queues, and returning it back to * normal state. */ if ((i == vmstats.v_page_count) || ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) { /* * Best effort flush of all inactive pages. * This is quite quick, for now stall all * callers, even if they've specified M_NOWAIT. */ for (q = 0; q < PQ_L2_SIZE; ++q) { vm_contig_pg_clean(PQ_INACTIVE + q, vmstats.v_inactive_count); lwkt_yield(); } /* * Best effort flush of active pages. * * This is very, very slow. * Only do this if the caller has agreed to M_WAITOK. * * If enough pages are flushed, we may succeed on * next (final) pass, if not the caller, contigmalloc(), * will fail in the index < 0 case. */ if (pass > 0 && (mflags & M_WAITOK)) { for (q = 0; q < PQ_L2_SIZE; ++q) { vm_contig_pg_clean(PQ_ACTIVE + q, vmstats.v_active_count); } lwkt_yield(); } /* * We're already too high in the address space * to succeed, reset to 0 for the next iteration. */ start = 0; continue; /* next pass */ } start = i; /* * Check successive pages for contiguous and free. * * (still in critical section) */ for (i = start + 1; i < (start + size / PAGE_SIZE); i++) { m = &pga[i]; pqtype = m->queue - m->pc; if ((VM_PAGE_TO_PHYS(&m[0]) != (VM_PAGE_TO_PHYS(&m[-1]) + PAGE_SIZE)) || ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE)) || m->busy || m->wire_count || m->hold_count || (m->flags & (PG_BUSY | PG_NEED_COMMIT))) { start++; goto again; } } /* * Try to allocate the pages, wiring them as we go. 
* * (still in critical section) */ for (i = start; i < (start + size / PAGE_SIZE); i++) { m = &pga[i]; if (vm_page_busy_try(m, TRUE)) { vm_contig_pg_free(start, (i - start) * PAGE_SIZE); start++; goto again; } pqtype = m->queue - m->pc; if (pqtype == PQ_CACHE && m->hold_count == 0 && m->wire_count == 0 && (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) == 0) { vm_page_protect(m, VM_PROT_NONE); KKASSERT((m->flags & PG_MAPPED) == 0); KKASSERT(m->dirty == 0); vm_page_free(m); --i; continue; /* retry the page */ } if (pqtype != PQ_FREE || m->hold_count) { vm_page_wakeup(m); vm_contig_pg_free(start, (i - start) * PAGE_SIZE); start++; goto again; } KKASSERT((m->valid & m->dirty) == 0); KKASSERT(m->wire_count == 0); KKASSERT(m->object == NULL); vm_page_unqueue_nowakeup(m); m->valid = VM_PAGE_BITS_ALL; if (m->flags & PG_ZERO) vm_page_zero_count--; KASSERT(m->dirty == 0, ("vm_contig_pg_alloc: page %p was dirty", m)); KKASSERT(m->wire_count == 0); KKASSERT(m->busy == 0); /* * Clear all flags except PG_BUSY, PG_ZERO, and * PG_WANTED, then unbusy the now allocated page. */ vm_page_flag_clear(m, ~(PG_BUSY | PG_SBUSY | PG_ZERO | PG_WANTED)); vm_page_wire(m); vm_page_wakeup(m); } /* * Our job is done, return the index page of vm_page_array. */ return (start); /* aka &pga[start] */ } /* * Failed. */ return (-1); }
/*
 * vm_contig_pg_clean:
 *
 * Do a thorough cleanup of the specified 'queue', which can be either
 * PQ_ACTIVE or PQ_INACTIVE, by doing a walkthrough.  If the page is not
 * marked dirty, it is shoved into the page cache, provided no one has
 * currently acquired it; otherwise localized action per object type
 * is taken for cleanup:
 *
 *	In the OBJT_VNODE case, the whole page range is cleaned up
 *	using the vm_object_page_clean() routine, by specifying a
 *	start and end of '0'.
 *
 *	Otherwise if the object is of any other type, the generic
 *	pageout (daemon) flush routine is invoked.
 */
static void
vm_contig_pg_clean(int queue, int count)
{
	vm_object_t object;
	vm_page_t m, m_tmp;
	struct vm_page marker;
	struct vpgqueues *pq = &vm_page_queues[queue];

	/*
	 * Setup a local marker
	 */
	bzero(&marker, sizeof(marker));
	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
	marker.queue = queue;
	marker.wire_count = 1;

	vm_page_queues_spin_lock(queue);
	TAILQ_INSERT_HEAD(&pq->pl, &marker, pageq);
	vm_page_queues_spin_unlock(queue);

	/*
	 * Iterate the queue.  Note that the vm_page spinlock must be
	 * acquired before the pageq spinlock so it's easiest to simply
	 * not hold it in the loop iteration.
	 */
	while (count-- > 0 && (m = TAILQ_NEXT(&marker, pageq)) != NULL) {
		vm_page_and_queue_spin_lock(m);
		if (m != TAILQ_NEXT(&marker, pageq)) {
			vm_page_and_queue_spin_unlock(m);
			++count;
			continue;
		}
		KKASSERT(m->queue == queue);

		TAILQ_REMOVE(&pq->pl, &marker, pageq);
		TAILQ_INSERT_AFTER(&pq->pl, m, &marker, pageq);

		if (m->flags & PG_MARKER) {
			vm_page_and_queue_spin_unlock(m);
			continue;
		}
		if (vm_page_busy_try(m, TRUE)) {
			vm_page_and_queue_spin_unlock(m);
			continue;
		}
		vm_page_and_queue_spin_unlock(m);

		/*
		 * We've successfully busied the page
		 */
		if (m->queue - m->pc != queue) {
			vm_page_wakeup(m);
			continue;
		}
		if (m->wire_count || m->hold_count) {
			vm_page_wakeup(m);
			continue;
		}
		if ((object = m->object) == NULL) {
			vm_page_wakeup(m);
			continue;
		}
		vm_page_test_dirty(m);
		if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
			vm_object_hold(object);
			KKASSERT(m->object == object);

			if (object->type == OBJT_VNODE) {
				vm_page_wakeup(m);
				vn_lock(object->handle, LK_EXCLUSIVE|LK_RETRY);
				vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
				vn_unlock(((struct vnode *)object->handle));
			} else if (object->type == OBJT_SWAP ||
				   object->type == OBJT_DEFAULT) {
				m_tmp = m;
				vm_pageout_flush(&m_tmp, 1, 0);
			} else {
				vm_page_wakeup(m);
			}
			vm_object_drop(object);
		} else if (m->hold_count == 0) {
			vm_page_cache(m);
		} else {
			vm_page_wakeup(m);
		}
	}

	/*
	 * Scrap our local marker
	 */
	vm_page_queues_spin_lock(queue);
	TAILQ_REMOVE(&pq->pl, &marker, pageq);
	vm_page_queues_spin_unlock(queue);
}
/*
 * Lets the VM system know about a change in size for a file.
 * We adjust our own internal size and flush any cached pages in
 * the associated object that are affected by the size change.
 *
 * NOTE: This routine may be invoked as a result of a pager put
 * operation (possibly at object termination time), so we must be careful.
 *
 * NOTE: vp->v_filesize is initialized to NOOFFSET (-1), be sure that
 * we do not blow up on the case.  nsize will always be >= 0, however.
 */
void
vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize)
{
	vm_pindex_t nobjsize;
	vm_pindex_t oobjsize;
	vm_object_t object;

	object = vp->v_object;
	if (object == NULL)
		return;
	vm_object_hold(object);
	KKASSERT(vp->v_object == object);

	/*
	 * Hasn't changed size
	 */
	if (nsize == vp->v_filesize) {
		vm_object_drop(object);
		return;
	}

	/*
	 * Has changed size.  Adjust the VM object's size and v_filesize
	 * before we start scanning pages to prevent new pages from being
	 * allocated during the scan.
	 */
	nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
	oobjsize = object->size;
	object->size = nobjsize;

	/*
	 * File has shrunk.  Toss any cached pages beyond the new EOF.
	 */
	if (nsize < vp->v_filesize) {
		vp->v_filesize = nsize;
		if (nobjsize < oobjsize) {
			vm_object_page_remove(object, nobjsize, oobjsize,
			    FALSE);
		}
		/*
		 * This gets rid of garbage at the end of a page that is now
		 * only partially backed by the vnode.  Since we are setting
		 * the entire page valid & clean after we are done we have
		 * to be sure that the portion of the page within the file
		 * bounds is already valid.  If it isn't then making it
		 * valid would create a corrupt block.
		 */
		if (nsize & PAGE_MASK) {
			vm_offset_t kva;
			vm_page_t m;

			m = vm_page_lookup_busy_wait(object, OFF_TO_IDX(nsize),
			    TRUE, "vsetsz");

			if (m && m->valid) {
				int base = (int)nsize & PAGE_MASK;
				int size = PAGE_SIZE - base;
				struct lwbuf *lwb;
				struct lwbuf lwb_cache;

				/*
				 * Clear out partial-page garbage in case
				 * the page has been mapped.
				 *
				 * This is byte aligned.
				 */
				lwb = lwbuf_alloc(m, &lwb_cache);
				kva = lwbuf_kva(lwb);
				bzero((caddr_t)kva + base, size);
				lwbuf_free(lwb);

				/*
				 * XXX work around SMP data integrity race
				 * by unmapping the page from user processes.
				 * The garbage we just cleared may be mapped
				 * to a user process running on another cpu
				 * and this code is not running through normal
				 * I/O channels which handle SMP issues for
				 * us, so unmap page to synchronize all cpus.
				 *
				 * XXX should vm_pager_unmap_page() have
				 * dealt with this?
				 */
				vm_page_protect(m, VM_PROT_NONE);

				/*
				 * Clear out partial-page dirty bits.  This
				 * has the side effect of setting the valid
				 * bits, but that is ok.  There are a bunch
				 * of places in the VM system where we expected
				 * m->dirty == VM_PAGE_BITS_ALL.  The file EOF
				 * case is one of them.  If the page is still
				 * partially dirty, make it fully dirty.
				 *
				 * NOTE: We do not clear out the valid
				 *	 bits.  This would prevent bogus_page
				 *	 replacement from working properly.
				 *
				 * NOTE: We do not want to clear the dirty
				 *	 bit for a partial DEV_BSIZE'd
				 *	 truncation!  This is DEV_BSIZE
				 *	 aligned!
				 */
				vm_page_clear_dirty_beg_nonincl(m, base, size);
				if (m->dirty != 0)
					m->dirty = VM_PAGE_BITS_ALL;
				vm_page_wakeup(m);
			} else if (m) {
				vm_page_wakeup(m);
			}
		}
	} else {
		vp->v_filesize = nsize;
	}
	vm_object_drop(object);
}
/*
 * spec_getpages() - get pages associated with device vnode.
 *
 * Note that spec_read and spec_write do not use the buffer cache, so we
 * must fully implement getpages here.
 */
static int
devfs_spec_getpages(struct vop_getpages_args *ap)
{
	vm_offset_t kva;
	int error;
	int i, pcount, size;
	struct buf *bp;
	vm_page_t m;
	vm_ooffset_t offset;
	int toff, nextoff, nread;
	struct vnode *vp = ap->a_vp;
	int blksiz;
	int gotreqpage;

	error = 0;
	pcount = round_page(ap->a_count) / PAGE_SIZE;

	/*
	 * Calculate the offset of the transfer and do sanity check.
	 */
	offset = IDX_TO_OFF(ap->a_m[0]->pindex) + ap->a_offset;

	/*
	 * Round up physical size for real devices.  We cannot round using
	 * v_mount's block size data because v_mount has nothing to do with
	 * the device.  i.e. it's usually '/dev'.  We need the physical block
	 * size for the device itself.
	 *
	 * We can't use v_rdev->si_mountpoint because it only exists when the
	 * block device is mounted.  However, we can use v_rdev.
	 */
	if (vn_isdisk(vp, NULL))
		blksiz = vp->v_rdev->si_bsize_phys;
	else
		blksiz = DEV_BSIZE;

	size = (ap->a_count + blksiz - 1) & ~(blksiz - 1);

	bp = getpbuf_kva(NULL);
	kva = (vm_offset_t)bp->b_data;

	/*
	 * Map the pages to be read into the kva.
	 */
	pmap_qenter(kva, ap->a_m, pcount);

	/* Build a minimal buffer header. */
	bp->b_cmd = BUF_CMD_READ;
	bp->b_bcount = size;
	bp->b_resid = 0;
	bsetrunningbufspace(bp, size);

	bp->b_bio1.bio_offset = offset;
	bp->b_bio1.bio_done = devfs_spec_getpages_iodone;

	mycpu->gd_cnt.v_vnodein++;
	mycpu->gd_cnt.v_vnodepgsin += pcount;

	/* Do the input. */
	vn_strategy(ap->a_vp, &bp->b_bio1);

	crit_enter();

	/* We definitely need to be at splbio here. */
	while (bp->b_cmd != BUF_CMD_DONE)
		tsleep(bp, 0, "spread", 0);

	crit_exit();

	if (bp->b_flags & B_ERROR) {
		if (bp->b_error)
			error = bp->b_error;
		else
			error = EIO;
	}

	/*
	 * If EOF is encountered we must zero-extend the result in order
	 * to ensure that the page does not contain garbage.  When no
	 * error occurs, an early EOF is indicated if b_bcount got truncated.
	 * b_resid is relative to b_bcount and should be 0, but some devices
	 * might indicate an EOF with b_resid instead of truncating b_bcount.
	 */
	nread = bp->b_bcount - bp->b_resid;
	if (nread < ap->a_count)
		bzero((caddr_t)kva + nread, ap->a_count - nread);
	pmap_qremove(kva, pcount);

	gotreqpage = 0;
	for (i = 0, toff = 0; i < pcount; i++, toff = nextoff) {
		nextoff = toff + PAGE_SIZE;
		m = ap->a_m[i];

		m->flags &= ~PG_ZERO;

		/*
		 * NOTE: vm_page_undirty/clear_dirty etc do not clear the
		 *	 pmap modified bit.  pmap modified bit should have
		 *	 already been cleared.
		 */
		if (nextoff <= nread) {
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (toff < nread) {
			/*
			 * Since this is a VM request, we have to supply the
			 * unaligned offset to allow vm_page_set_valid()
			 * to zero sub-DEV_BSIZE'd portions of the page.
			 */
			vm_page_set_valid(m, 0, nread - toff);
			vm_page_clear_dirty_end_nonincl(m, 0, nread - toff);
		} else {
			m->valid = 0;
			vm_page_undirty(m);
		}

		if (i != ap->a_reqpage) {
			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error || (m->valid == VM_PAGE_BITS_ALL)) {
				if (m->valid) {
					if (m->flags & PG_REFERENCED) {
						vm_page_activate(m);
					} else {
						vm_page_deactivate(m);
					}
					vm_page_wakeup(m);
				} else {
					vm_page_free(m);
				}
			} else {
				vm_page_free(m);
			}
		} else if (m->valid) {
			gotreqpage = 1;
			/*
			 * Since this is a VM request, we need to make the
			 * entire page presentable by zeroing invalid sections.
			 */
			if (m->valid != VM_PAGE_BITS_ALL)
				vm_page_zero_invalid(m, FALSE);
		}
	}
	if (!gotreqpage) {
		m = ap->a_m[ap->a_reqpage];
		devfs_debug(DEVFS_DEBUG_WARNING,
		    "spec_getpages:(%s) I/O read failure: (error=%d) bp %p vp %p\n",
		    devtoname(vp->v_rdev), error, bp, bp->b_vp);
		devfs_debug(DEVFS_DEBUG_WARNING,
		    "    size: %d, resid: %d, a_count: %d, valid: 0x%x\n",
		    size, bp->b_resid, ap->a_count, m->valid);
		devfs_debug(DEVFS_DEBUG_WARNING,
		    "    nread: %d, reqpage: %d, pindex: %lu, pcount: %d\n",
		    nread, ap->a_reqpage, (u_long)m->pindex, pcount);
		/*
		 * Free the buffer header back to the swap buffer pool.
		 */
		relpbuf(bp, NULL);
		return VM_PAGER_ERROR;
	}
	/*
	 * Free the buffer header back to the swap buffer pool.
	 */
	relpbuf(bp, NULL);
	if (DEVFS_NODE(ap->a_vp))
		nanotime(&DEVFS_NODE(ap->a_vp)->mtime);
	return VM_PAGER_OK;
}
/*
 * Vnode op for VM getpages.
 * Wish list: get rid of the multiple I/O routines.
 *
 * smbfs_getpages(struct vnode *a_vp, vm_page_t *a_m, int a_count,
 *		  int a_reqpage, vm_ooffset_t a_offset)
 */
int
smbfs_getpages(struct vop_getpages_args *ap)
{
#ifdef SMBFS_RWGENERIC
	return vop_stdgetpages(ap);
#else
	int i, error, npages;
	int doclose;
	size_t size, toff, nextoff, count;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct thread *td = curthread;	/* XXX */
	struct ucred *cred;
	struct smbmount *smp;
	struct smbnode *np;
	struct smb_cred scred;
	vm_page_t *pages;

	KKASSERT(td->td_proc);

	vp = ap->a_vp;
	cred = td->td_proc->p_ucred;
	np = VTOSMB(vp);
	smp = VFSTOSMBFS(vp->v_mount);
	pages = ap->a_m;
	count = (size_t)ap->a_count;

	if (vp->v_object == NULL) {
		kprintf("smbfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}
	smb_makescred(&scred, td, cred);

	bp = getpbuf_kva(&smbfs_pbuf_freecnt);
	npages = btoc(count);
	kva = (vm_offset_t)bp->b_data;
	pmap_qenter(kva, pages, npages);

	iov.iov_base = (caddr_t)kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = td;

	/*
	 * This is kinda nasty.  Since smbfs is physically closing the
	 * fid on close(), we have to reopen it if necessary.  There are
	 * other races here too, such as if another process opens the same
	 * file while we are blocked in read. XXX
	 */
	error = 0;
	doclose = 0;
	if (np->n_opencount == 0) {
		error = smbfs_smb_open(np, SMB_AM_OPENREAD, &scred);
		if (error == 0)
			doclose = 1;
	}
	if (error == 0)
		error = smb_read(smp->sm_share, np->n_fid, &uio, &scred);
	if (doclose)
		smbfs_smb_close(smp->sm_share, np->n_fid, NULL, &scred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &smbfs_pbuf_freecnt);

	if (error && (uio.uio_resid == count)) {
		kprintf("smbfs_getpages: error %d\n", error);
		for (i = 0; i < npages; i++) {
			if (ap->a_reqpage != i)
				vnode_pager_freepage(pages[i]);
		}
		return VM_PAGER_ERROR;
	}

	size = count - uio.uio_resid;

	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;

		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		m->flags &= ~PG_ZERO;

		/*
		 * NOTE: The pmap dirty bit should have already been cleared.
		 *	 We do not clear it here.
		 */
		if (nextoff <= size) {
			m->valid = VM_PAGE_BITS_ALL;
			m->dirty = 0;
		} else {
			int nvalid = ((size + DEV_BSIZE - 1) - toff) &
			    ~(DEV_BSIZE - 1);
			vm_page_set_validclean(m, 0, nvalid);
		}

		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Result:
			 * empirically, deactivating the pages appears to be
			 * best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error) {
				if (m->flags & PG_REFERENCED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vnode_pager_freepage(m);
			}
		}
	}
	return 0;
#endif /* SMBFS_RWGENERIC */
}
/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_GETPAGES.
 *
 * With all the caching local media devices do these days there is really
 * very little point to attempting to restrict the I/O size to contiguous
 * blocks on-disk, especially if our caller thinks we need all the specified
 * pages.  Just construct and issue a READ.
 */
int
vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *mpp, int bytecount,
			     int reqpage, int seqaccess)
{
	struct iovec aiov;
	struct uio auio;
	off_t foff;
	int error;
	int count;
	int i;
	int ioflags;

	/*
	 * Do not do anything if the vnode is bad.
	 */
	if (vp->v_mount == NULL)
		return VM_PAGER_BAD;

	/*
	 * Calculate the number of pages.  Since we are paging in whole
	 * pages, adjust bytecount to be an integral multiple of the page
	 * size.  It will be clipped to the file EOF later on.
	 */
	bytecount = round_page(bytecount);
	count = bytecount / PAGE_SIZE;

	/*
	 * We could check m[reqpage]->valid here and shortcut the operation,
	 * but doing so breaks read-ahead.  Instead assume that the VM
	 * system has already done at least the check, don't worry about
	 * any races, and issue the VOP_READ to allow read-ahead to function.
	 *
	 * This keeps the pipeline full for I/O bound sequentially scanned
	 * mmap()'s
	 */
	/* don't shortcut */

	/*
	 * Discard pages past the file EOF.  If the requested page is past
	 * the file EOF we just leave its valid bits set to 0, the caller
	 * expects to maintain ownership of the requested page.  If the
	 * entire range is past file EOF discard everything and generate
	 * a pagein error.
	 */
	foff = IDX_TO_OFF(mpp[0]->pindex);
	if (foff >= vp->v_filesize) {
		for (i = 0; i < count; i++) {
			if (i != reqpage)
				vnode_pager_freepage(mpp[i]);
		}
		return VM_PAGER_ERROR;
	}

	if (foff + bytecount > vp->v_filesize) {
		bytecount = vp->v_filesize - foff;
		i = round_page(bytecount) / PAGE_SIZE;
		while (count > i) {
			--count;
			if (count != reqpage)
				vnode_pager_freepage(mpp[count]);
		}
	}

	/*
	 * The size of the transfer is bytecount.  bytecount will be an
	 * integral multiple of the page size unless it has been clipped
	 * to the file EOF.  The transfer cannot exceed the file EOF.
	 *
	 * When dealing with real devices we must round-up to the device
	 * sector size.
	 */
	if (vp->v_type == VBLK || vp->v_type == VCHR) {
		int secmask = vp->v_rdev->si_bsize_phys - 1;
		KASSERT(secmask < PAGE_SIZE,
		    ("vnode_pager_generic_getpages: sector size %d too large",
		     secmask + 1));
		bytecount = (bytecount + secmask) & ~secmask;
	}

	/*
	 * Severe hack to avoid deadlocks with the buffer cache
	 */
	for (i = 0; i < count; ++i) {
		vm_page_t mt = mpp[i];

		vm_page_io_start(mt);
		vm_page_wakeup(mt);
	}

	/*
	 * Issue the I/O with some read-ahead if bytecount > PAGE_SIZE
	 */
	ioflags = IO_VMIO;
	if (seqaccess)
		ioflags |= IO_SEQMAX << IO_SEQSHIFT;

	aiov.iov_base = NULL;
	aiov.iov_len = bytecount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = foff;
	auio.uio_segflg = UIO_NOCOPY;
	auio.uio_rw = UIO_READ;
	auio.uio_resid = bytecount;
	auio.uio_td = NULL;
	mycpu->gd_cnt.v_vnodein++;
	mycpu->gd_cnt.v_vnodepgsin += count;

	error = VOP_READ(vp, &auio, ioflags, proc0.p_ucred);

	/*
	 * Severe hack to avoid deadlocks with the buffer cache
	 */
	for (i = 0; i < count; ++i) {
		vm_page_busy_wait(mpp[i], FALSE, "getpgs");
		vm_page_io_finish(mpp[i]);
	}

	/*
	 * Calculate the actual number of bytes read and clean up the
	 * page list.
	 */
	bytecount -= auio.uio_resid;

	for (i = 0; i < count; ++i) {
		vm_page_t mt = mpp[i];

		if (i != reqpage) {
			if (error == 0 && mt->valid) {
				if (mt->flags & PG_REFERENCED)
					vm_page_activate(mt);
				else
					vm_page_deactivate(mt);
				vm_page_wakeup(mt);
			} else {
				vnode_pager_freepage(mt);
			}
		} else if (mt->valid == 0) {
			if (error == 0) {
				kprintf("page failed but no I/O error page "
					"%p object %p pindex %d\n",
					mt, mt->object, (int) mt->pindex);
				/* whoops, something happened */
				error = EINVAL;
			}
		} else if (mt->valid != VM_PAGE_BITS_ALL) {
			/*
			 * Zero-extend the requested page if necessary (if
			 * the filesystem is using a small block size).
			 */
			vm_page_zero_invalid(mt, TRUE);
		}
	}
	if (error) {
		kprintf("vnode_pager_getpage: I/O read error\n");
	}
	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
}
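/*
 * Editorial sketch, not part of the original sources: how a local media
 * filesystem that provides no getpages method of its own might forward the
 * request to vnode_pager_generic_getpages(), which is the situation the
 * comment at the top of that function describes.  The function name
 * myfs_getpages and the ap->a_seqaccess argument field are assumptions made
 * for this sketch; a real filesystem would pass whatever sequential-access
 * hint its VOP argument structure actually carries.
 */
static int
myfs_getpages(struct vop_getpages_args *ap)
{
	/* The generic pager clips to EOF and handles read-ahead itself. */
	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
	    ap->a_count, ap->a_reqpage, ap->a_seqaccess));
}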
/*
    struct vnop_getpages_args {
	struct vnode *a_vp;
	vm_page_t *a_m;
	int a_count;
	int a_reqpage;
	vm_ooffset_t a_offset;
    };
*/
static int
fuse_vnop_getpages(struct vop_getpages_args *ap)
{
	int i, error, nextoff, size, toff, count, npages;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct thread *td;
	struct ucred *cred;
	vm_page_t *pages;

	FS_DEBUG2G("heh\n");

	vp = ap->a_vp;
	KASSERT(vp->v_object, ("objectless vp passed to getpages"));
	td = curthread;			/* XXX */
	cred = curthread->td_ucred;	/* XXX */
	pages = ap->a_m;
	count = ap->a_count;

	if (!fsess_opt_mmap(vnode_mount(vp))) {
		FS_DEBUG("called on non-cacheable vnode??\n");
		return (VM_PAGER_ERROR);
	}
	npages = btoc(count);

	/*
	 * If the requested page is partially valid, just return it and
	 * allow the pager to zero-out the blanks.  Partially valid pages
	 * can only occur at the file EOF.
	 */
	VM_OBJECT_LOCK(vp->v_object);
	fuse_vm_page_lock_queues();
	if (pages[ap->a_reqpage]->valid != 0) {
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage) {
				fuse_vm_page_lock(pages[i]);
				vm_page_free(pages[i]);
				fuse_vm_page_unlock(pages[i]);
			}
		}
		fuse_vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(vp->v_object);
		return 0;
	}
	fuse_vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(vp->v_object);

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&fuse_pbuf_freecnt);

	kva = (vm_offset_t)bp->b_data;
	pmap_qenter(kva, pages, npages);
	PCPU_INC(cnt.v_vnodein);
	PCPU_ADD(cnt.v_vnodepgsin, npages);

	iov.iov_base = (caddr_t)kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = td;

	error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &fuse_pbuf_freecnt);

	if (error && (uio.uio_resid == count)) {
		FS_DEBUG("error %d\n", error);
		VM_OBJECT_LOCK(vp->v_object);
		fuse_vm_page_lock_queues();
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage) {
				fuse_vm_page_lock(pages[i]);
				vm_page_free(pages[i]);
				fuse_vm_page_unlock(pages[i]);
			}
		}
		fuse_vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(vp->v_object);
		return VM_PAGER_ERROR;
	}

	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */
	size = count - uio.uio_resid;
	VM_OBJECT_LOCK(vp->v_object);
	fuse_vm_page_lock_queues();
	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;

		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			KASSERT(m->dirty == 0,
			    ("fuse_getpages: page %p is dirty", m));
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page.
			 */
			m->valid = 0;
			vm_page_set_valid_range(m, 0, size - toff);
			KASSERT(m->dirty == 0,
			    ("fuse_getpages: page %p is dirty", m));
		} else {
			/*
			 * Read operation was short.  If no error occurred
			 * we may have hit a zero-fill section.  We simply
			 * leave valid set to 0.
			 */
			;
		}
		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Result:
			 * empirical results show that deactivating pages is
			 * best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error) {
				if (m->oflags & VPO_WANTED) {
					fuse_vm_page_lock(m);
					vm_page_activate(m);
					fuse_vm_page_unlock(m);
				} else {
					fuse_vm_page_lock(m);
					vm_page_deactivate(m);
					fuse_vm_page_unlock(m);
				}
				vm_page_wakeup(m);
			} else {
				fuse_vm_page_lock(m);
				vm_page_free(m);
				fuse_vm_page_unlock(m);
			}
		}
	}
	fuse_vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(vp->v_object);
	return 0;
}
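/*
 * Editorial sketch, not part of the original sources: the read-ahead page
 * disposition policy shared by the getpages implementations above, factored
 * into a hypothetical helper.  On success a non-requested page stays in the
 * object and goes onto a page queue (activated if it was referenced, or
 * wanted in the FUSE variant, deactivated otherwise); on error it is
 * released.  The helper name is an assumption for illustration only.
 */
static void
getpages_dispose_readahead(vm_page_t m, int error)
{
	if (error == 0) {
		if (m->flags & PG_REFERENCED)
			vm_page_activate(m);	/* likely to be used soon */
		else
			vm_page_deactivate(m);	/* let it age out normally */
		vm_page_wakeup(m);		/* clear busy for any waiters */
	} else {
		vnode_pager_freepage(m);	/* frees unless wired/busied */
	}
}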