vm_page_t shmem_read_mapping_page(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; int rv; VM_OBJECT_LOCK_ASSERT_OWNED(object); m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if (m->valid != VM_PAGE_BITS_ALL) { if (vm_pager_has_page(object, pindex)) { rv = vm_pager_get_page(object, &m, 1); m = vm_page_lookup(object, pindex); if (m == NULL) return ERR_PTR(-ENOMEM); if (rv != VM_PAGER_OK) { vm_page_free(m); return ERR_PTR(-ENOMEM); } } else { pmap_zero_page(VM_PAGE_TO_PHYS(m)); m->valid = VM_PAGE_BITS_ALL; m->dirty = 0; } } vm_page_wire(m); vm_page_wakeup(m); return (m); }
/* * Obtain a page pointer array and lock all pages into system memory. A segmentation violation will * occur here if the calling user does not have access to the submitted address. */ static int via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer) { unsigned long first_pfn = VIA_PFN(xfer->mem_addr); vm_page_t m; int i; vsg->num_pages = VIA_PFN(xfer->mem_addr + (xfer->num_lines * xfer->mem_stride -1)) - first_pfn + 1; if (NULL == (vsg->pages = malloc(sizeof(vm_page_t) * vsg->num_pages, DRM_MEM_DRIVER, M_NOWAIT))) return -ENOMEM; vsg->state = dr_via_pages_alloc; if (vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)xfer->mem_addr, vsg->num_pages * PAGE_SIZE, VM_PROT_READ | VM_PROT_WRITE, vsg->pages, vsg->num_pages) < 0) return -EACCES; for (i = 0; i < vsg->num_pages; i++) { m = vsg->pages[i]; vm_page_lock(m); vm_page_wire(m); vm_page_unhold(m); vm_page_unlock(m); } vsg->state = dr_via_pages_locked; DRM_DEBUG("DMA pages locked\n"); return 0; }
/* * Allocate new wired pages in an object. * The object is assumed to be mapped into the kernel map or * a submap. */ void kmem_alloc_pages( vm_object_t object, vm_offset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t protection) { /* * Mark the pmap region as not pageable. */ pmap_pageable(kernel_pmap, start, end, FALSE); while (start < end) { vm_page_t mem; vm_object_lock(object); /* * Allocate a page */ while ((mem = vm_page_alloc(object, offset)) == VM_PAGE_NULL) { vm_object_unlock(object); VM_PAGE_WAIT((void (*)()) 0); vm_object_lock(object); } /* * Wire it down */ vm_page_lock_queues(); vm_page_wire(mem); vm_page_unlock_queues(); vm_object_unlock(object); /* * Enter it in the kernel pmap */ PMAP_ENTER(kernel_pmap, start, mem, protection, TRUE); vm_object_lock(object); PAGE_WAKEUP_DONE(mem); vm_object_unlock(object); start += PAGE_SIZE; offset += PAGE_SIZE; } }
static void wire_ddp_buffer(struct ddp_buffer *db) { int i; vm_page_t p; for (i = 0; i < db->npages; i++) { p = db->pages[i]; vm_page_lock(p); vm_page_wire(p); vm_page_unhold(p); vm_page_unlock(p); } }
/* * Remap wired pages in an object into a new region. * The object is assumed to be mapped into the kernel map or * a submap. */ void kmem_remap_pages( vm_object_t object, vm_offset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t protection) { /* * Mark the pmap region as not pageable. */ pmap_pageable(kernel_pmap, start, end, FALSE); while (start < end) { vm_page_t mem; vm_object_lock(object); /* * Find a page */ if ((mem = vm_page_lookup(object, offset)) == VM_PAGE_NULL) panic("kmem_remap_pages"); /* * Wire it down (again) */ vm_page_lock_queues(); vm_page_wire(mem); vm_page_unlock_queues(); vm_object_unlock(object); /* * Enter it in the kernel pmap. The page isn't busy, * but this shouldn't be a problem because it is wired. */ PMAP_ENTER(kernel_pmap, start, mem, protection, TRUE); start += PAGE_SIZE; offset += PAGE_SIZE; } }
/* * Given a user pointer to a page of user memory, return an sf_buf for the * page. Because we may be requesting quite a few sf_bufs, prefer failure to * deadlock and use SFB_NOWAIT. */ static struct sf_buf * zbuf_sfbuf_get(struct vm_map *map, vm_offset_t uaddr) { struct sf_buf *sf; vm_page_t pp; if (vm_fault_quick_hold_pages(map, uaddr, PAGE_SIZE, VM_PROT_READ | VM_PROT_WRITE, &pp, 1) < 0) return (NULL); vm_page_lock(pp); vm_page_wire(pp); vm_page_unhold(pp); vm_page_unlock(pp); sf = sf_buf_alloc(pp, SFB_NOWAIT); if (sf == NULL) { zbuf_page_free(pp); return (NULL); } return (sf); }
int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { vm_prot_t prot; long ahead, behind; int alloc_req, era, faultcount, nera, reqpage, result; boolean_t growstack, is_first_object_locked, wired; int map_generation; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ_MAX]; int hardfault; struct faultstate fs; struct vnode *vp; int locked, error; hardfault = 0; growstack = TRUE; PCPU_INC(cnt.v_vm_faults); fs.vp = NULL; faultcount = reqpage = 0; RetryFault:; /* * Find the backing store object and offset into it to begin the * search. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { if (growstack && result == KERN_INVALID_ADDRESS && map != kernel_map) { result = vm_map_growstack(curproc, vaddr); if (result != KERN_SUCCESS) return (KERN_FAILURE); growstack = FALSE; goto RetryFault; } return (result); } map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { panic("vm_fault: fault on nofault entry, addr: %lx", (u_long)vaddr); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. This must be done after * obtaining the vnode lock in order to avoid possible deadlocks. */ VM_OBJECT_WLOCK(fs.first_object); vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.lookup_still_valid = TRUE; if (wired) fault_type = prot | (fault_type & VM_PROT_COPY); fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { /* * If the object is dead, we stop here */ if (fs.object->flags & OBJ_DEAD) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { /* * check for page-based copy on write. * We check fs.object == fs.first_object so * as to ensure the legacy COW mechanism is * used when the page in question is part of * a shadow object. Otherwise, vm_page_cowfault() * removes the page from the backing object, * which is not what we want. */ vm_page_lock(fs.m); if ((fs.m->cow) && (fault_type & VM_PROT_WRITE) && (fs.object == fs.first_object)) { vm_page_cowfault(fs.m); unlock_and_deallocate(&fs); goto RetryFault; } /* * Wait/Retry if the page is busy. We have to do this * if the page is busy via either VPO_BUSY or * vm_page_t->busy because the vm_pager may be using * vm_page_t->busy for pageouts ( and even pageins if * it is the vnode pager ), and we could end up trying * to pagein and pageout the same page simultaneously. * * We can theoretically allow the busy case on a read * fault if the page is marked valid, but since such * pages are typically already pmap'd, putting that * special case in might be more effort then it is * worth. We cannot under any circumstances mess * around with a vm_page_t->busy page except, perhaps, * to pmap it. */ if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(fs.m, PGA_REFERENCED); vm_page_unlock(fs.m); if (fs.object != fs.first_object) { if (!VM_OBJECT_TRYWLOCK( fs.first_object)) { VM_OBJECT_WUNLOCK(fs.object); VM_OBJECT_WLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.object); } vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); vm_object_pip_wakeup(fs.first_object); VM_OBJECT_WUNLOCK(fs.first_object); fs.first_m = NULL; } unlock_map(&fs); if (fs.m == vm_page_lookup(fs.object, fs.pindex)) { vm_page_sleep_if_busy(fs.m, TRUE, "vmpfw"); } vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); PCPU_INC(cnt.v_intrans); vm_object_deallocate(fs.first_object); goto RetryFault; } vm_page_remque(fs.m); vm_page_unlock(fs.m); /* * Mark page busy for other processes, and the * pagedaemon. If it still isn't completely valid * (readable), jump to readrest, else break-out ( we * found the page ). */ vm_page_busy(fs.m); if (fs.m->valid != VM_PAGE_BITS_ALL) goto readrest; break; } /* * Page is not resident, If this is the search termination * or the pager might contain the page, allocate a new page. */ if (TRYPAGER || fs.object == fs.first_object) { if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At * worst, the P_KILLED might be not observed * there, and allocation can fail, causing * restart and new reading of the p_flag. */ fs.m = NULL; if (!vm_page_count_severe() || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 if ((fs.object->flags & OBJ_COLORED) == 0) { fs.object->flags |= OBJ_COLORED; fs.object->pg_color = atop(vaddr) - fs.pindex; } #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs.object->type != OBJT_VNODE && fs.object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; fs.m = vm_page_alloc(fs.object, fs.pindex, alloc_req); } if (fs.m == NULL) { unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; } else if (fs.m->valid == VM_PAGE_BITS_ALL) break; } readrest: /* * We have found a valid page or we have allocated a new page. * The page thus may not be valid or may not be entirely * valid. * * Attempt to fault-in the page if there is a chance that the * pager has it, and potentially fault in additional pages * at the same time. */ if (TRYPAGER) { int rv; u_char behavior = vm_map_entry_behavior(fs.entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { behind = 0; ahead = atop(fs.entry->end - vaddr) - 1; if (ahead > VM_FAULT_READ_AHEAD_MAX) ahead = VM_FAULT_READ_AHEAD_MAX; if (fs.pindex == fs.entry->next_read) vm_fault_cache_behind(&fs, VM_FAULT_READ_MAX); } else { /* * If this is a sequential page fault, then * arithmetically increase the number of pages * in the read-ahead window. Otherwise, reset * the read-ahead window to its smallest size. */ behind = atop(vaddr - fs.entry->start); if (behind > VM_FAULT_READ_BEHIND) behind = VM_FAULT_READ_BEHIND; ahead = atop(fs.entry->end - vaddr) - 1; era = fs.entry->read_ahead; if (fs.pindex == fs.entry->next_read) { nera = era + behind; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; behind = 0; if (ahead > nera) ahead = nera; if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_cache_behind(&fs, VM_FAULT_CACHE_BEHIND); } else if (ahead > VM_FAULT_READ_AHEAD_MIN) ahead = VM_FAULT_READ_AHEAD_MIN; if (era != ahead) fs.entry->read_ahead = ahead; } /* * Call the pager to retrieve the data, if any, after * releasing the lock on the map. We hold a ref on * fs.object and the pages are VPO_BUSY'd. */ unlock_map(&fs); if (fs.object->type == OBJT_VNODE) { vp = fs.object->handle; if (vp == fs.vp) goto vnode_locked; else if (fs.vp != NULL) { vput(fs.vp); fs.vp = NULL; } locked = VOP_ISLOCKED(vp); if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* Do not sleep for vnode lock while fs.m is busy */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error != 0) { vhold(vp); release_page(&fs); unlock_and_deallocate(&fs); error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); vdrop(vp); fs.vp = vp; KASSERT(error == 0, ("vm_fault: vget failed")); goto RetryFault; } fs.vp = vp; } vnode_locked: KASSERT(fs.vp == NULL || !fs.map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * now we find out if any other pages should be paged * in at this time this routine checks to see if the * pages surrounding this fault reside in the same * object as the page for this fault. If they do, * then they are faulted in also into the object. The * array "marray" returned contains an array of * vm_page_t structs where one of them is the * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. * * fs.m plus the additional pages are VPO_BUSY'd. */ faultcount = vm_fault_additional_pages( fs.m, behind, ahead, marray, &reqpage); rv = faultcount ? vm_pager_get_pages(fs.object, marray, faultcount, reqpage) : VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. Leave it busy while we play * with it. */ /* * Relookup in case pager changed page. Pager * is responsible for disposition of old page * if moved. */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (!fs.m) { unlock_and_deallocate(&fs); goto RetryFault; } hardfault++; break; /* break to PAGE HAS BEEN FOUND */ } /* * Remove the bogus page (which does not exist at this * object/offset); before doing so, we must get back * our object lock to preserve our invariant. * * Also wake up any other process that may want to bring * in this page. * * If this is the top-level object, we must leave the * busy page to prevent another process from rushing * past us, and inserting the page in that object at * the same time that we are. */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * Data outside the range of the pager or an I/O error */ /* * XXX - the check for kernel_map is a kludge to work * around having the machine panic on a kernel space * fault w/ I/O error. */ if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE); } if (fs.object != fs.first_object) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! */ } } /* * We get here if the object has default pager (or unwiring) * or the pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; VM_OBJECT_WLOCK(fs.object); } fs.first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs.m->flags & PG_ZERO) == 0) { pmap_zero_page(fs.m); } else { PCPU_INC(cnt.v_ozfod); } PCPU_INC(cnt.v_zfod); fs.m->valid = VM_PAGE_BITS_ALL; break; /* break to PAGE HAS BEEN FOUND */ } else { KASSERT(fs.object != next_object, ("object loop %p", next_object)); VM_OBJECT_WLOCK(next_object); vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = next_object; } } KASSERT((fs.m->oflags & VPO_BUSY) != 0, ("vm_fault: not busy after main loop")); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { /* * This allows pages to be virtually copied from a * backing_object into the first_object, where the * backing object has no other refs to it, and cannot * gain any more refs. Instead of a bcopy, we just * move the page from the backing object to the * first object. Note that we must mark the page * dirty in the first object so that it will go out * to swap when needed. */ is_first_object_locked = FALSE; if ( /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * No one else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ ((fs.object->type == OBJT_DEFAULT) || (fs.object->type == OBJT_SWAP)) && (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) && /* * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { /* * get rid of the unnecessary page */ vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); /* * grab the page and put it into the * process'es object. The page is * automatically made dirty. */ vm_page_lock(fs.m); vm_page_rename(fs.m, fs.first_object, fs.first_pindex); vm_page_unlock(fs.m); vm_page_busy(fs.m); fs.first_m = fs.m; fs.m = NULL; PCPU_INC(cnt.v_cow_optim); } else { /* * Oh, well, lets copy it. */ pmap_copy_page(fs.m, fs.first_m); fs.first_m->valid = VM_PAGE_BITS_ALL; if (wired && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) { vm_page_lock(fs.first_m); vm_page_wire(fs.first_m); vm_page_unlock(fs.first_m); vm_page_lock(fs.m); vm_page_unwire(fs.m, FALSE); vm_page_unlock(fs.m); } /* * We no longer need the old page or object. */ release_page(&fs); } /* * fs.object != fs.first_object due to above * conditional */ vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); /* * Only use the new page below... */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; if (!is_first_object_locked) VM_OBJECT_WLOCK(fs.object); PCPU_INC(cnt.v_cow_faults); curthread->td_cow++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } fs.lookup_still_valid = TRUE; if (fs.map->timestamp != map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); /* * If we don't need the page any longer, put it on the inactive * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) goto RetryFault; return (result); } if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } } /* * If the page was filled by a pager, update the map entry's * last read offset. Since the pager does not return the * actual set of pages that it read, this update is based on * the requested set. Typically, the requested and actual * sets are the same. * * XXX The following assignment modifies the map * without holding a write lock on it. */ if (hardfault) fs.entry->next_read = fs.pindex + faultcount - reqpage; if ((prot & VM_PROT_WRITE) != 0 || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_object_set_writeable_dirty(fs.object); /* * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if (fs.entry->eflags & MAP_ENTRY_NOSYNC) { if (fs.m->dirty == 0) fs.m->oflags |= VPO_NOSYNC; } else { fs.m->oflags &= ~VPO_NOSYNC; } /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also tell the backing pager, if any, that it should remove * any swap backing since the page is now dirty. */ if (((fault_type & VM_PROT_WRITE) != 0 && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_page_dirty(fs.m); vm_pager_page_unswapped(fs.m); } } /* * Page had better still be busy */ KASSERT(fs.m->oflags & VPO_BUSY, ("vm_fault: page %p not busy!", fs.m)); /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ KASSERT(fs.m->valid == VM_PAGE_BITS_ALL, ("vm_fault: page %p partially invalid", fs.m)); VM_OBJECT_WUNLOCK(fs.object); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired); if ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 && wired == 0) vm_fault_prefault(fs.map->pmap, vaddr, fs.entry); VM_OBJECT_WLOCK(fs.object); vm_page_lock(fs.m); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if (fault_flags & VM_FAULT_CHANGE_WIRING) { if (wired) vm_page_wire(fs.m); else vm_page_unwire(fs.m, 1); } else vm_page_activate(fs.m); if (m_hold != NULL) { *m_hold = fs.m; vm_page_hold(fs.m); } vm_page_unlock(fs.m); vm_page_wakeup(fs.m); /* * Unlock everything, and return */ unlock_and_deallocate(&fs); if (hardfault) { PCPU_INC(cnt.v_io_faults); curthread->td_ru.ru_majflt++; } else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); }
/* * Routine: * vm_fault_copy_entry * Function: * Create new shadow object backing dst_entry with private copy of * all underlying pages. When src_entry is equal to dst_entry, * function implements COW for wired-down map entry. Otherwise, * it forks wired entry into dst_map. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, vm_map_entry_t dst_entry, vm_map_entry_t src_entry, vm_ooffset_t *fork_charge) { vm_object_t backing_object, dst_object, object, src_object; vm_pindex_t dst_pindex, pindex, src_pindex; vm_prot_t access, prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; boolean_t src_readonly, upgrade; #ifdef lint src_map++; #endif /* lint */ upgrade = src_entry == dst_entry; src_object = src_entry->object.vm_object; src_pindex = OFF_TO_IDX(src_entry->offset); src_readonly = (src_entry->protection & VM_PROT_WRITE) == 0; /* * Create the top-level object for the destination entry. (Doesn't * actually shadow anything - we copy the pages directly.) */ dst_object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(dst_entry->end - dst_entry->start)); #if VM_NRESERVLEVEL > 0 dst_object->flags |= OBJ_COLORED; dst_object->pg_color = atop(dst_entry->start); #endif VM_OBJECT_WLOCK(dst_object); KASSERT(upgrade || dst_entry->object.vm_object == NULL, ("vm_fault_copy_entry: vm_object not NULL")); dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; dst_object->charge = dst_entry->end - dst_entry->start; if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, ("vm_fault_copy_entry: leaked swp charge")); dst_object->cred = curthread->td_ucred; crhold(dst_object->cred); *fork_charge += dst_object->charge; } else { dst_object->cred = dst_entry->cred; dst_entry->cred = NULL; } access = prot = dst_entry->protection; /* * If not an upgrade, then enter the mappings in the pmap as * read and/or execute accesses. Otherwise, enter them as * write accesses. * * A writeable large page mapping is only created if all of * the constituent small page mappings are modified. Marking * PTEs as modified on inception allows promotion to happen * without taking potentially large number of soft faults. */ if (!upgrade) access &= ~VM_PROT_WRITE; /* * Loop through all of the virtual pages within the entry's * range, copying each page from the source object to the * destination object. Since the source is wired, those pages * must exist. In contrast, the destination is pageable. * Since the destination object does share any backing storage * with the source object, all of its pages must be dirtied, * regardless of whether they can be written. */ for (vaddr = dst_entry->start, dst_pindex = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_pindex++) { /* * Allocate a page in the destination object. */ do { dst_m = vm_page_alloc(dst_object, dst_pindex, VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_OBJECT_WUNLOCK(dst_object); VM_WAIT; VM_OBJECT_WLOCK(dst_object); } } while (dst_m == NULL); /* * Find the page in the source object, and copy it in. * (Because the source is wired down, the page will be in * memory.) */ VM_OBJECT_WLOCK(src_object); object = src_object; pindex = src_pindex + dst_pindex; while ((src_m = vm_page_lookup(object, pindex)) == NULL && src_readonly && (backing_object = object->backing_object) != NULL) { /* * Allow fallback to backing objects if we are reading. */ VM_OBJECT_WLOCK(backing_object); pindex += OFF_TO_IDX(object->backing_object_offset); VM_OBJECT_WUNLOCK(object); object = backing_object; } if (src_m == NULL) panic("vm_fault_copy_wired: page missing"); pmap_copy_page(src_m, dst_m); VM_OBJECT_WUNLOCK(object); dst_m->valid = VM_PAGE_BITS_ALL; dst_m->dirty = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(dst_object); /* * Enter it in the pmap. If a wired, copy-on-write * mapping is being replaced by a write-enabled * mapping, then wire that new mapping. */ pmap_enter(dst_map->pmap, vaddr, access, dst_m, prot, upgrade); /* * Mark it no longer busy, and put it on the active list. */ VM_OBJECT_WLOCK(dst_object); if (upgrade) { vm_page_lock(src_m); vm_page_unwire(src_m, 0); vm_page_unlock(src_m); vm_page_lock(dst_m); vm_page_wire(dst_m); vm_page_unlock(dst_m); } else { vm_page_lock(dst_m); vm_page_activate(dst_m); vm_page_unlock(dst_m); } vm_page_wakeup(dst_m); } VM_OBJECT_WUNLOCK(dst_object); if (upgrade) { dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY); vm_object_deallocate(src_object); } }
int socow_setup(struct mbuf *m0, struct uio *uio) { struct sf_buf *sf; vm_page_t pp; struct iovec *iov; struct vmspace *vmspace; struct vm_map *map; vm_offset_t offset, uva; socow_stats.attempted++; vmspace = curproc->p_vmspace; map = &vmspace->vm_map; uva = (vm_offset_t) uio->uio_iov->iov_base; offset = uva & PAGE_MASK; /* * Verify that access to the given address is allowed from user-space. */ if (vm_fault_quick((caddr_t)uva, VM_PROT_READ) < 0) return (0); /* * verify page is mapped & not already wired for i/o */ pp = pmap_extract_and_hold(map->pmap, uva, VM_PROT_READ); if (pp == NULL) { socow_stats.fail_not_mapped++; return(0); } /* * set up COW */ vm_page_lock(pp); if (vm_page_cowsetup(pp) != 0) { vm_page_unhold(pp); vm_page_unlock(pp); return (0); } /* * wire the page for I/O */ vm_page_wire(pp); vm_page_unhold(pp); vm_page_unlock(pp); /* * Allocate an sf buf */ sf = sf_buf_alloc(pp, SFB_CATCH); if (sf == NULL) { vm_page_lock(pp); vm_page_cowclear(pp); vm_page_unwire(pp, 0); /* * Check for the object going away on us. This can * happen since we don't hold a reference to it. * If so, we're responsible for freeing the page. */ if (pp->wire_count == 0 && pp->object == NULL) vm_page_free(pp); vm_page_unlock(pp); socow_stats.fail_sf_buf++; return(0); } /* * attach to mbuf */ MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, socow_iodone, (void*)sf_buf_kva(sf), sf, M_RDONLY, EXT_SFBUF); m0->m_len = PAGE_SIZE - offset; m0->m_data = (caddr_t)sf_buf_kva(sf) + offset; socow_stats.success++; iov = uio->uio_iov; iov->iov_base = (char *)iov->iov_base + m0->m_len; iov->iov_len -= m0->m_len; uio->uio_resid -= m0->m_len; uio->uio_offset += m0->m_len; if (iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; } return(m0->m_len); }
int proc_rwmem(struct proc *p, struct uio *uio) { struct vmspace *vm; vm_map_t map; vm_object_t object = NULL; vm_offset_t pageno = 0; /* page number */ vm_prot_t reqprot; vm_offset_t kva; int error, writing; GIANT_REQUIRED; /* * if the vmspace is in the midst of being deallocated or the * process is exiting, don't try to grab anything. The page table * usage in that process can be messed up. */ vm = p->p_vmspace; if ((p->p_flag & P_WEXIT)) return (EFAULT); if (vm->vm_refcnt < 1) return (EFAULT); ++vm->vm_refcnt; /* * The map we want... */ map = &vm->vm_map; writing = uio->uio_rw == UIO_WRITE; reqprot = writing ? (VM_PROT_WRITE | VM_PROT_OVERRIDE_WRITE) : VM_PROT_READ; kva = kmem_alloc_pageable(kernel_map, PAGE_SIZE); /* * Only map in one page at a time. We don't have to, but it * makes things easier. This way is trivial - right? */ do { vm_map_t tmap; vm_offset_t uva; int page_offset; /* offset into page */ vm_map_entry_t out_entry; vm_prot_t out_prot; boolean_t wired; vm_pindex_t pindex; u_int len; vm_page_t m; object = NULL; uva = (vm_offset_t)uio->uio_offset; /* * Get the page number of this segment. */ pageno = trunc_page(uva); page_offset = uva - pageno; /* * How many bytes to copy */ len = min(PAGE_SIZE - page_offset, uio->uio_resid); /* * Fault the page on behalf of the process */ error = vm_fault(map, pageno, reqprot, VM_FAULT_NORMAL); if (error) { error = EFAULT; break; } /* * Now we need to get the page. out_entry, out_prot, wired, * and single_use aren't used. One would think the vm code * would be a *bit* nicer... We use tmap because * vm_map_lookup() can change the map argument. */ tmap = map; error = vm_map_lookup(&tmap, pageno, reqprot, &out_entry, &object, &pindex, &out_prot, &wired); if (error) { error = EFAULT; /* * Make sure that there is no residue in 'object' from * an error return on vm_map_lookup. */ object = NULL; break; } m = vm_page_lookup(object, pindex); /* Allow fallback to backing objects if we are reading */ while (m == NULL && !writing && object->backing_object) { pindex += OFF_TO_IDX(object->backing_object_offset); object = object->backing_object; m = vm_page_lookup(object, pindex); } if (m == NULL) { error = EFAULT; /* * Make sure that there is no residue in 'object' from * an error return on vm_map_lookup. */ object = NULL; vm_map_lookup_done(tmap, out_entry); break; } /* * Wire the page into memory */ vm_page_lock_queues(); vm_page_wire(m); vm_page_unlock_queues(); /* * We're done with tmap now. * But reference the object first, so that we won't loose * it. */ vm_object_reference(object); vm_map_lookup_done(tmap, out_entry); pmap_qenter(kva, &m, 1); /* * Now do the i/o move. */ error = uiomove((caddr_t)(kva + page_offset), len, uio); pmap_qremove(kva, 1); /* * release the page and the object */ vm_page_lock_queues(); vm_page_unwire(m, 1); vm_page_unlock_queues(); vm_object_deallocate(object); object = NULL; } while (error == 0 && uio->uio_resid > 0); if (object) vm_object_deallocate(object); kmem_free(kernel_map, kva, PAGE_SIZE); vmspace_free(vm); return (error); }
/* * vm_contig_pg_alloc: * * Allocate contiguous pages from the VM. This function does not * map the allocated pages into the kernel map, otherwise it is * impossible to make large allocations (i.e. >2G). * * Malloc()'s data structures have been used for collection of * statistics and for allocations of less than a page. */ static int vm_contig_pg_alloc(unsigned long size, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, unsigned long boundary, int mflags) { int i, q, start, pass; vm_offset_t phys; vm_page_t pga = vm_page_array; vm_page_t m; int pqtype; size = round_page(size); if (size == 0) panic("vm_contig_pg_alloc: size must not be 0"); if ((alignment & (alignment - 1)) != 0) panic("vm_contig_pg_alloc: alignment must be a power of 2"); if ((boundary & (boundary - 1)) != 0) panic("vm_contig_pg_alloc: boundary must be a power of 2"); /* * See if we can get the pages from the contiguous page reserve * alist. The returned pages will be allocated and wired but not * busied. */ m = vm_page_alloc_contig(low, high, alignment, boundary, size); if (m) return (m - &pga[0]); /* * Three passes (0, 1, 2). Each pass scans the VM page list for * free or cached pages. After each pass if the entire scan failed * we attempt to flush inactive pages and reset the start index back * to 0. For passes 1 and 2 we also attempt to flush active pages. */ start = 0; for (pass = 0; pass < 3; pass++) { /* * Find first page in array that is free, within range, * aligned, and such that the boundary won't be crossed. */ again: for (i = start; i < vmstats.v_page_count; i++) { m = &pga[i]; phys = VM_PAGE_TO_PHYS(m); pqtype = m->queue - m->pc; if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) && (phys >= low) && (phys < high) && ((phys & (alignment - 1)) == 0) && (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0) && m->busy == 0 && m->wire_count == 0 && m->hold_count == 0 && (m->flags & (PG_BUSY | PG_NEED_COMMIT)) == 0) { break; } } /* * If we cannot find the page in the given range, or we have * crossed the boundary, call the vm_contig_pg_clean() function * for flushing out the queues, and returning it back to * normal state. */ if ((i == vmstats.v_page_count) || ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) { /* * Best effort flush of all inactive pages. * This is quite quick, for now stall all * callers, even if they've specified M_NOWAIT. */ for (q = 0; q < PQ_L2_SIZE; ++q) { vm_contig_pg_clean(PQ_INACTIVE + q, vmstats.v_inactive_count); lwkt_yield(); } /* * Best effort flush of active pages. * * This is very, very slow. * Only do this if the caller has agreed to M_WAITOK. * * If enough pages are flushed, we may succeed on * next (final) pass, if not the caller, contigmalloc(), * will fail in the index < 0 case. */ if (pass > 0 && (mflags & M_WAITOK)) { for (q = 0; q < PQ_L2_SIZE; ++q) { vm_contig_pg_clean(PQ_ACTIVE + q, vmstats.v_active_count); } lwkt_yield(); } /* * We're already too high in the address space * to succeed, reset to 0 for the next iteration. */ start = 0; continue; /* next pass */ } start = i; /* * Check successive pages for contiguous and free. * * (still in critical section) */ for (i = start + 1; i < (start + size / PAGE_SIZE); i++) { m = &pga[i]; pqtype = m->queue - m->pc; if ((VM_PAGE_TO_PHYS(&m[0]) != (VM_PAGE_TO_PHYS(&m[-1]) + PAGE_SIZE)) || ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE)) || m->busy || m->wire_count || m->hold_count || (m->flags & (PG_BUSY | PG_NEED_COMMIT))) { start++; goto again; } } /* * Try to allocate the pages, wiring them as we go. * * (still in critical section) */ for (i = start; i < (start + size / PAGE_SIZE); i++) { m = &pga[i]; if (vm_page_busy_try(m, TRUE)) { vm_contig_pg_free(start, (i - start) * PAGE_SIZE); start++; goto again; } pqtype = m->queue - m->pc; if (pqtype == PQ_CACHE && m->hold_count == 0 && m->wire_count == 0 && (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) == 0) { vm_page_protect(m, VM_PROT_NONE); KKASSERT((m->flags & PG_MAPPED) == 0); KKASSERT(m->dirty == 0); vm_page_free(m); --i; continue; /* retry the page */ } if (pqtype != PQ_FREE || m->hold_count) { vm_page_wakeup(m); vm_contig_pg_free(start, (i - start) * PAGE_SIZE); start++; goto again; } KKASSERT((m->valid & m->dirty) == 0); KKASSERT(m->wire_count == 0); KKASSERT(m->object == NULL); vm_page_unqueue_nowakeup(m); m->valid = VM_PAGE_BITS_ALL; if (m->flags & PG_ZERO) vm_page_zero_count--; KASSERT(m->dirty == 0, ("vm_contig_pg_alloc: page %p was dirty", m)); KKASSERT(m->wire_count == 0); KKASSERT(m->busy == 0); /* * Clear all flags except PG_BUSY, PG_ZERO, and * PG_WANTED, then unbusy the now allocated page. */ vm_page_flag_clear(m, ~(PG_BUSY | PG_SBUSY | PG_ZERO | PG_WANTED)); vm_page_wire(m); vm_page_wakeup(m); } /* * Our job is done, return the index page of vm_page_array. */ return (start); /* aka &pga[start] */ } /* * Failed. */ return (-1); }