/*
 * No requirements.
 */
int
useracc(c_caddr_t addr, int len, int rw)
{
	boolean_t rv;
	vm_prot_t prot;
	vm_map_t map;
	vm_map_entry_t save_hint;
	vm_offset_t wrap;

	KASSERT((rw & (~VM_PROT_ALL)) == 0,
	    ("illegal ``rw'' argument to useracc (%x)", rw));
	prot = rw;

	/*
	 * XXX - check separately to disallow access to user area and user
	 * page tables - they are in the map.
	 */
	wrap = (vm_offset_t)addr + len;
	if (wrap > VM_MAX_USER_ADDRESS || wrap < (vm_offset_t)addr) {
		return (FALSE);
	}
	map = &curproc->p_vmspace->vm_map;
	vm_map_lock_read(map);

	/*
	 * We save the map hint, and restore it.  Useracc appears to distort
	 * the map hint unnecessarily.
	 */
	save_hint = map->hint;
	rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
	    round_page(wrap), prot, TRUE);
	map->hint = save_hint;
	vm_map_unlock_read(map);

	return (rv == TRUE);
}
static void
get_proc_size_info(struct lwp *l, unsigned long *stext, unsigned long *etext,
    unsigned long *sstack)
{
	struct proc *p = l->l_proc;
	struct vmspace *vm;
	struct vm_map *map;
	struct vm_map_entry *entry;

	*stext = 0;
	*etext = 0;
	*sstack = 0;

	proc_vmspace_getref(p, &vm);
	map = &vm->vm_map;
	vm_map_lock_read(map);

	for (entry = map->header.next; entry != &map->header;
	    entry = entry->next) {
		if (UVM_ET_ISSUBMAP(entry))
			continue;
		/* assume text is the first entry */
		if (*stext == *etext) {
			*stext = entry->start;
			*etext = entry->end;
			break;
		}
	}
#if defined(LINUX_USRSTACK32) && defined(USRSTACK32)
	if (strcmp(p->p_emul->e_name, "linux32") == 0 &&
	    LINUX_USRSTACK32 < USRSTACK32)
		*sstack = (unsigned long)LINUX_USRSTACK32;
	else
#endif
#ifdef LINUX_USRSTACK
	if (strcmp(p->p_emul->e_name, "linux") == 0 &&
	    LINUX_USRSTACK < USRSTACK)
		*sstack = (unsigned long)LINUX_USRSTACK;
	else
#endif
#ifdef USRSTACK32
	if (strstr(p->p_emul->e_name, "32") != NULL)
		*sstack = (unsigned long)USRSTACK32;
	else
#endif
		*sstack = (unsigned long)USRSTACK;

	/*
	 * jdk 1.6 compares low <= addr && addr < high
	 * if we put addr == high, then the test fails
	 * so eat one page.
	 */
	*sstack -= PAGE_SIZE;

	vm_map_unlock_read(map);
	uvmspace_free(vm);
}
static int
sysctl_kmem_map_free(SYSCTL_HANDLER_ARGS)
{
	u_long size;

	vm_map_lock_read(kmem_map);
	size = kmem_map->root != NULL ? kmem_map->root->max_free :
	    kmem_map->max_offset - kmem_map->min_offset;
	vm_map_unlock_read(kmem_map);

	return (sysctl_handle_long(oidp, &size, 0, req));
}
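/*
 * Illustrative user-space counterpart (not part of the kernel sources
 * above): a minimal sketch that reads a long-valued sysctl such as the
 * one the handler above would export.  The OID name "vm.kmem_map_free"
 * is an assumption -- the snippet does not show how the handler is
 * registered.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	unsigned long freebytes;
	size_t len = sizeof(freebytes);

	/* Query the free-space figure computed under the map read lock. */
	if (sysctlbyname("vm.kmem_map_free", &freebytes, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (EXIT_FAILURE);
	}
	printf("kmem_map free: %lu bytes\n", freebytes);
	return (EXIT_SUCCESS);
}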
/*
 * No requirements.
 */
int
useracc(c_caddr_t addr, int len, int rw)
{
	boolean_t rv;
	vm_prot_t prot;
	vm_map_t map;
	vm_offset_t wrap;
	vm_offset_t gpa;

	KASSERT((rw & (~VM_PROT_ALL)) == 0,
	    ("illegal ``rw'' argument to useracc (%x)", rw));
	prot = rw;

	if (curthread->td_vmm) {
		if (vmm_vm_get_gpa(curproc, (register_t *)&gpa, (register_t) addr))
			panic("%s: could not get GPA\n", __func__);
		addr = (c_caddr_t) gpa;
	}

	/*
	 * XXX - check separately to disallow access to user area and user
	 * page tables - they are in the map.
	 */
	wrap = (vm_offset_t)addr + len;
	if (wrap > VM_MAX_USER_ADDRESS || wrap < (vm_offset_t)addr) {
		return (FALSE);
	}
	map = &curproc->p_vmspace->vm_map;
	vm_map_lock_read(map);
	rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
	    round_page(wrap), prot, TRUE);
	vm_map_unlock_read(map);

	return (rv == TRUE);
}
int fill_procregioninfo(task_t task, uint64_t arg, struct proc_regioninfo_internal *pinfo, uint32_t *vnodeaddr, uint32_t *vid) { vm_map_t map; vm_map_offset_t address = (vm_map_offset_t )arg; vm_map_entry_t tmp_entry; vm_map_entry_t entry; vm_map_offset_t start; vm_region_extended_info_data_t extended; vm_region_top_info_data_t top; task_lock(task); map = task->map; if (map == VM_MAP_NULL) { task_unlock(task); return(0); } vm_map_reference(map); task_unlock(task); vm_map_lock_read(map); start = address; if (!vm_map_lookup_entry(map, start, &tmp_entry)) { if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { vm_map_unlock_read(map); vm_map_deallocate(map); return(0); } } else { entry = tmp_entry; } start = entry->vme_start; pinfo->pri_offset = entry->offset; pinfo->pri_protection = entry->protection; pinfo->pri_max_protection = entry->max_protection; pinfo->pri_inheritance = entry->inheritance; pinfo->pri_behavior = entry->behavior; pinfo->pri_user_wired_count = entry->user_wired_count; pinfo->pri_user_tag = entry->alias; if (entry->is_sub_map) { pinfo->pri_flags |= PROC_REGION_SUBMAP; } else { if (entry->is_shared) pinfo->pri_flags |= PROC_REGION_SHARED; } extended.protection = entry->protection; extended.user_tag = entry->alias; extended.pages_resident = 0; extended.pages_swapped_out = 0; extended.pages_shared_now_private = 0; extended.pages_dirtied = 0; extended.external_pager = 0; extended.shadow_depth = 0; vm_map_region_walk(map, start, entry, entry->offset, entry->vme_end - start, &extended); if (extended.external_pager && extended.ref_count == 2 && extended.share_mode == SM_SHARED) extended.share_mode = SM_PRIVATE; top.private_pages_resident = 0; top.shared_pages_resident = 0; vm_map_region_top_walk(entry, &top); pinfo->pri_pages_resident = extended.pages_resident; pinfo->pri_pages_shared_now_private = extended.pages_shared_now_private; pinfo->pri_pages_swapped_out = extended.pages_swapped_out; pinfo->pri_pages_dirtied = extended.pages_dirtied; pinfo->pri_ref_count = extended.ref_count; pinfo->pri_shadow_depth = extended.shadow_depth; pinfo->pri_share_mode = extended.share_mode; pinfo->pri_private_pages_resident = top.private_pages_resident; pinfo->pri_shared_pages_resident = top.shared_pages_resident; pinfo->pri_obj_id = top.obj_id; pinfo->pri_address = (uint64_t)start; pinfo->pri_size = (uint64_t)(entry->vme_end - start); pinfo->pri_depth = 0; if ((vnodeaddr != 0) && (entry->is_sub_map == 0)) { *vnodeaddr = (uint32_t)0; if (fill_vnodeinfoforaddr(entry, vnodeaddr, vid) ==0) { vm_map_unlock_read(map); vm_map_deallocate(map); return(1); } } vm_map_unlock_read(map); vm_map_deallocate(map); return(1); }
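/*
 * Illustrative user-space counterpart (an assumption: on macOS this kernel
 * routine backs the PROC_PIDREGIONINFO flavor of proc_pidinfo()).  A
 * minimal sketch asking libproc for the region containing an address in
 * the current process; the fields printed mirror the pri_* members the
 * kernel fills in above.
 */
#include <libproc.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct proc_regioninfo ri;
	int some_local = 0;
	uint64_t addr = (uint64_t)&some_local;	/* look up the stack region */

	int ret = proc_pidinfo(getpid(), PROC_PIDREGIONINFO, addr,
	    &ri, sizeof(ri));
	if (ret != (int)sizeof(ri)) {
		perror("proc_pidinfo");
		return (1);
	}
	printf("region 0x%llx-0x%llx prot 0x%x share_mode %u\n",
	    (unsigned long long)ri.pri_address,
	    (unsigned long long)(ri.pri_address + ri.pri_size),
	    ri.pri_protection, ri.pri_share_mode);
	return (0);
}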
/* * mincore system call handler * * mincore_args(const void *addr, size_t len, char *vec) * * No requirements */ int sys_mincore(struct mincore_args *uap) { struct proc *p = curproc; vm_offset_t addr, first_addr; vm_offset_t end, cend; pmap_t pmap; vm_map_t map; char *vec; int error; int vecindex, lastvecindex; vm_map_entry_t current; vm_map_entry_t entry; int mincoreinfo; unsigned int timestamp; /* * Make sure that the addresses presented are valid for user * mode. */ first_addr = addr = trunc_page((vm_offset_t) uap->addr); end = addr + (vm_size_t)round_page(uap->len); if (end < addr) return (EINVAL); if (VM_MAX_USER_ADDRESS > 0 && end > VM_MAX_USER_ADDRESS) return (EINVAL); /* * Address of byte vector */ vec = uap->vec; map = &p->p_vmspace->vm_map; pmap = vmspace_pmap(p->p_vmspace); lwkt_gettoken(&map->token); vm_map_lock_read(map); RestartScan: timestamp = map->timestamp; if (!vm_map_lookup_entry(map, addr, &entry)) entry = entry->next; /* * Do this on a map entry basis so that if the pages are not * in the current processes address space, we can easily look * up the pages elsewhere. */ lastvecindex = -1; for(current = entry; (current != &map->header) && (current->start < end); current = current->next) { /* * ignore submaps (for now) or null objects */ if (current->maptype != VM_MAPTYPE_NORMAL && current->maptype != VM_MAPTYPE_VPAGETABLE) { continue; } if (current->object.vm_object == NULL) continue; /* * limit this scan to the current map entry and the * limits for the mincore call */ if (addr < current->start) addr = current->start; cend = current->end; if (cend > end) cend = end; /* * scan this entry one page at a time */ while (addr < cend) { /* * Check pmap first, it is likely faster, also * it can provide info as to whether we are the * one referencing or modifying the page. * * If we have to check the VM object, only mess * around with normal maps. Do not mess around * with virtual page tables (XXX). */ mincoreinfo = pmap_mincore(pmap, addr); if (mincoreinfo == 0 && current->maptype == VM_MAPTYPE_NORMAL) { vm_pindex_t pindex; vm_ooffset_t offset; vm_page_t m; /* * calculate the page index into the object */ offset = current->offset + (addr - current->start); pindex = OFF_TO_IDX(offset); /* * if the page is resident, then gather * information about it. spl protection is * required to maintain the object * association. And XXX what if the page is * busy? What's the deal with that? * * XXX vm_token - legacy for pmap_ts_referenced * in i386 and vkernel pmap code. */ lwkt_gettoken(&vm_token); vm_object_hold(current->object.vm_object); m = vm_page_lookup(current->object.vm_object, pindex); if (m && m->valid) { mincoreinfo = MINCORE_INCORE; if (m->dirty || pmap_is_modified(m)) mincoreinfo |= MINCORE_MODIFIED_OTHER; if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { vm_page_flag_set(m, PG_REFERENCED); mincoreinfo |= MINCORE_REFERENCED_OTHER; } } vm_object_drop(current->object.vm_object); lwkt_reltoken(&vm_token); } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * calculate index into user supplied byte vector */ vecindex = OFF_TO_IDX(addr - first_addr); /* * If we have skipped map entries, we need to make sure that * the byte vector is zeroed for those skipped entries. 
*/ while((lastvecindex + 1) < vecindex) { error = subyte( vec + lastvecindex, 0); if (error) { error = EFAULT; goto done; } ++lastvecindex; } /* * Pass the page information to the user */ error = subyte( vec + vecindex, mincoreinfo); if (error) { error = EFAULT; goto done; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; lastvecindex = vecindex; addr += PAGE_SIZE; } } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * Zero the last entries in the byte vector. */ vecindex = OFF_TO_IDX(end - first_addr); while((lastvecindex + 1) < vecindex) { error = subyte( vec + lastvecindex, 0); if (error) { error = EFAULT; goto done; } ++lastvecindex; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; vm_map_unlock_read(map); error = 0; done: lwkt_reltoken(&map->token); return (error); }
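/*
 * Illustrative user-space counterpart (not part of the kernel sources
 * above): a minimal sketch exercising mincore(2) against the handler
 * above -- map two anonymous pages, touch only the first, and check
 * which pages the kernel reports as resident.
 */
#include <sys/types.h>
#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	size_t pagesz = (size_t)sysconf(_SC_PAGESIZE);
	size_t len = 2 * pagesz;
	char vec[2];

	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return (EXIT_FAILURE);
	}
	p[0] = 1;			/* fault in the first page only */

	if (mincore(p, len, vec) == -1) {
		perror("mincore");
		return (EXIT_FAILURE);
	}
	printf("page 0: %s, page 1: %s\n",
	    (vec[0] & MINCORE_INCORE) ? "resident" : "not resident",
	    (vec[1] & MINCORE_INCORE) ? "resident" : "not resident");
	munmap(p, len);
	return (EXIT_SUCCESS);
}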
/*
 * msync system call handler
 *
 * msync_args(void *addr, size_t len, int flags)
 *
 * No requirements
 */
int
sys_msync(struct msync_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	map = &p->p_vmspace->vm_map;

	/*
	 * map->token serializes extracting the address range for size == 0
	 * msyncs with the vm_map_clean call; if the token were not held
	 * across the two calls, an intervening munmap/mmap pair, for example,
	 * could cause msync to occur on a wrong region.
	 */
	lwkt_gettoken(&map->token);

	/*
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages with the region containing addr".  Unfortunately, we don't
	 * really keep track of individual mmaps so we approximate by flushing
	 * the range of the map entry containing addr.  This can be incorrect
	 * if the region splits or is coalesced with a neighbor.
	 */
	if (size == 0) {
		vm_map_entry_t entry;

		vm_map_lock_read(map);
		rv = vm_map_lookup_entry(map, addr, &entry);
		if (rv == FALSE) {
			vm_map_unlock_read(map);
			rv = KERN_INVALID_ADDRESS;
			goto done;
		}
		addr = entry->start;
		size = entry->end - entry->start;
		vm_map_unlock_read(map);
	}

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_clean(map, addr, addr + size,
	    (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0);
done:
	lwkt_reltoken(&map->token);

	switch (rv) {
	case KERN_SUCCESS:
		break;
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}
	return (0);
}
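/*
 * Illustrative user-space counterpart (not part of the kernel sources
 * above): a minimal sketch that drives the msync handler above by
 * mapping a scratch file, dirtying a shared page, and flushing it
 * synchronously.  The file name is made up for the example.
 */
#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "msync_demo.dat";	/* hypothetical scratch file */
	size_t pagesz = (size_t)sysconf(_SC_PAGESIZE);

	int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644);
	if (fd == -1 || ftruncate(fd, (off_t)pagesz) == -1) {
		perror("open/ftruncate");
		return (EXIT_FAILURE);
	}
	char *p = mmap(NULL, pagesz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return (EXIT_FAILURE);
	}
	memcpy(p, "dirty data", 10);		/* dirty the shared page */

	/* MS_SYNC waits until the page has been written back to the file. */
	if (msync(p, pagesz, MS_SYNC) == -1) {
		perror("msync");
		return (EXIT_FAILURE);
	}
	munmap(p, pagesz);
	close(fd);
	return (EXIT_SUCCESS);
}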
/* ARGSUSED */ int sys_mincore(struct proc *p, void *v, register_t *retval) { struct sys_mincore_args /* { syscallarg(void *) addr; syscallarg(size_t) len; syscallarg(char *) vec; } */ *uap = v; vm_page_t m; char *vec, pgi; struct uvm_object *uobj; struct vm_amap *amap; struct vm_anon *anon; vm_map_entry_t entry; vaddr_t start, end, lim; vm_map_t map; vsize_t len, npgs; int error = 0; map = &p->p_vmspace->vm_map; start = (vaddr_t)SCARG(uap, addr); len = SCARG(uap, len); vec = SCARG(uap, vec); if (start & PAGE_MASK) return (EINVAL); len = round_page(len); end = start + len; if (end <= start) return (EINVAL); npgs = len >> PAGE_SHIFT; /* * Lock down vec, so our returned status isn't outdated by * storing the status byte for a page. */ if ((error = uvm_vslock(p, vec, npgs, VM_PROT_WRITE)) != 0) return (error); vm_map_lock_read(map); if (uvm_map_lookup_entry(map, start, &entry) == FALSE) { error = ENOMEM; goto out; } for (/* nothing */; entry != &map->header && entry->start < end; entry = entry->next) { KASSERT(!UVM_ET_ISSUBMAP(entry)); KASSERT(start >= entry->start); /* Make sure there are no holes. */ if (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end)) { error = ENOMEM; goto out; } lim = end < entry->end ? end : entry->end; /* * Special case for objects with no "real" pages. Those * are always considered resident (mapped devices). */ if (UVM_ET_ISOBJ(entry)) { KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)); if (entry->object.uvm_obj->pgops->pgo_releasepg == NULL) { pgi = 1; for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) copyout(&pgi, vec, sizeof(char)); continue; } } amap = entry->aref.ar_amap; /* top layer */ uobj = entry->object.uvm_obj; /* bottom layer */ if (uobj != NULL) simple_lock(&uobj->vmobjlock); for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) { pgi = 0; if (amap != NULL) { /* Check the top layer first. */ anon = amap_lookup(&entry->aref, start - entry->start); /* Don't need to lock anon here. */ if (anon != NULL && anon->an_page != NULL) { /* * Anon has the page for this entry * offset. */ pgi = 1; } } if (uobj != NULL && pgi == 0) { /* Check the bottom layer. */ m = uvm_pagelookup(uobj, entry->offset + (start - entry->start)); if (m != NULL) { /* * Object has the page for this entry * offset. */ pgi = 1; } } copyout(&pgi, vec, sizeof(char)); } if (uobj != NULL) simple_unlock(&uobj->vmobjlock); } out: vm_map_unlock_read(map); uvm_vsunlock(p, SCARG(uap, vec), npgs); return (error); }
int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { vm_prot_t prot; long ahead, behind; int alloc_req, era, faultcount, nera, reqpage, result; boolean_t growstack, is_first_object_locked, wired; int map_generation; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ_MAX]; int hardfault; struct faultstate fs; struct vnode *vp; int locked, error; hardfault = 0; growstack = TRUE; PCPU_INC(cnt.v_vm_faults); fs.vp = NULL; fs.vfslocked = 0; faultcount = reqpage = 0; RetryFault:; /* * Find the backing store object and offset into it to begin the * search. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { if (growstack && result == KERN_INVALID_ADDRESS && map != kernel_map) { result = vm_map_growstack(curproc, vaddr); if (result != KERN_SUCCESS) return (KERN_FAILURE); growstack = FALSE; goto RetryFault; } return (result); } map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { if ((curthread->td_pflags & TDP_DEVMEMIO) != 0) { vm_map_unlock_read(fs.map); return (KERN_FAILURE); } panic("vm_fault: fault on nofault entry, addr: %lx", (u_long)vaddr); } if (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION && fs.entry->wiring_thread != curthread) { vm_map_unlock_read(fs.map); vm_map_lock(fs.map); if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) && (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) { fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; vm_map_unlock_and_wait(fs.map, 0); } else vm_map_unlock(fs.map); goto RetryFault; } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. This must be done after * obtaining the vnode lock in order to avoid possible deadlocks. */ VM_OBJECT_LOCK(fs.first_object); vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.lookup_still_valid = TRUE; if (wired) fault_type = prot | (fault_type & VM_PROT_COPY); fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { /* * If the object is dead, we stop here */ if (fs.object->flags & OBJ_DEAD) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { /* * check for page-based copy on write. * We check fs.object == fs.first_object so * as to ensure the legacy COW mechanism is * used when the page in question is part of * a shadow object. Otherwise, vm_page_cowfault() * removes the page from the backing object, * which is not what we want. */ vm_page_lock(fs.m); if ((fs.m->cow) && (fault_type & VM_PROT_WRITE) && (fs.object == fs.first_object)) { vm_page_cowfault(fs.m); unlock_and_deallocate(&fs); goto RetryFault; } /* * Wait/Retry if the page is busy. We have to do this * if the page is busy via either VPO_BUSY or * vm_page_t->busy because the vm_pager may be using * vm_page_t->busy for pageouts ( and even pageins if * it is the vnode pager ), and we could end up trying * to pagein and pageout the same page simultaneously. 
* * We can theoretically allow the busy case on a read * fault if the page is marked valid, but since such * pages are typically already pmap'd, putting that * special case in might be more effort then it is * worth. We cannot under any circumstances mess * around with a vm_page_t->busy page except, perhaps, * to pmap it. */ if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(fs.m, PGA_REFERENCED); vm_page_unlock(fs.m); if (fs.object != fs.first_object) { if (!VM_OBJECT_TRYLOCK( fs.first_object)) { VM_OBJECT_UNLOCK(fs.object); VM_OBJECT_LOCK(fs.first_object); VM_OBJECT_LOCK(fs.object); } vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); vm_object_pip_wakeup(fs.first_object); VM_OBJECT_UNLOCK(fs.first_object); fs.first_m = NULL; } unlock_map(&fs); if (fs.m == vm_page_lookup(fs.object, fs.pindex)) { vm_page_sleep_if_busy(fs.m, TRUE, "vmpfw"); } vm_object_pip_wakeup(fs.object); VM_OBJECT_UNLOCK(fs.object); PCPU_INC(cnt.v_intrans); vm_object_deallocate(fs.first_object); goto RetryFault; } vm_pageq_remove(fs.m); vm_page_unlock(fs.m); /* * Mark page busy for other processes, and the * pagedaemon. If it still isn't completely valid * (readable), jump to readrest, else break-out ( we * found the page ). */ vm_page_busy(fs.m); if (fs.m->valid != VM_PAGE_BITS_ALL) goto readrest; break; } /* * Page is not resident, If this is the search termination * or the pager might contain the page, allocate a new page. */ if (TRYPAGER || fs.object == fs.first_object) { if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At * worst, the P_KILLED might be not observed * there, and allocation can fail, causing * restart and new reading of the p_flag. */ fs.m = NULL; if (!vm_page_count_severe() || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 if ((fs.object->flags & OBJ_COLORED) == 0) { fs.object->flags |= OBJ_COLORED; fs.object->pg_color = atop(vaddr) - fs.pindex; } #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs.object->type != OBJT_VNODE && fs.object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; fs.m = vm_page_alloc(fs.object, fs.pindex, alloc_req); } if (fs.m == NULL) { unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; } else if (fs.m->valid == VM_PAGE_BITS_ALL) break; } readrest: /* * We have found a valid page or we have allocated a new page. * The page thus may not be valid or may not be entirely * valid. * * Attempt to fault-in the page if there is a chance that the * pager has it, and potentially fault in additional pages * at the same time. */ if (TRYPAGER) { int rv; u_char behavior = vm_map_entry_behavior(fs.entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { behind = 0; ahead = atop(fs.entry->end - vaddr) - 1; if (ahead > VM_FAULT_READ_AHEAD_MAX) ahead = VM_FAULT_READ_AHEAD_MAX; if (fs.pindex == fs.entry->next_read) vm_fault_cache_behind(&fs, VM_FAULT_READ_MAX); } else { /* * If this is a sequential page fault, then * arithmetically increase the number of pages * in the read-ahead window. Otherwise, reset * the read-ahead window to its smallest size. 
*/ behind = atop(vaddr - fs.entry->start); if (behind > VM_FAULT_READ_BEHIND) behind = VM_FAULT_READ_BEHIND; ahead = atop(fs.entry->end - vaddr) - 1; era = fs.entry->read_ahead; if (fs.pindex == fs.entry->next_read) { nera = era + behind; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; behind = 0; if (ahead > nera) ahead = nera; if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_cache_behind(&fs, VM_FAULT_CACHE_BEHIND); } else if (ahead > VM_FAULT_READ_AHEAD_MIN) ahead = VM_FAULT_READ_AHEAD_MIN; if (era != ahead) fs.entry->read_ahead = ahead; } /* * Call the pager to retrieve the data, if any, after * releasing the lock on the map. We hold a ref on * fs.object and the pages are VPO_BUSY'd. */ unlock_map(&fs); vnode_lock: if (fs.object->type == OBJT_VNODE) { vp = fs.object->handle; if (vp == fs.vp) goto vnode_locked; else if (fs.vp != NULL) { vput(fs.vp); fs.vp = NULL; } locked = VOP_ISLOCKED(vp); if (VFS_NEEDSGIANT(vp->v_mount) && !fs.vfslocked) { fs.vfslocked = 1; if (!mtx_trylock(&Giant)) { VM_OBJECT_UNLOCK(fs.object); mtx_lock(&Giant); VM_OBJECT_LOCK(fs.object); goto vnode_lock; } } if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* Do not sleep for vnode lock while fs.m is busy */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error != 0) { int vfslocked; vfslocked = fs.vfslocked; fs.vfslocked = 0; /* Keep Giant */ vhold(vp); release_page(&fs); unlock_and_deallocate(&fs); error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); vdrop(vp); fs.vp = vp; fs.vfslocked = vfslocked; KASSERT(error == 0, ("vm_fault: vget failed")); goto RetryFault; } fs.vp = vp; } vnode_locked: KASSERT(fs.vp == NULL || !fs.map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * now we find out if any other pages should be paged * in at this time this routine checks to see if the * pages surrounding this fault reside in the same * object as the page for this fault. If they do, * then they are faulted in also into the object. The * array "marray" returned contains an array of * vm_page_t structs where one of them is the * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. * * fs.m plus the additional pages are VPO_BUSY'd. */ faultcount = vm_fault_additional_pages( fs.m, behind, ahead, marray, &reqpage); rv = faultcount ? vm_pager_get_pages(fs.object, marray, faultcount, reqpage) : VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. Leave it busy while we play * with it. */ /* * Relookup in case pager changed page. Pager * is responsible for disposition of old page * if moved. */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (!fs.m) { unlock_and_deallocate(&fs); goto RetryFault; } hardfault++; break; /* break to PAGE HAS BEEN FOUND */ } /* * Remove the bogus page (which does not exist at this * object/offset); before doing so, we must get back * our object lock to preserve our invariant. * * Also wake up any other process that may want to bring * in this page. * * If this is the top-level object, we must leave the * busy page to prevent another process from rushing * past us, and inserting the page in that object at * the same time that we are. */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * Data outside the range of the pager or an I/O error */ /* * XXX - the check for kernel_map is a kludge to work * around having the machine panic on a kernel space * fault w/ I/O error. 
*/ if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE); } if (fs.object != fs.first_object) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! */ } } /* * We get here if the object has default pager (or unwiring) * or the pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); VM_OBJECT_UNLOCK(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; VM_OBJECT_LOCK(fs.object); } fs.first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs.m->flags & PG_ZERO) == 0) { pmap_zero_page(fs.m); } else { PCPU_INC(cnt.v_ozfod); } PCPU_INC(cnt.v_zfod); fs.m->valid = VM_PAGE_BITS_ALL; break; /* break to PAGE HAS BEEN FOUND */ } else { KASSERT(fs.object != next_object, ("object loop %p", next_object)); VM_OBJECT_LOCK(next_object); vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); VM_OBJECT_UNLOCK(fs.object); fs.object = next_object; } } KASSERT((fs.m->oflags & VPO_BUSY) != 0, ("vm_fault: not busy after main loop")); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { /* * This allows pages to be virtually copied from a * backing_object into the first_object, where the * backing object has no other refs to it, and cannot * gain any more refs. Instead of a bcopy, we just * move the page from the backing object to the * first object. Note that we must mark the page * dirty in the first object so that it will go out * to swap when needed. */ is_first_object_locked = FALSE; if ( /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * No one else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ ((fs.object->type == OBJT_DEFAULT) || (fs.object->type == OBJT_SWAP)) && (is_first_object_locked = VM_OBJECT_TRYLOCK(fs.first_object)) && /* * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { /* * get rid of the unnecessary page */ vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); /* * grab the page and put it into the * process'es object. The page is * automatically made dirty. */ vm_page_lock(fs.m); vm_page_rename(fs.m, fs.first_object, fs.first_pindex); vm_page_unlock(fs.m); vm_page_busy(fs.m); fs.first_m = fs.m; fs.m = NULL; PCPU_INC(cnt.v_cow_optim); } else { /* * Oh, well, lets copy it. 
*/ pmap_copy_page(fs.m, fs.first_m); fs.first_m->valid = VM_PAGE_BITS_ALL; if (wired && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) { vm_page_lock(fs.first_m); vm_page_wire(fs.first_m); vm_page_unlock(fs.first_m); vm_page_lock(fs.m); vm_page_unwire(fs.m, FALSE); vm_page_unlock(fs.m); } /* * We no longer need the old page or object. */ release_page(&fs); } /* * fs.object != fs.first_object due to above * conditional */ vm_object_pip_wakeup(fs.object); VM_OBJECT_UNLOCK(fs.object); /* * Only use the new page below... */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; if (!is_first_object_locked) VM_OBJECT_LOCK(fs.object); PCPU_INC(cnt.v_cow_faults); curthread->td_cow++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } fs.lookup_still_valid = TRUE; if (fs.map->timestamp != map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); /* * If we don't need the page any longer, put it on the inactive * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) goto RetryFault; return (result); } if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } } /* * If the page was filled by a pager, update the map entry's * last read offset. Since the pager does not return the * actual set of pages that it read, this update is based on * the requested set. Typically, the requested and actual * sets are the same. * * XXX The following assignment modifies the map * without holding a write lock on it. */ if (hardfault) fs.entry->next_read = fs.pindex + faultcount - reqpage; if ((prot & VM_PROT_WRITE) != 0 || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_object_set_writeable_dirty(fs.object); /* * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if (fs.entry->eflags & MAP_ENTRY_NOSYNC) { if (fs.m->dirty == 0) fs.m->oflags |= VPO_NOSYNC; } else { fs.m->oflags &= ~VPO_NOSYNC; } /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also tell the backing pager, if any, that it should remove * any swap backing since the page is now dirty. 
*/ if (((fault_type & VM_PROT_WRITE) != 0 && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_page_dirty(fs.m); vm_pager_page_unswapped(fs.m); } } /* * Page had better still be busy */ KASSERT(fs.m->oflags & VPO_BUSY, ("vm_fault: page %p not busy!", fs.m)); /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ KASSERT(fs.m->valid == VM_PAGE_BITS_ALL, ("vm_fault: page %p partially invalid", fs.m)); VM_OBJECT_UNLOCK(fs.object); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired); if ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 && wired == 0) vm_fault_prefault(fs.map->pmap, vaddr, fs.entry); VM_OBJECT_LOCK(fs.object); vm_page_lock(fs.m); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if (fault_flags & VM_FAULT_CHANGE_WIRING) { if (wired) vm_page_wire(fs.m); else vm_page_unwire(fs.m, 1); } else vm_page_activate(fs.m); if (m_hold != NULL) { *m_hold = fs.m; vm_page_hold(fs.m); } vm_page_unlock(fs.m); vm_page_wakeup(fs.m); /* * Unlock everything, and return */ unlock_and_deallocate(&fs); if (hardfault) curthread->td_ru.ru_majflt++; else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); }
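/*
 * Illustrative user-space counterpart (not part of the kernel sources
 * above): a minimal sketch that provokes the fault path above by touching
 * freshly mapped anonymous pages, then reports the minor/major fault
 * counters that vm_fault_hold() increments just before returning.
 */
#include <sys/mman.h>
#include <sys/resource.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	size_t pagesz = (size_t)sysconf(_SC_PAGESIZE);
	size_t len = 64 * pagesz;
	struct rusage before, after;

	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return (EXIT_FAILURE);
	}
	getrusage(RUSAGE_SELF, &before);
	for (size_t off = 0; off < len; off += pagesz)
		p[off] = 1;		/* one zero-fill fault per page */
	getrusage(RUSAGE_SELF, &after);

	printf("minor faults: %ld, major faults: %ld\n",
	    after.ru_minflt - before.ru_minflt,
	    after.ru_majflt - before.ru_majflt);
	munmap(p, len);
	return (EXIT_SUCCESS);
}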
/* * The map entries can *almost* be read with programs like cat. However, * large maps need special programs to read. It is not easy to implement * a program that can sense the required size of the buffer, and then * subsequently do a read with the appropriate size. This operation cannot * be atomic. The best that we can do is to allow the program to do a read * with an arbitrarily large buffer, and return as much as we can. We can * return an error code if the buffer is too small (EFBIG), then the program * can try a bigger buffer. */ int procfs_domap(struct proc *curp, struct lwp *lp, struct pfsnode *pfs, struct uio *uio) { struct proc *p = lp->lwp_proc; int len; struct vnode *vp; char *fullpath, *freepath; int error; vm_map_t map = &p->p_vmspace->vm_map; pmap_t pmap = vmspace_pmap(p->p_vmspace); vm_map_entry_t entry; char mebuffer[MEBUFFERSIZE]; if (uio->uio_rw != UIO_READ) return (EOPNOTSUPP); if (uio->uio_offset != 0) return (0); error = 0; vm_map_lock_read(map); for (entry = map->header.next; ((uio->uio_resid > 0) && (entry != &map->header)); entry = entry->next) { vm_object_t obj, tobj, lobj; int ref_count, shadow_count, flags; vm_offset_t addr; vm_offset_t ostart; int resident, privateresident; char *type; if (entry->maptype != VM_MAPTYPE_NORMAL && entry->maptype != VM_MAPTYPE_VPAGETABLE) { continue; } obj = entry->object.vm_object; if (obj) vm_object_hold(obj); if (obj && (obj->shadow_count == 1)) privateresident = obj->resident_page_count; else privateresident = 0; /* * Use map->hint as a poor man's ripout detector. */ map->hint = entry; ostart = entry->start; /* * Count resident pages (XXX can be horrible on 64-bit) */ resident = 0; addr = entry->start; while (addr < entry->end) { if (pmap_extract(pmap, addr)) resident++; addr += PAGE_SIZE; } if (obj) { lobj = obj; while ((tobj = lobj->backing_object) != NULL) { KKASSERT(tobj != obj); vm_object_hold(tobj); if (tobj == lobj->backing_object) { if (lobj != obj) { vm_object_lock_swap(); vm_object_drop(lobj); } lobj = tobj; } else { vm_object_drop(tobj); } } } else { lobj = NULL; } freepath = NULL; fullpath = "-"; if (lobj) { switch(lobj->type) { default: case OBJT_DEFAULT: type = "default"; vp = NULL; break; case OBJT_VNODE: type = "vnode"; vp = lobj->handle; vref(vp); break; case OBJT_SWAP: type = "swap"; vp = NULL; break; case OBJT_DEVICE: type = "device"; vp = NULL; break; } flags = obj->flags; ref_count = obj->ref_count; shadow_count = obj->shadow_count; if (vp != NULL) { vn_fullpath(p, vp, &fullpath, &freepath, 1); vrele(vp); } if (lobj != obj) vm_object_drop(lobj); } else { type = "none"; flags = 0; ref_count = 0; shadow_count = 0; } /* * format: * start, end, res, priv res, cow, access, type, (fullpath). */ ksnprintf(mebuffer, sizeof(mebuffer), #if LONG_BIT == 64 "0x%016lx 0x%016lx %d %d %p %s%s%s %d %d " #else "0x%08lx 0x%08lx %d %d %p %s%s%s %d %d " #endif "0x%04x %s %s %s %s\n", (u_long)entry->start, (u_long)entry->end, resident, privateresident, obj, (entry->protection & VM_PROT_READ)?"r":"-", (entry->protection & VM_PROT_WRITE)?"w":"-", (entry->protection & VM_PROT_EXECUTE)?"x":"-", ref_count, shadow_count, flags, (entry->eflags & MAP_ENTRY_COW)?"COW":"NCOW", (entry->eflags & MAP_ENTRY_NEEDS_COPY)?"NC":"NNC", type, fullpath); if (obj) vm_object_drop(obj); if (freepath != NULL) { kfree(freepath, M_TEMP); freepath = NULL; } len = strlen(mebuffer); if (len > uio->uio_resid) { error = EFBIG; break; } /* * We cannot safely hold the map locked while accessing * userspace as a VM fault might recurse the locked map. 
*/ vm_map_unlock_read(map); error = uiomove(mebuffer, len, uio); vm_map_lock_read(map); if (error) break; /* * We use map->hint as a poor man's ripout detector. If * it does not match the entry we set it to prior to * unlocking the map the entry MIGHT now be stale. In * this case we do an expensive lookup to find our place * in the iteration again. */ if (map->hint != entry) { vm_map_entry_t reentry; vm_map_lookup_entry(map, ostart, &reentry); entry = reentry; } } vm_map_unlock_read(map); return error; }
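/*
 * Illustrative user-space counterpart (not part of the kernel sources
 * above): per the comment at the top of procfs_domap(), the map file is
 * meant to be consumed in a single large read -- a second read at a
 * nonzero offset returns EOF, and EFBIG means the buffer was too small.
 * Assumes procfs is mounted on /proc; "/proc/curproc/map" follows this
 * procfs's naming.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	size_t bufsize = 1024 * 1024;	/* arbitrarily large, per the comment */
	char *buf = malloc(bufsize);
	ssize_t n;

	int fd = open("/proc/curproc/map", O_RDONLY);
	if (fd == -1 || buf == NULL) {
		perror("open/malloc");
		return (EXIT_FAILURE);
	}
	n = read(fd, buf, bufsize);
	if (n == -1) {
		/* EFBIG: retry with a bigger buffer. */
		perror("read");
		return (EXIT_FAILURE);
	}
	fwrite(buf, 1, (size_t)n, stdout);
	free(buf);
	close(fd);
	return (EXIT_SUCCESS);
}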
int
sys___msync13(struct lwp *l, const struct sys___msync13_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) flags;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	int error, flags, uvmflags;
	bool rv;

	/*
	 * extract syscall args from the uap
	 */
	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	flags = SCARG(uap, flags);

	/* sanity check flags */
	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
		return EINVAL;
	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
		flags |= MS_SYNC;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	/*
	 * get map
	 */
	map = &p->p_vmspace->vm_map;

	error = range_test(map, addr, size, false);
	if (error)
		return ENOMEM;

	/*
	 * XXXCDC: do we really need this semantic?
	 *
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages with the region containing addr".  Unfortunately, we
	 * don't really keep track of individual mmaps so we approximate
	 * by flushing the range of the map entry containing addr.
	 * This can be incorrect if the region splits or is coalesced
	 * with a neighbor.
	 */
	if (size == 0) {
		struct vm_map_entry *entry;

		vm_map_lock_read(map);
		rv = uvm_map_lookup_entry(map, addr, &entry);
		if (rv == true) {
			addr = entry->start;
			size = entry->end - entry->start;
		}
		vm_map_unlock_read(map);
		if (rv == false)
			return EINVAL;
	}

	/*
	 * translate MS_ flags into PGO_ flags
	 */
	uvmflags = PGO_CLEANIT;
	if (flags & MS_INVALIDATE)
		uvmflags |= PGO_FREE;
	if (flags & MS_SYNC)
		uvmflags |= PGO_SYNCIO;

	error = uvm_map_clean(map, addr, addr+size, uvmflags);
	return error;
}
/* ARGSUSED */ int sys_mincore(struct lwp *l, const struct sys_mincore_args *uap, register_t *retval) { /* { syscallarg(void *) addr; syscallarg(size_t) len; syscallarg(char *) vec; } */ struct proc *p = l->l_proc; struct vm_page *pg; char *vec, pgi; struct uvm_object *uobj; struct vm_amap *amap; struct vm_anon *anon; struct vm_map_entry *entry; vaddr_t start, end, lim; struct vm_map *map; vsize_t len; int error = 0, npgs; map = &p->p_vmspace->vm_map; start = (vaddr_t)SCARG(uap, addr); len = SCARG(uap, len); vec = SCARG(uap, vec); if (start & PAGE_MASK) return EINVAL; len = round_page(len); end = start + len; if (end <= start) return EINVAL; /* * Lock down vec, so our returned status isn't outdated by * storing the status byte for a page. */ npgs = len >> PAGE_SHIFT; error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE); if (error) { return error; } vm_map_lock_read(map); if (uvm_map_lookup_entry(map, start, &entry) == false) { error = ENOMEM; goto out; } for (/* nothing */; entry != &map->header && entry->start < end; entry = entry->next) { KASSERT(!UVM_ET_ISSUBMAP(entry)); KASSERT(start >= entry->start); /* Make sure there are no holes. */ if (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end)) { error = ENOMEM; goto out; } lim = end < entry->end ? end : entry->end; /* * Special case for objects with no "real" pages. Those * are always considered resident (mapped devices). */ if (UVM_ET_ISOBJ(entry)) { KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)); if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) { for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) subyte(vec, 1); continue; } } amap = entry->aref.ar_amap; /* upper layer */ uobj = entry->object.uvm_obj; /* lower layer */ if (amap != NULL) amap_lock(amap); if (uobj != NULL) mutex_enter(uobj->vmobjlock); for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) { pgi = 0; if (amap != NULL) { /* Check the upper layer first. */ anon = amap_lookup(&entry->aref, start - entry->start); /* Don't need to lock anon here. */ if (anon != NULL && anon->an_page != NULL) { /* * Anon has the page for this entry * offset. */ pgi = 1; } } if (uobj != NULL && pgi == 0) { /* Check the lower layer. */ pg = uvm_pagelookup(uobj, entry->offset + (start - entry->start)); if (pg != NULL) { /* * Object has the page for this entry * offset. */ pgi = 1; } } (void) subyte(vec, pgi); } if (uobj != NULL) mutex_exit(uobj->vmobjlock); if (amap != NULL) amap_unlock(amap); } out: vm_map_unlock_read(map); uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs); return error; }
/*
 * Original vm_pageout_oom, will be called if LRU pageout_oom fails.
 */
static void
original_vm_pageout_oom(int shortage)
{
	struct proc *p, *bigproc;
	vm_offset_t size, bigsize;
	struct thread *td;
	struct vmspace *vm;

	/*
	 * We keep the process bigproc locked once we find it to keep anyone
	 * from messing with it; however, there is a possibility of
	 * deadlock if process B is bigproc and one of its child processes
	 * attempts to propagate a signal to B while we are waiting for A's
	 * lock while walking this list.  To avoid this, we don't block on
	 * the process lock but just skip a process if it is already locked.
	 */
	bigproc = NULL;
	bigsize = 0;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		int breakout;

		if (PROC_TRYLOCK(p) == 0)
			continue;
		/*
		 * If this is a system, protected or killed process, skip it.
		 */
		if (p->p_state != PRS_NORMAL ||
		    (p->p_flag & (P_INEXEC | P_PROTECTED | P_SYSTEM)) ||
		    (p->p_pid == 1) || P_KILLED(p) ||
		    ((p->p_pid < 48) && (swap_pager_avail != 0))) {
			PROC_UNLOCK(p);
			continue;
		}
		/*
		 * If the process is in a non-running type state,
		 * don't touch it.  Check all the threads individually.
		 */
		breakout = 0;
		FOREACH_THREAD_IN_PROC(p, td) {
			thread_lock(td);
			if (!TD_ON_RUNQ(td) &&
			    !TD_IS_RUNNING(td) &&
			    !TD_IS_SLEEPING(td)) {
				thread_unlock(td);
				breakout = 1;
				break;
			}
			thread_unlock(td);
		}
		if (breakout) {
			PROC_UNLOCK(p);
			continue;
		}
		/*
		 * get the process size
		 */
		vm = vmspace_acquire_ref(p);
		if (vm == NULL) {
			PROC_UNLOCK(p);
			continue;
		}
		if (!vm_map_trylock_read(&vm->vm_map)) {
			vmspace_free(vm);
			PROC_UNLOCK(p);
			continue;
		}
		size = vmspace_swap_count(vm);
		vm_map_unlock_read(&vm->vm_map);
		if (shortage == VM_OOM_MEM)
			size += vmspace_resident_count(vm);
		vmspace_free(vm);
		/*
		 * If this process is bigger than the biggest one,
		 * remember it.
		 */
		if (size > bigsize) {
			if (bigproc != NULL)
				PROC_UNLOCK(bigproc);
			bigproc = p;
			bigsize = size;
		} else
			PROC_UNLOCK(p);
	}
/* * The map entries can *almost* be read with programs like cat. However, * large maps need special programs to read. It is not easy to implement * a program that can sense the required size of the buffer, and then * subsequently do a read with the appropriate size. This operation cannot * be atomic. The best that we can do is to allow the program to do a read * with an arbitrarily large buffer, and return as much as we can. We can * return an error code if the buffer is too small (EFBIG), then the program * can try a bigger buffer. */ int procfs_domap(struct lwp *curl, struct proc *p, struct pfsnode *pfs, struct uio *uio, int linuxmode) { int error; struct vmspace *vm; struct vm_map *map; struct vm_map_entry *entry; char *buffer = NULL; size_t bufsize = BUFFERSIZE; char *path; struct vnode *vp; struct vattr va; dev_t dev; long fileid; size_t pos; int width = (int)((curl->l_proc->p_flag & PK_32) ? sizeof(int32_t) : sizeof(void *)) * 2; if (uio->uio_rw != UIO_READ) return EOPNOTSUPP; if (uio->uio_offset != 0) { /* * we return 0 here, so that the second read returns EOF * we don't support reading from an offset because the * map could have changed between the two reads. */ return 0; } error = 0; if (linuxmode != 0) path = malloc(MAXPATHLEN * 4, M_TEMP, M_WAITOK); else path = NULL; if ((error = proc_vmspace_getref(p, &vm)) != 0) goto out; map = &vm->vm_map; vm_map_lock_read(map); again: buffer = malloc(bufsize, M_TEMP, M_WAITOK); pos = 0; for (entry = map->header.next; entry != &map->header; entry = entry->next) { if (UVM_ET_ISSUBMAP(entry)) continue; if (linuxmode != 0) { *path = 0; dev = (dev_t)0; fileid = 0; if (UVM_ET_ISOBJ(entry) && UVM_OBJ_IS_VNODE(entry->object.uvm_obj)) { vp = (struct vnode *)entry->object.uvm_obj; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, curl->l_cred); VOP_UNLOCK(vp); if (error == 0 && vp != pfs->pfs_vnode) { fileid = va.va_fileid; dev = va.va_fsid; error = vnode_to_path(path, MAXPATHLEN * 4, vp, curl, p); } } pos += snprintf(buffer + pos, bufsize - pos, "%.*"PRIxVADDR"-%.*"PRIxVADDR" %c%c%c%c " "%.*lx %.2llx:%.2llx %-8ld %25.s %s\n", width, entry->start, width, entry->end, (entry->protection & VM_PROT_READ) ? 'r' : '-', (entry->protection & VM_PROT_WRITE) ? 'w' : '-', (entry->protection & VM_PROT_EXECUTE) ? 'x' : '-', (entry->etype & UVM_ET_COPYONWRITE) ? 'p' : 's', width, (unsigned long)entry->offset, (unsigned long long)major(dev), (unsigned long long)minor(dev), fileid, "", path); } else { pos += snprintf(buffer + pos, bufsize - pos, "%#"PRIxVADDR" %#"PRIxVADDR" " "%c%c%c %c%c%c %s %s %d %d %d\n", entry->start, entry->end, (entry->protection & VM_PROT_READ) ? 'r' : '-', (entry->protection & VM_PROT_WRITE) ? 'w' : '-', (entry->protection & VM_PROT_EXECUTE) ? 'x' : '-', (entry->max_protection & VM_PROT_READ) ? 'r' : '-', (entry->max_protection & VM_PROT_WRITE) ? 'w' : '-', (entry->max_protection & VM_PROT_EXECUTE) ? 'x' : '-', (entry->etype & UVM_ET_COPYONWRITE) ? "COW" : "NCOW", (entry->etype & UVM_ET_NEEDSCOPY) ? "NC" : "NNC", entry->inheritance, entry->wired_count, entry->advice); } if (pos >= bufsize) { bufsize <<= 1; if (bufsize > MAXBUFFERSIZE) { error = ENOMEM; vm_map_unlock_read(map); uvmspace_free(vm); goto out; } free(buffer, M_TEMP); goto again; } } vm_map_unlock_read(map); uvmspace_free(vm); error = uiomove(buffer, pos, uio); out: if (path != NULL) free(path, M_TEMP); if (buffer != NULL) free(buffer, M_TEMP); return error; }
int
sys_msync(struct proc *p, void *v, register_t *retval)
{
	struct sys_msync_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) flags;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_map_t map;
	int rv, flags, uvmflags;

	/*
	 * extract syscall args from the uap
	 */
	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	flags = SCARG(uap, flags);

	/* sanity check flags */
	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
		return (EINVAL);
	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
		flags |= MS_SYNC;

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return (EINVAL);		/* disallow wrap-around. */

	/*
	 * get map
	 */
	map = &p->p_vmspace->vm_map;

	/*
	 * XXXCDC: do we really need this semantic?
	 *
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages with the region containing addr".  Unfortunately, we
	 * don't really keep track of individual mmaps so we approximate
	 * by flushing the range of the map entry containing addr.
	 * This can be incorrect if the region splits or is coalesced
	 * with a neighbor.
	 */
	if (size == 0) {
		vm_map_entry_t entry;

		vm_map_lock_read(map);
		rv = uvm_map_lookup_entry(map, addr, &entry);
		if (rv == TRUE) {
			addr = entry->start;
			size = entry->end - entry->start;
		}
		vm_map_unlock_read(map);
		if (rv == FALSE)
			return (EINVAL);
	}

	/*
	 * translate MS_ flags into PGO_ flags
	 */
	uvmflags = PGO_CLEANIT;
	if (flags & MS_INVALIDATE)
		uvmflags |= PGO_FREE;
	if (flags & MS_SYNC)
		uvmflags |= PGO_SYNCIO;
	else
		uvmflags |= PGO_SYNCIO;	/* XXXCDC: force sync for now! */

	return (uvm_map_clean(map, addr, addr+size, uvmflags));
}
void debugger_getprocs_callback(struct allocation_t* ref)
{
	if (!ref)
		return;

	struct message_t* message = __get(ref);
	if (!message)
		return;

	// Only handle requests
	if (message->header.request != 1)
		goto cleanup;

	int(*_sx_slock)(struct sx *sx, int opts, const char *file, int line) = kdlsym(_sx_slock);
	void(*_sx_sunlock)(struct sx *sx, const char *file, int line) = kdlsym(_sx_sunlock);
	void(*_mtx_lock_flags)(struct mtx *m, int opts, const char *file, int line) = kdlsym(_mtx_lock_flags);
	void(*_mtx_unlock_flags)(struct mtx *m, int opts, const char *file, int line) = kdlsym(_mtx_unlock_flags);
	struct sx* allproclock = (struct sx*)kdlsym(allproc_lock);
	struct proclist* allproc = (struct proclist*)*(uint64_t*)kdlsym(allproc);

	void(*vmspace_free)(struct vmspace *) = kdlsym(vmspace_free);
	struct vmspace* (*vmspace_acquire_ref)(struct proc *) = kdlsym(vmspace_acquire_ref);
	void(*_vm_map_lock_read)(vm_map_t map, const char *file, int line) = kdlsym(_vm_map_lock_read);
	void(*_vm_map_unlock_read)(vm_map_t map, const char *file, int line) = kdlsym(_vm_map_unlock_read);

	uint64_t procCount = 0;
	struct proc* p = NULL;
	struct debugger_getprocs_t getproc = { 0 };

	sx_slock(allproclock);
	FOREACH_PROC_IN_SYSTEM(p)
	{
		PROC_LOCK(p);
		// Zero out our process information
		kmemset(&getproc, 0, sizeof(getproc));

		// Get the vm map
		struct vmspace* vm = vmspace_acquire_ref(p);
		vm_map_t map = &p->p_vmspace->vm_map;
		vm_map_lock_read(map);

		struct vm_map_entry* entry = map->header.next;

		// Copy over all of the address information
		getproc.process_id = p->p_pid;
		getproc.text_address = (uint64_t)entry->start;
		getproc.text_size = (uint64_t)entry->end - entry->start;
		getproc.data_address = (uint64_t)p->p_vmspace->vm_daddr;
		getproc.data_size = p->p_vmspace->vm_dsize;

		// Copy over the name and path
		kmemcpy(getproc.process_name, p->p_comm, sizeof(getproc.process_name));
		kmemcpy(getproc.path, p->p_elfpath, sizeof(getproc.path));

		// Write it back to the PC
		kwrite(message->socket, &getproc, sizeof(getproc));
		procCount++;

		// Free the vmmap
		vm_map_unlock_read(map);
		vmspace_free(vm);

		PROC_UNLOCK(p);
	}
	sx_sunlock(allproclock);

	// Send finalizer, because f**k this shit
	kmemset(&getproc, 0xDD, sizeof(getproc));
	kwrite(message->socket, &getproc, sizeof(getproc));

cleanup:
	__dec(ref);
}
/* * The map entries can *almost* be read with programs like cat. However, * large maps need special programs to read. It is not easy to implement * a program that can sense the required size of the buffer, and then * subsequently do a read with the appropriate size. This operation cannot * be atomic. The best that we can do is to allow the program to do a read * with an arbitrarily large buffer, and return as much as we can. We can * return an error code if the buffer is too small (EFBIG), then the program * can try a bigger buffer. */ int procfs_doprocmap(PFS_FILL_ARGS) { struct vmspace *vm; vm_map_t map; vm_map_entry_t entry, tmp_entry; struct vnode *vp; char *fullpath, *freepath; struct uidinfo *uip; int error, vfslocked; unsigned int last_timestamp; #ifdef COMPAT_FREEBSD32 int wrap32 = 0; #endif PROC_LOCK(p); error = p_candebug(td, p); PROC_UNLOCK(p); if (error) return (error); if (uio->uio_rw != UIO_READ) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (curproc->p_sysent->sv_flags & SV_ILP32) { if (!(p->p_sysent->sv_flags & SV_ILP32)) return (EOPNOTSUPP); wrap32 = 1; } #endif vm = vmspace_acquire_ref(p); if (vm == NULL) return (ESRCH); map = &vm->vm_map; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { vm_object_t obj, tobj, lobj; int ref_count, shadow_count, flags; vm_offset_t e_start, e_end, addr; int resident, privateresident; char *type; vm_eflags_t e_eflags; vm_prot_t e_prot; if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; e_eflags = entry->eflags; e_prot = entry->protection; e_start = entry->start; e_end = entry->end; privateresident = 0; obj = entry->object.vm_object; if (obj != NULL) { VM_OBJECT_LOCK(obj); if (obj->shadow_count == 1) privateresident = obj->resident_page_count; } uip = (entry->uip) ? entry->uip : (obj ? obj->uip : NULL); resident = 0; addr = entry->start; while (addr < entry->end) { if (pmap_extract(map->pmap, addr)) resident++; addr += PAGE_SIZE; } for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) { if (tobj != obj) VM_OBJECT_LOCK(tobj); if (lobj != obj) VM_OBJECT_UNLOCK(lobj); lobj = tobj; } last_timestamp = map->timestamp; vm_map_unlock_read(map); freepath = NULL; fullpath = "-"; if (lobj) { switch (lobj->type) { default: case OBJT_DEFAULT: type = "default"; vp = NULL; break; case OBJT_VNODE: type = "vnode"; vp = lobj->handle; vref(vp); break; case OBJT_SWAP: type = "swap"; vp = NULL; break; case OBJT_SG: case OBJT_DEVICE: type = "device"; vp = NULL; break; } if (lobj != obj) VM_OBJECT_UNLOCK(lobj); flags = obj->flags; ref_count = obj->ref_count; shadow_count = obj->shadow_count; VM_OBJECT_UNLOCK(obj); if (vp != NULL) { vn_fullpath(td, vp, &fullpath, &freepath); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } } else { type = "none"; flags = 0; ref_count = 0; shadow_count = 0; } /* * format: * start, end, resident, private resident, cow, access, type, * charged, charged uid. */ error = sbuf_printf(sb, "0x%lx 0x%lx %d %d %p %s%s%s %d %d 0x%x %s %s %s %s %s %d\n", (u_long)e_start, (u_long)e_end, resident, privateresident, #ifdef COMPAT_FREEBSD32 wrap32 ? NULL : obj, /* Hide 64 bit value */ #else obj, #endif (e_prot & VM_PROT_READ)?"r":"-", (e_prot & VM_PROT_WRITE)?"w":"-", (e_prot & VM_PROT_EXECUTE)?"x":"-", ref_count, shadow_count, flags, (e_eflags & MAP_ENTRY_COW)?"COW":"NCOW", (e_eflags & MAP_ENTRY_NEEDS_COPY)?"NC":"NNC", type, fullpath, uip ? "CH":"NCH", uip ? 
uip->ui_uid : -1); if (freepath != NULL) free(freepath, M_TEMP); vm_map_lock_read(map); if (error == -1) { error = 0; break; } if (last_timestamp != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. Specifically, * the entry may have been clipped, merged, or deleted. */ vm_map_lookup_entry(map, e_end - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); vmspace_free(vm); return (error); }
kern_return_t vm32_region_info_64( __DEBUG_ONLY vm_map_t map, __DEBUG_ONLY vm32_offset_t address, __DEBUG_ONLY vm_info_region_64_t *regionp, __DEBUG_ONLY vm_info_object_array_t *objectsp, __DEBUG_ONLY mach_msg_type_number_t *objectsCntp) { #if !MACH_VM_DEBUG return KERN_FAILURE; #else vm_map_copy_t copy; vm_offset_t addr = 0; /* memory for OOL data */ vm_size_t size; /* size of the memory */ unsigned int room; /* room for this many objects */ unsigned int used; /* actually this many objects */ vm_info_region_64_t region; kern_return_t kr; if (map == VM_MAP_NULL) return KERN_INVALID_TASK; size = 0; /* no memory allocated yet */ for (;;) { vm_map_t cmap; /* current map in traversal */ vm_map_t nmap; /* next map to look at */ vm_map_entry_t entry; vm_object_t object, cobject, nobject; /* nothing is locked */ vm_map_lock_read(map); for (cmap = map;; cmap = nmap) { /* cmap is read-locked */ if (!vm_map_lookup_entry(cmap, address, &entry)) { entry = entry->vme_next; if (entry == vm_map_to_entry(cmap)) { vm_map_unlock_read(cmap); if (size != 0) kmem_free(ipc_kernel_map, addr, size); return KERN_NO_SPACE; } } if (entry->is_sub_map) nmap = VME_SUBMAP(entry); else break; /* move down to the lower map */ vm_map_lock_read(nmap); vm_map_unlock_read(cmap); } /* cmap is read-locked; we have a real entry */ object = VME_OBJECT(entry); region.vir_start = (natural_t) entry->vme_start; region.vir_end = (natural_t) entry->vme_end; region.vir_object = (natural_t)(uintptr_t) object; region.vir_offset = VME_OFFSET(entry); region.vir_needs_copy = entry->needs_copy; region.vir_protection = entry->protection; region.vir_max_protection = entry->max_protection; region.vir_inheritance = entry->inheritance; region.vir_wired_count = entry->wired_count; region.vir_user_wired_count = entry->user_wired_count; used = 0; room = (unsigned int) (size / sizeof(vm_info_object_t)); if (object == VM_OBJECT_NULL) { vm_map_unlock_read(cmap); /* no memory needed */ break; } vm_object_lock(object); vm_map_unlock_read(cmap); for (cobject = object;; cobject = nobject) { /* cobject is locked */ if (used < room) { vm_info_object_t *vio = &((vm_info_object_t *) addr)[used]; vio->vio_object = (natural_t)(uintptr_t) cobject; vio->vio_size = (natural_t) cobject->vo_size; vio->vio_ref_count = cobject->ref_count; vio->vio_resident_page_count = cobject->resident_page_count; vio->vio_copy = (natural_t)(uintptr_t) cobject->copy; vio->vio_shadow = (natural_t)(uintptr_t) cobject->shadow; vio->vio_shadow_offset = (natural_t) cobject->vo_shadow_offset; vio->vio_paging_offset = (natural_t) cobject->paging_offset; vio->vio_copy_strategy = cobject->copy_strategy; vio->vio_last_alloc = (vm_offset_t) cobject->last_alloc; vio->vio_paging_in_progress = cobject->paging_in_progress + cobject->activity_in_progress; vio->vio_pager_created = cobject->pager_created; vio->vio_pager_initialized = cobject->pager_initialized; vio->vio_pager_ready = cobject->pager_ready; vio->vio_can_persist = cobject->can_persist; vio->vio_internal = cobject->internal; vio->vio_temporary = cobject->temporary; vio->vio_alive = cobject->alive; vio->vio_purgable = (cobject->purgable != VM_PURGABLE_DENY); vio->vio_purgable_volatile = (cobject->purgable == VM_PURGABLE_VOLATILE || cobject->purgable == VM_PURGABLE_EMPTY); } used++; nobject = cobject->shadow; if (nobject == VM_OBJECT_NULL) { vm_object_unlock(cobject); break; } vm_object_lock(nobject); vm_object_unlock(cobject); } /* nothing locked */ if (used <= room) break; /* must allocate more memory */ if (size != 0) 
kmem_free(ipc_kernel_map, addr, size); size = vm_map_round_page(2 * used * sizeof(vm_info_object_t), VM_MAP_PAGE_MASK(ipc_kernel_map)); kr = vm_allocate(ipc_kernel_map, &addr, size, VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_IPC)); if (kr != KERN_SUCCESS) return KERN_RESOURCE_SHORTAGE; kr = vm_map_wire( ipc_kernel_map, vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(ipc_kernel_map)), vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(ipc_kernel_map)), VM_PROT_READ|VM_PROT_WRITE, FALSE); assert(kr == KERN_SUCCESS); } /* free excess memory; make remaining memory pageable */ if (used == 0) { copy = VM_MAP_COPY_NULL; if (size != 0) kmem_free(ipc_kernel_map, addr, size); } else { vm_size_t size_used = (used * sizeof(vm_info_object_t)); vm_size_t vmsize_used = vm_map_round_page(size_used, VM_MAP_PAGE_MASK(ipc_kernel_map)); kr = vm_map_unwire( ipc_kernel_map, vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(ipc_kernel_map)), vm_map_round_page(addr + size_used, VM_MAP_PAGE_MASK(ipc_kernel_map)), FALSE); assert(kr == KERN_SUCCESS); kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)addr, (vm_map_size_t)size_used, TRUE, &copy); assert(kr == KERN_SUCCESS); if (size != vmsize_used) kmem_free(ipc_kernel_map, addr + vmsize_used, size - vmsize_used); } *regionp = region; *objectsp = (vm_info_object_array_t) copy; *objectsCntp = used; return KERN_SUCCESS; #endif /* MACH_VM_DEBUG */ }
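/*
 * Illustrative user-space counterpart (not part of the kernel sources
 * above): vm32_region_info_64() is only built with MACH_VM_DEBUG, so this
 * sketch uses the generally available mach_vm_region() call instead; it
 * reports similar per-entry data (start, size, protections) for the
 * caller's own task.
 */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <mach/mach_error.h>
#include <stdio.h>

int
main(void)
{
	mach_vm_address_t addr = 0;
	mach_vm_size_t size = 0;
	vm_region_basic_info_data_64_t info;
	mach_msg_type_number_t count = VM_REGION_BASIC_INFO_COUNT_64;
	mach_port_t object_name = MACH_PORT_NULL;

	/* Find the first mapped region at or above address 0 in this task. */
	kern_return_t kr = mach_vm_region(mach_task_self(), &addr, &size,
	    VM_REGION_BASIC_INFO_64, (vm_region_info_t)&info, &count,
	    &object_name);
	if (kr != KERN_SUCCESS) {
		fprintf(stderr, "mach_vm_region: %s\n", mach_error_string(kr));
		return (1);
	}
	printf("region 0x%llx-0x%llx prot 0x%x maxprot 0x%x\n",
	    (unsigned long long)addr, (unsigned long long)(addr + size),
	    info.protection, info.max_protection);
	return (0);
}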
/*
 * The caller must hold proc_token.
 */
static int
do_vmtotal_callback(struct proc *p, void *data)
{
	struct vmtotal *totalp = data;
	struct lwp *lp;
	vm_map_entry_t entry;
	vm_map_t map;
	int paging;

	if (p->p_flag & P_SYSTEM)
		return(0);

	FOREACH_LWP_IN_PROC(lp, p) {
		switch (lp->lwp_stat) {
		case LSSTOP:
		case LSSLEEP:
			if ((p->p_flag & P_SWAPPEDOUT) == 0) {
				if ((lp->lwp_flag & LWP_SINTR) == 0)
					totalp->t_dw++;
				else if (lp->lwp_slptime < maxslp)
					totalp->t_sl++;
			} else if (lp->lwp_slptime < maxslp) {
				totalp->t_sw++;
			}
			if (lp->lwp_slptime >= maxslp)
				return(0);
			break;

		case LSRUN:
			if (p->p_flag & P_SWAPPEDOUT)
				totalp->t_sw++;
			else
				totalp->t_rq++;
			if (p->p_stat == SIDL)
				return(0);
			break;

		default:
			return (0);
		}
	}

	/*
	 * Note active objects.
	 */
	paging = 0;
	lwkt_gettoken(&vm_token);
	if (p->p_vmspace) {
		map = &p->p_vmspace->vm_map;
		vm_map_lock_read(map);
		for (entry = map->header.next;
		    entry != &map->header; entry = entry->next) {
			if (entry->maptype != VM_MAPTYPE_NORMAL &&
			    entry->maptype != VM_MAPTYPE_VPAGETABLE) {
				continue;
			}
			if (entry->object.vm_object == NULL)
				continue;
			vm_object_set_flag(entry->object.vm_object, OBJ_ACTIVE);
			paging |= entry->object.vm_object->paging_in_progress;
		}
		vm_map_unlock_read(map);
	}
	lwkt_reltoken(&vm_token);
	if (paging)
		totalp->t_pw++;
	return(0);
}
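/*
 * Illustrative user-space counterpart (not part of the kernel sources
 * above): the per-process callback above accumulates into a struct
 * vmtotal that is exported through sysctl.  The OID name "vm.vmtotal"
 * is the conventional one but is an assumption here -- the registration
 * is not shown in the snippet.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct vmtotal t;
	size_t len = sizeof(t);

	if (sysctlbyname("vm.vmtotal", &t, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (EXIT_FAILURE);
	}
	/* Counters filled in by callbacks like do_vmtotal_callback(). */
	printf("runnable %d, disk wait %d, sleeping %d, swapped %d, "
	    "pages wanted %d\n",
	    (int)t.t_rq, (int)t.t_dw, (int)t.t_sl, (int)t.t_sw, (int)t.t_pw);
	return (EXIT_SUCCESS);
}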