/* * kmap_alloc_wait: * * Allocates pageable memory from a sub-map of the kernel. If the submap * has no room, the caller sleeps waiting for more memory in the submap. * * This routine may block. */ vm_offset_t kmap_alloc_wait(vm_map_t map, vm_size_t size) { vm_offset_t addr; size = round_page(size); if (!swap_reserve(size)) return (0); for (;;) { /* * To make this work for more than one map, use the map's lock * to lock out sleepers/wakers. */ vm_map_lock(map); if (vm_map_findspace(map, vm_map_min(map), size, &addr) == 0) break; /* no space now; see if we can ever get space */ if (vm_map_max(map) - vm_map_min(map) < size) { vm_map_unlock(map); swap_release(size); return (0); } map->needs_wakeup = TRUE; vm_map_unlock_and_wait(map, 0); } vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, MAP_ACC_CHARGED); vm_map_unlock(map); return (addr); }
int get_vmmap_entries( vm_map_t map) { int total_entries = 0; vm_map_entry_t entry; if (not_in_kdp) vm_map_lock(map); entry = vm_map_first_entry(map); while(entry != vm_map_to_entry(map)) { if(entry->is_sub_map) { total_entries += get_vmsubmap_entries(entry->object.sub_map, entry->offset, entry->offset + (entry->vme_end - entry->vme_start)); } else { total_entries += 1; } entry = entry->vme_next; } if (not_in_kdp) vm_map_unlock(map); return(total_entries); }
/* * mlockall(int how) * * No requirements */ int sys_mlockall(struct mlockall_args *uap) { struct thread *td = curthread; struct proc *p = td->td_proc; vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry; int how = uap->how; int rc = KERN_SUCCESS; if (((how & MCL_CURRENT) == 0) && ((how & MCL_FUTURE) == 0)) return (EINVAL); rc = priv_check_cred(td->td_ucred, PRIV_ROOT, 0); if (rc) return (rc); vm_map_lock(map); do { if (how & MCL_CURRENT) { for(entry = map->header.next; entry != &map->header; entry = entry->next); rc = ENOSYS; break; } if (how & MCL_FUTURE) map->flags |= MAP_WIREFUTURE; } while(0); vm_map_unlock(map); return (rc); }
kern_return_t projected_buffer_deallocate( vm_map_t map, vm_offset_t start, vm_offset_t end) { vm_map_entry_t entry, k_entry; if (map == VM_MAP_NULL || map == kernel_map) return KERN_INVALID_ARGUMENT; vm_map_lock(map); if (!vm_map_lookup_entry(map, start, &entry) || end > entry->vme_end || /*Check corresponding kernel entry*/ (k_entry = entry->projected_on) == 0) { vm_map_unlock(map); return(KERN_INVALID_ARGUMENT); } /*Prepare for deallocation*/ if (entry->vme_start < start) _vm_map_clip_start(&map->hdr, entry, start); if (entry->vme_end > end) _vm_map_clip_end(&map->hdr, entry, end); if (map->first_free == entry) /*Adjust first_free hint*/ map->first_free = entry->vme_prev; entry->projected_on = 0; /*Needed to allow deletion*/ entry->wired_count = 0; /*Avoid unwire fault*/ vm_map_entry_delete(map, entry); vm_map_unlock(map); /*Check if the buffer is not persistent and only the kernel mapping remains, and if so delete it*/ vm_map_lock(kernel_map); if (k_entry->projected_on == (vm_map_entry_t) -1 && k_entry->object.vm_object->ref_count == 1) { if (kernel_map->first_free == k_entry) kernel_map->first_free = k_entry->vme_prev; k_entry->projected_on = 0; /*Allow unwire fault*/ vm_map_entry_delete(kernel_map, k_entry); } vm_map_unlock(kernel_map); return(KERN_SUCCESS); }
int sys_munmap(struct lwp *l, const struct sys_munmap_args *uap, register_t *retval) { /* { syscallarg(void *) addr; syscallarg(size_t) len; } */ struct proc *p = l->l_proc; vaddr_t addr; vsize_t size, pageoff; struct vm_map *map; struct vm_map_entry *dead_entries; int error; /* * get syscall args. */ addr = (vaddr_t)SCARG(uap, addr); size = (vsize_t)SCARG(uap, len); /* * align the address to a page boundary and adjust the size accordingly. */ pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vsize_t)round_page(size); if (size == 0) return (0); error = range_test(addr, size, false); if (error) return error; map = &p->p_vmspace->vm_map; /* * interesting system call semantic: make sure entire range is * allocated before allowing an unmap. */ vm_map_lock(map); #if 0 if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) { vm_map_unlock(map); return (EINVAL); } #endif uvm_unmap_remove(map, addr, addr + size, &dead_entries, NULL, 0); vm_map_unlock(map); if (dead_entries != NULL) uvm_unmap_detach(dead_entries, 0); return (0); }
/* * munlockall(void) * * Unwire all user-wired map entries, cancel MCL_FUTURE. * * No requirements */ int sys_munlockall(struct munlockall_args *uap) { struct thread *td = curthread; struct proc *p = td->td_proc; vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry; int rc = KERN_SUCCESS; vm_map_lock(map); /* Clear MAP_WIREFUTURE to cancel mlockall(MCL_FUTURE) */ map->flags &= ~MAP_WIREFUTURE; retry: for (entry = map->header.next; entry != &map->header; entry = entry->next) { if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) continue; /* * If we encounter an in-transition entry, we release the * map lock and retry the scan; we do not decrement any * wired_count more than once because we do not touch * any entries with MAP_ENTRY_USER_WIRED not set. * * There is a potential interleaving with concurrent * mlockall()s here -- if we abort a scan, an mlockall() * could start, wire a number of entries before our * current position in the scan, and then stall itself on this * or any other in-transition entry. If that occurs, when * we resume, we will unwire those entries. */ if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; ++mycpu->gd_cnt.v_intrans_coll; ++mycpu->gd_cnt.v_intrans_wait; vm_map_transition_wait(map); goto retry; } KASSERT(entry->wired_count > 0, ("wired_count was 0 with USER_WIRED set! %p", entry)); /* Drop the wired count; if it hits zero, unwire the entry */ entry->eflags &= ~MAP_ENTRY_USER_WIRED; entry->wired_count--; if (entry->wired_count == 0) vm_fault_unwire(map, entry); } map->timestamp++; vm_map_unlock(map); return (rc); }
/* * kmap_free_wakeup: * * Returns memory to a submap of the kernel, and wakes up any processes * waiting for memory in that map. */ void kmap_free_wakeup(vm_map_t map, vm_offset_t addr, vm_size_t size) { vm_map_lock(map); (void) vm_map_delete(map, trunc_page(addr), round_page(addr + size)); if (map->needs_wakeup) { map->needs_wakeup = FALSE; vm_map_wakeup(map); } vm_map_unlock(map); }
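/*
 * A minimal usage sketch (not from the original sources) for the
 * kmap_alloc_wait()/kmap_free_wakeup() pair above.  The submap pointer
 * `exec_args_map`, the helper name and the buffer size are assumptions
 * for illustration; the point is that every successful kmap_alloc_wait()
 * is eventually matched by a kmap_free_wakeup() on the same submap, so
 * that threads sleeping for submap space get woken.
 */
static int
submap_buffer_example(vm_map_t exec_args_map, vm_size_t bufsize)
{
	vm_offset_t buf;

	/*
	 * May sleep until space is freed; returns 0 only if the request
	 * can never fit in the submap or the swap reservation fails.
	 */
	buf = kmap_alloc_wait(exec_args_map, bufsize);
	if (buf == 0)
		return (ENOMEM);

	/* ... fill and consume the pageable buffer ... */

	/*
	 * Give the space back and wake any thread sleeping in
	 * kmap_alloc_wait() on this submap.
	 */
	kmap_free_wakeup(exec_args_map, buf, bufsize);
	return (0);
}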
void uvm_km_free_wakeup(struct vm_map *map, vaddr_t addr, vsize_t size) { struct vm_map_entry *dead_entries; vm_map_lock(map); uvm_unmap_remove(map, trunc_page(addr), round_page(addr+size), &dead_entries, NULL); wakeup(map); vm_map_unlock(map); if (dead_entries != NULL) uvm_unmap_detach(dead_entries, 0); }
/* * kmem_init: * * Create the kernel map; insert a mapping covering kernel text, * data, bss, and all space allocated thus far (`bootstrap' data). The * new map will thus map the range between VM_MIN_KERNEL_ADDRESS and * `start' as allocated, and the range between `start' and `end' as free. */ void kmem_init(vm_offset_t start, vm_offset_t end) { vm_map_t m; m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end); m->system_map = 1; vm_map_lock(m); /* N.B.: cannot use kgdb to debug, starting with this assignment ... */ kernel_map = m; (void) vm_map_insert(m, NULL, (vm_ooffset_t) 0, #ifdef __amd64__ KERNBASE, #else VM_MIN_KERNEL_ADDRESS, #endif start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); /* ... and ending with the completion of the above `insert' */ vm_map_unlock(m); }
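/*
 * For orientation, a sketch of the kernel_map layout that kmem_init()
 * above leaves behind (symbolic, not literal addresses):
 *
 *   VM_MIN_KERNEL_ADDRESS (KERNBASE on amd64) ... start
 *       mapped by the bootstrap vm_map_insert(), i.e. treated as allocated
 *   start ... end
 *       free, available to later kernel allocations and submaps
 */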
kern_return_t kmem_alloc_aligned( vm_map_t map, vm_offset_t *addrp, vm_size_t size) { vm_map_entry_t entry; vm_offset_t offset; vm_offset_t addr; unsigned int attempts; kern_return_t kr; if ((size & (size - 1)) != 0) panic("kmem_alloc_aligned"); /* * Use the kernel object for wired-down kernel pages. * Assume that no region of the kernel object is * referenced more than once. We want vm_map_find_entry * to extend an existing entry if possible. */ size = round_page(size); attempts = 0; retry: vm_map_lock(map); kr = vm_map_find_entry(map, &addr, size, size - 1, kernel_object, &entry); if (kr != KERN_SUCCESS) { vm_map_unlock(map); if (attempts == 0) { attempts++; slab_collect(); goto retry; } printf_once("no more room for kmem_alloc_aligned in %p\n", map); return kr; } /* * Since we didn't know where the new region would * start, we couldn't supply the correct offset into * the kernel object. We only initialize the entry * if we aren't extending an existing entry. */ offset = addr - VM_MIN_KERNEL_ADDRESS; if (entry->object.vm_object == VM_OBJECT_NULL) { vm_object_reference(kernel_object); entry->object.vm_object = kernel_object; entry->offset = offset; } /* * Since we have not given out this address yet, * it is safe to unlock the map. */ vm_map_unlock(map); /* * Allocate wired-down memory in the kernel_object, * for this entry, and enter it in the kernel pmap. */ kmem_alloc_pages(kernel_object, offset, addr, addr + size, VM_PROT_DEFAULT); /* * Return the memory, not zeroed. */ *addrp = addr; return KERN_SUCCESS; }
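/*
 * A minimal sketch of a kmem_alloc_aligned() caller, under the
 * assumption that the request size is a power of two (the routine
 * above panics otherwise).  Because `size - 1` is passed to
 * vm_map_find_entry() as the alignment mask, the returned address is
 * naturally aligned to the size; KERNEL_STACK_SIZE is used here only
 * as a stand-in for such a power-of-two size.
 */
static vm_offset_t
aligned_alloc_example(void)
{
	vm_offset_t stack;

	if (kmem_alloc_aligned(kernel_map, &stack, KERNEL_STACK_SIZE)
	    != KERN_SUCCESS)
		return 0;

	/* The result is aligned to its (power-of-two) size. */
	assert((stack & (KERNEL_STACK_SIZE - 1)) == 0);
	return stack;
}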
int sys_mquery(struct proc *p, void *v, register_t *retval) { struct sys_mquery_args /* { syscallarg(void *) addr; syscallarg(size_t) len; syscallarg(int) prot; syscallarg(int) flags; syscallarg(int) fd; syscallarg(long) pad; syscallarg(off_t) pos; } */ *uap = v; struct file *fp; struct uvm_object *uobj; voff_t uoff; int error; vaddr_t vaddr; int flags = 0; vsize_t size; vm_prot_t prot; int fd; vaddr = (vaddr_t) SCARG(uap, addr); prot = SCARG(uap, prot); size = (vsize_t) SCARG(uap, len); fd = SCARG(uap, fd); if ((prot & VM_PROT_ALL) != prot) return (EINVAL); if (SCARG(uap, flags) & MAP_FIXED) flags |= UVM_FLAG_FIXED; if (fd >= 0) { if ((error = getvnode(p->p_fd, fd, &fp)) != 0) return (error); uobj = &((struct vnode *)fp->f_data)->v_uvm.u_obj; uoff = SCARG(uap, pos); } else { fp = NULL; uobj = NULL; uoff = 0; } if (vaddr == 0) vaddr = uvm_map_hint(p, prot); /* prevent a user requested address from falling in heap space */ if ((vaddr + size > (vaddr_t)p->p_vmspace->vm_daddr) && (vaddr < (vaddr_t)p->p_vmspace->vm_daddr + MAXDSIZ)) { if (flags & UVM_FLAG_FIXED) { error = EINVAL; goto done; } vaddr = round_page((vaddr_t)p->p_vmspace->vm_daddr + MAXDSIZ); } vm_map_lock(&p->p_vmspace->vm_map); again: if (uvm_map_findspace(&p->p_vmspace->vm_map, vaddr, size, &vaddr, uobj, uoff, 0, flags) == NULL) { if (flags & UVM_FLAG_FIXED) error = EINVAL; else error = ENOMEM; } else { /* prevent a returned address from falling in heap space */ if ((vaddr + size > (vaddr_t)p->p_vmspace->vm_daddr) && (vaddr < (vaddr_t)p->p_vmspace->vm_daddr + MAXDSIZ)) { vaddr = round_page((vaddr_t)p->p_vmspace->vm_daddr + MAXDSIZ); goto again; } error = 0; *retval = (register_t)(vaddr); } vm_map_unlock(&p->p_vmspace->vm_map); done: if (fp != NULL) FRELE(fp); return (error); }
int darwin_sys_load_shared_file(struct lwp *l, const struct darwin_sys_load_shared_file_args *uap, register_t *retval) { /* { syscallarg(char *) filename; syscallarg(void *) addr; syscallarg(u_long) len; syscallarg(void **) base; syscallarg(int) count: syscallarg(mach_sf_mapping_t *) mappings; syscallarg(int *) flags; } */ struct file *fp; struct vnode *vp = NULL; vaddr_t base; struct proc *p = l->l_proc; int flags; char *filename; mach_sf_mapping_t *mapp = NULL; size_t maplen; struct sys_open_args open_cup; struct sys_close_args close_cup; register_t fdc; int fd; int i; int error; vaddr_t max_addr, addr; size_t len; vaddr_t uaddr; int need_relocation; struct exec_vmcmd evc; filename = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyin(SCARG(uap, filename), filename, MAXPATHLEN)) != 0) goto bad1; if ((error = copyin(SCARG(uap, base), &base, sizeof(base))) != 0) goto bad1; if ((error = copyin(SCARG(uap, flags), &flags, sizeof(base))) != 0) goto bad1; #ifdef DEBUG_DARWIN DPRINTF(("darwin_sys_load_shared_file: filename = %p ", SCARG(uap, filename))); DPRINTF(("addr = %p len = 0x%08lx base = %p ", SCARG(uap, addr), SCARG(uap, len), SCARG(uap, base))); DPRINTF(("count = %d mappings = %p flags = %p ", SCARG(uap, count), SCARG(uap, mappings), SCARG(uap, flags))); DPRINTF(("*base = 0x%08lx *flags = %d filename=`%s'\n", base, flags, filename)); #endif SCARG(&open_cup, path) = SCARG(uap, filename); SCARG(&open_cup, flags) = O_RDONLY; SCARG(&open_cup, mode) = 0; if ((error = sys_open(l, &open_cup, &fdc)) != 0) goto bad1; fd = (int)fdc; fp = fd_getfile(fd); if (fp == NULL) { error = EBADF; goto bad1point5; } vp = fp->f_data; vref(vp); if (SCARG(uap, count) < 0 || SCARG(uap, count) > PAGE_SIZE / sizeof(*mapp)) { error = EINVAL; goto bad2; } maplen = SCARG(uap, count) * sizeof(*mapp); mapp = malloc(maplen, M_TEMP, M_WAITOK); if ((error = copyin(SCARG(uap, mappings), mapp, maplen)) != 0) goto bad2; #ifdef DEBUG_DARWIN for (i = 0; i < SCARG(uap, count); i++) { DPRINTF(("mapp[%d].mapping_offset = 0x%08lx\n", i, mapp[i].mapping_offset)); DPRINTF(("mapp[%d].size = 0x%08lx\n", i, (long)mapp[i].size)); DPRINTF(("mapp[%d].file_offset = 0x%08lx\n", i, mapp[i].file_offset)); DPRINTF(("mapp[%d].protection = %d\n", i, mapp[i].protection)); DPRINTF(("mapp[%d].cksum = %ld\n", i, mapp[i].cksum)); } #endif /* Check if we can load at the default addresses */ need_relocation = 0; vm_map_lock(&p->p_vmspace->vm_map); for (i = 0; i < SCARG(uap, count); i++) if ((uvm_map_findspace(&p->p_vmspace->vm_map, base + mapp[i].mapping_offset, mapp[i].size, &uaddr, NULL, 0, 0, UVM_FLAG_FIXED)) == NULL) need_relocation = 1; vm_map_unlock(&p->p_vmspace->vm_map); /* If we cannot, we need a relocation */ if (need_relocation) { DPRINTF(("Relocating\n")); /* Compute the length of the region enclosing all sections */ max_addr = 0; for (i = 0; i < SCARG(uap, count); i++) { addr = (vaddr_t)(mapp[i].mapping_offset + base + mapp[i].size); if (addr > max_addr) max_addr = addr; } len = max_addr - base; DPRINTF(("base = 0x%08lx max_addr = 0x%08lx len = 0x%08x\n", base, max_addr, len)); /* Find some place to map this region */ vm_map_lock(&p->p_vmspace->vm_map); if ((uvm_map_findspace(&p->p_vmspace->vm_map, base, len, &uaddr, NULL, 0, PAGE_SIZE, 0)) == NULL) { DPRINTF(("Impossible to find some space\n")); vm_map_unlock(&p->p_vmspace->vm_map); error = ENOMEM; goto bad2; } vm_map_unlock(&p->p_vmspace->vm_map); /* Update the base address */ base = uaddr; DPRINTF(("New base address: base = 0x%08lx\n", base)); } /* Do the actual mapping */ for (i 
= 0; i < SCARG(uap, count); i++) { bzero(&evc, sizeof(evc)); evc.ev_addr = base + mapp[i].mapping_offset; evc.ev_len = mapp[i].size; evc.ev_prot = mapp[i].protection & VM_PROT_ALL; evc.ev_flags = 0; if (mapp[i].protection & MACH_VM_PROT_ZF) evc.ev_proc = vmcmd_map_zero; else evc.ev_proc = vmcmd_map_readvn; evc.ev_offset = mapp[i].file_offset; evc.ev_vp = vp; DPRINTF(("map section %d: start = 0x%08lx, len = 0x%08lx\n", i, evc.ev_addr, evc.ev_len)); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if ((error = (*evc.ev_proc)(l, &evc)) != 0) { VOP_UNLOCK(vp, 0); DPRINTF(("Failed\n")); goto bad2; } VOP_UNLOCK(vp, 0); DPRINTF(("Success\n")); } bad2: if (mapp) free(mapp, M_TEMP); vrele(vp); fd_putfile(fd); bad1point5: SCARG(&close_cup, fd) = fd; if ((error = sys_close(l, &close_cup, retval)) != 0) goto bad1; if ((error = copyout(&base, SCARG(uap, base), sizeof(base))) != 0) goto bad1; if ((error = copyout(&flags, SCARG(uap, flags), sizeof(base))) != 0) goto bad1; bad1: free(filename, M_TEMP); return error; }
int uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, int advice, struct uvm_object *uobj, voff_t foff, vsize_t locklimit) { vaddr_t align = 0; int error; uvm_flag_t uvmflag = 0; /* * check params */ if (size == 0) return 0; if (foff & PAGE_MASK) return EINVAL; if ((prot & maxprot) != prot) return EINVAL; /* * for non-fixed mappings, round off the suggested address. * for fixed mappings, check alignment and zap old mappings. */ if ((flags & MAP_FIXED) == 0) { *addr = round_page(*addr); } else { if (*addr & PAGE_MASK) return EINVAL; uvmflag |= UVM_FLAG_FIXED; (void) uvm_unmap(map, *addr, *addr + size); } /* * Try to see if any requested alignment can even be attempted. * Make sure we can express the alignment (asking for a >= 4GB * alignment on an ILP32 architecture makes no sense) and the * alignment is at least a page-sized quantity. If the * request was for a fixed mapping, make sure the supplied address * adheres to the requested alignment. */ align = (flags & MAP_ALIGNMENT_MASK) >> MAP_ALIGNMENT_SHIFT; if (align) { if (align >= sizeof(vaddr_t) * NBBY) return EINVAL; align = 1L << align; if (align < PAGE_SIZE) return EINVAL; if (align >= vm_map_max(map)) return ENOMEM; if (flags & MAP_FIXED) { if ((*addr & (align-1)) != 0) return EINVAL; align = 0; } } /* * check resource limits */ if (!VM_MAP_IS_KERNEL(map) && (((rlim_t)curproc->p_vmspace->vm_map.size + (rlim_t)size) > curproc->p_rlimit[RLIMIT_AS].rlim_cur)) return ENOMEM; /* * handle anon vs. non-anon mappings. for non-anon mappings attach * to underlying vm object. */ if (flags & MAP_ANON) { KASSERT(uobj == NULL); foff = UVM_UNKNOWN_OFFSET; if ((flags & MAP_SHARED) == 0) /* XXX: defer amap create */ uvmflag |= UVM_FLAG_COPYONW; else /* shared: create amap now */ uvmflag |= UVM_FLAG_OVERLAY; } else { KASSERT(uobj != NULL); if ((flags & MAP_SHARED) == 0) { uvmflag |= UVM_FLAG_COPYONW; } } uvmflag = UVM_MAPFLAG(prot, maxprot, (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY, advice, uvmflag); error = uvm_map(map, addr, size, uobj, foff, align, uvmflag); if (error) { if (uobj) uobj->pgops->pgo_detach(uobj); return error; } /* * POSIX 1003.1b -- if our address space was configured * to lock all future mappings, wire the one we just made. * * Also handle the MAP_WIRED flag here. */ if (prot == VM_PROT_NONE) { /* * No more work to do in this case. */ return 0; } if ((flags & MAP_WIRED) != 0 || (map->flags & VM_MAP_WIREFUTURE) != 0) { vm_map_lock(map); if (atop(size) + uvmexp.wired > uvmexp.wiredmax || (locklimit != 0 && size + ptoa(pmap_wired_count(vm_map_pmap(map))) > locklimit)) { vm_map_unlock(map); uvm_unmap(map, *addr, *addr + size); return ENOMEM; } /* * uvm_map_pageable() always returns the map unlocked. */ error = uvm_map_pageable(map, *addr, *addr + size, false, UVM_LK_ENTER); if (error) { uvm_unmap(map, *addr, *addr + size); return error; } return 0; } return 0; }
int sys_munmap(struct proc *p, void *v, register_t *retval) { struct sys_munmap_args /* { syscallarg(void *) addr; syscallarg(size_t) len; } */ *uap = v; vaddr_t addr; vsize_t size, pageoff; vm_map_t map; vaddr_t vm_min_address = VM_MIN_ADDRESS; struct vm_map_entry *dead_entries; /* * get syscall args... */ addr = (vaddr_t) SCARG(uap, addr); size = (vsize_t) SCARG(uap, len); /* * align the address to a page boundary, and adjust the size accordingly */ ALIGN_ADDR(addr, size, pageoff); /* * Check for illegal addresses. Watch out for address wrap... * Note that VM_*_ADDRESS are not constants due to casts (argh). */ if (addr > SIZE_MAX - size) return (EINVAL); if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) return (EINVAL); if (vm_min_address > 0 && addr < vm_min_address) return (EINVAL); map = &p->p_vmspace->vm_map; vm_map_lock(map); /* lock map so we can checkprot */ /* * interesting system call semantic: make sure entire range is * allocated before allowing an unmap. */ if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) { vm_map_unlock(map); return (EINVAL); } /* * doit! */ uvm_unmap_remove(map, addr, addr + size, &dead_entries, p); vm_map_unlock(map); /* and unlock */ if (dead_entries != NULL) uvm_unmap_detach(dead_entries, 0); return (0); }
int uvm_mmap(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, caddr_t handle, voff_t foff, vsize_t locklimit, struct proc *p) { struct uvm_object *uobj; struct vnode *vp; int error; int advice = UVM_ADV_NORMAL; uvm_flag_t uvmflag = 0; vsize_t align = 0; /* userland page size */ /* * check params */ if (size == 0) return(0); if (foff & PAGE_MASK) return(EINVAL); if ((prot & maxprot) != prot) return(EINVAL); /* * for non-fixed mappings, round off the suggested address. * for fixed mappings, check alignment and zap old mappings. */ if ((flags & MAP_FIXED) == 0) { *addr = round_page(*addr); /* round */ } else { if (*addr & PAGE_MASK) return(EINVAL); uvmflag |= UVM_FLAG_FIXED; uvm_unmap_p(map, *addr, *addr + size, p); /* zap! */ } /* * handle anon vs. non-anon mappings. for non-anon mappings attach * to underlying vm object. */ if (flags & MAP_ANON) { if ((flags & MAP_FIXED) == 0 && size >= __LDPGSZ) align = __LDPGSZ; foff = UVM_UNKNOWN_OFFSET; uobj = NULL; if ((flags & MAP_SHARED) == 0) /* XXX: defer amap create */ uvmflag |= UVM_FLAG_COPYONW; else /* shared: create amap now */ uvmflag |= UVM_FLAG_OVERLAY; } else { vp = (struct vnode *) handle; /* get vnode */ if (vp->v_type != VCHR) { uobj = uvn_attach((void *) vp, (flags & MAP_SHARED) ? maxprot : (maxprot & ~VM_PROT_WRITE)); if (uobj) { assert((void*)uobj == vp); if (flags & MAP_DENYWRITE) uvmflag |= UVM_FLAG_DENYWRITE; if ((flags & MAP_SHARED) && (maxprot & VM_PROT_WRITE)) uvmflag |= UVM_FLAG_WRITECOUNT; } #ifndef UBC /* * XXXCDC: hack from old code * don't allow vnodes which have been mapped * shared-writeable to persist [forces them to be * flushed out when last reference goes]. * XXXCDC: interesting side effect: avoids a bug. * note that in WRITE [ufs_readwrite.c] that we * allocate buffer, uncache, and then do the write. * the problem with this is that if the uncache causes * VM data to be flushed to the same area of the file * we are writing to... in that case we've got the * buffer locked and our process goes to sleep forever. * * XXXCDC: checking maxprot protects us from the * "persistbug" program but this is not a long term * solution. * * XXXCDC: we don't bother calling uncache with the vp * VOP_LOCKed since we know that we are already * holding a valid reference to the uvn (from the * uvn_attach above), and thus it is impossible for * the uncache to kill the uvn and trigger I/O. */ if (flags & MAP_SHARED) { if ((prot & VM_PROT_WRITE) || (maxprot & VM_PROT_WRITE)) { uvm_vnp_uncache(vp); } } #else /* XXX for now, attach doesn't gain a ref */ VREF(vp); #endif } else { uobj = udv_attach((void *) &vp->v_rdev, (flags & MAP_SHARED) ? maxprot : (maxprot & ~VM_PROT_WRITE), foff, size); /* * XXX Some devices don't like to be mapped with * XXX PROT_EXEC, but we don't really have a * XXX better way of handling this, right now */ if (uobj == NULL && (prot & PROT_EXEC) == 0) { maxprot &= ~VM_PROT_EXECUTE; uobj = udv_attach((void *) &vp->v_rdev, (flags & MAP_SHARED) ? maxprot : (maxprot & ~VM_PROT_WRITE), foff, size); } advice = UVM_ADV_RANDOM; } if (uobj == NULL) return((vp->v_type == VREG) ? ENOMEM : EINVAL); if ((flags & MAP_SHARED) == 0) uvmflag |= UVM_FLAG_COPYONW; } /* * set up mapping flags */ uvmflag = UVM_MAPFLAG(prot, maxprot, (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY, advice, uvmflag); error = uvm_map_p(map, addr, size, uobj, foff, align, uvmflag, p); if (error == 0) { /* * POSIX 1003.1b -- if our address space was configured * to lock all future mappings, wire the one we just made. 
*/ if (prot == VM_PROT_NONE) { /* * No more work to do in this case. */ return (0); } vm_map_lock(map); if (map->flags & VM_MAP_WIREFUTURE) { if ((atop(size) + uvmexp.wired) > uvmexp.wiredmax #ifdef pmap_wired_count || (locklimit != 0 && (size + ptoa(pmap_wired_count(vm_map_pmap(map)))) > locklimit) #endif ) { error = ENOMEM; vm_map_unlock(map); /* unmap the region! */ uvm_unmap(map, *addr, *addr + size); goto bad; } /* * uvm_map_pageable() always returns the map * unlocked. */ error = uvm_map_pageable(map, *addr, *addr + size, FALSE, UVM_LK_ENTER); if (error != 0) { /* unmap the region! */ uvm_unmap(map, *addr, *addr + size); goto bad; } return (0); } vm_map_unlock(map); return (0); } /* * errors: first detach from the uobj, if any. */ if (uobj) uobj->pgops->pgo_detach(uobj); bad: return (error); }
int uvm_io(vm_map_t map, struct uio *uio, int flags) { vaddr_t baseva, endva, pageoffset, kva; vsize_t chunksz, togo, sz; struct uvm_map_deadq dead_entries; int error, extractflags; /* * step 0: sanity checks and set up for copy loop. start with a * large chunk size. if we have trouble finding vm space we will * reduce it. */ if (uio->uio_resid == 0) return(0); togo = uio->uio_resid; baseva = (vaddr_t) uio->uio_offset; endva = baseva + (togo - 1); if (endva < baseva) /* wrap around? */ return(EIO); if (baseva >= VM_MAXUSER_ADDRESS) return(0); if (endva >= VM_MAXUSER_ADDRESS) /* EOF truncate */ togo = togo - (endva - VM_MAXUSER_ADDRESS + 1); pageoffset = baseva & PAGE_MASK; baseva = trunc_page(baseva); chunksz = min(round_page(togo + pageoffset), MAXBSIZE); error = 0; extractflags = 0; if (flags & UVM_IO_FIXPROT) extractflags |= UVM_EXTRACT_FIXPROT; /* step 1: main loop... while we've got data to move */ for (/*null*/; togo > 0 ; pageoffset = 0) { /* step 2: extract mappings from the map into kernel_map */ error = uvm_map_extract(map, baseva, chunksz, &kva, extractflags); if (error) { /* retry with a smaller chunk... */ if (error == ENOMEM && chunksz > PAGE_SIZE) { chunksz = trunc_page(chunksz / 2); if (chunksz < PAGE_SIZE) chunksz = PAGE_SIZE; continue; } break; } /* step 3: move a chunk of data */ sz = chunksz - pageoffset; if (sz > togo) sz = togo; error = uiomove((caddr_t) (kva + pageoffset), sz, uio); togo -= sz; baseva += chunksz; /* step 4: unmap the area of kernel memory */ vm_map_lock(kernel_map); TAILQ_INIT(&dead_entries); uvm_unmap_remove(kernel_map, kva, kva+chunksz, &dead_entries, FALSE, TRUE); vm_map_unlock(kernel_map); uvm_unmap_detach(&dead_entries, AMAP_REFALL); /* * We defer checking the error return from uiomove until * here so that we won't leak memory. */ if (error) break; } return (error); }
/* * kmem_realloc: * * Reallocate wired-down memory in the kernel's address map * or a submap. Newly allocated pages are not zeroed. * This can only be used on regions allocated with kmem_alloc. * * If successful, the pages in the old region are mapped twice. * The old region is unchanged. Use kmem_free to get rid of it. */ kern_return_t kmem_realloc( vm_map_t map, vm_offset_t oldaddr, vm_size_t oldsize, vm_offset_t *newaddrp, vm_size_t newsize) { vm_offset_t oldmin, oldmax; vm_offset_t newaddr; vm_object_t object; vm_map_entry_t oldentry, newentry; unsigned int attempts; kern_return_t kr; oldmin = trunc_page(oldaddr); oldmax = round_page(oldaddr + oldsize); oldsize = oldmax - oldmin; newsize = round_page(newsize); /* * Find space for the new region. */ attempts = 0; retry: vm_map_lock(map); kr = vm_map_find_entry(map, &newaddr, newsize, (vm_offset_t) 0, VM_OBJECT_NULL, &newentry); if (kr != KERN_SUCCESS) { vm_map_unlock(map); if (attempts == 0) { attempts++; slab_collect(); goto retry; } printf_once("no more room for kmem_realloc in %p\n", map); return kr; } /* * Find the VM object backing the old region. */ if (!vm_map_lookup_entry(map, oldmin, &oldentry)) panic("kmem_realloc"); object = oldentry->object.vm_object; /* * Increase the size of the object and * fill in the new region. */ vm_object_reference(object); vm_object_lock(object); if (object->size != oldsize) panic("kmem_realloc"); object->size = newsize; vm_object_unlock(object); newentry->object.vm_object = object; newentry->offset = 0; /* * Since we have not given out this address yet, * it is safe to unlock the map. We are trusting * that nobody will play with either region. */ vm_map_unlock(map); /* * Remap the pages in the old region and * allocate more pages for the new region. */ kmem_remap_pages(object, 0, newaddr, newaddr + oldsize, VM_PROT_DEFAULT); kmem_alloc_pages(object, oldsize, newaddr + oldsize, newaddr + newsize, VM_PROT_DEFAULT); *newaddrp = newaddr; return KERN_SUCCESS; }
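/*
 * A minimal sketch (assumed names, not from the original sources) of
 * the calling convention the comment above describes: after a
 * successful kmem_realloc() the old pages are mapped at both the old
 * and the new address, so the caller switches over and then releases
 * the old range with kmem_free().
 */
static kern_return_t
grow_table_example(vm_offset_t *tablep, vm_size_t oldsize, vm_size_t newsize)
{
	vm_offset_t newtable;
	kern_return_t kr;

	kr = kmem_realloc(kernel_map, *tablep, oldsize, &newtable, newsize);
	if (kr != KERN_SUCCESS)
		return kr;

	/*
	 * The old region is still intact and still mapped; drop it once
	 * nothing references the old address.
	 */
	kmem_free(kernel_map, *tablep, oldsize);
	*tablep = newtable;
	return KERN_SUCCESS;
}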
kern_return_t kmem_alloc( vm_map_t map, vm_offset_t *addrp, vm_size_t size) { vm_object_t object; vm_map_entry_t entry; vm_offset_t addr; unsigned int attempts; kern_return_t kr; /* * Allocate a new object. We must do this before locking * the map, lest we risk deadlock with the default pager: * device_read_alloc uses kmem_alloc, * which tries to allocate an object, * which uses kmem_alloc_wired to get memory, * which blocks for pages. * then the default pager needs to read a block * to process a memory_object_data_write, * and device_read_alloc calls kmem_alloc * and deadlocks on the map lock. */ size = round_page(size); object = vm_object_allocate(size); attempts = 0; retry: vm_map_lock(map); kr = vm_map_find_entry(map, &addr, size, (vm_offset_t) 0, VM_OBJECT_NULL, &entry); if (kr != KERN_SUCCESS) { vm_map_unlock(map); if (attempts == 0) { attempts++; slab_collect(); goto retry; } printf_once("no more room for kmem_alloc in %p\n", map); vm_object_deallocate(object); return kr; } entry->object.vm_object = object; entry->offset = 0; /* * Since we have not given out this address yet, * it is safe to unlock the map. */ vm_map_unlock(map); /* * Allocate wired-down memory in the kernel_object, * for this entry, and enter it in the kernel pmap. */ kmem_alloc_pages(object, 0, addr, addr + size, VM_PROT_DEFAULT); /* * Return the memory, not zeroed. */ *addrp = addr; return KERN_SUCCESS; }
kern_return_t projected_buffer_map( vm_map_t map, vm_offset_t kernel_addr, vm_size_t size, vm_offset_t *user_p, vm_prot_t protection, vm_inherit_t inheritance) /*Currently only VM_INHERIT_NONE supported*/ { vm_map_entry_t u_entry, k_entry; vm_offset_t physical_addr, user_addr; vm_size_t r_size; kern_return_t kr; /* * Find entry in kernel map */ size = round_page(size); if (map == VM_MAP_NULL || map == kernel_map || !vm_map_lookup_entry(kernel_map, kernel_addr, &k_entry) || kernel_addr + size > k_entry->vme_end) return(KERN_INVALID_ARGUMENT); /* * Create entry in user task */ vm_map_lock(map); kr = vm_map_find_entry(map, &user_addr, size, (vm_offset_t) 0, VM_OBJECT_NULL, &u_entry); if (kr != KERN_SUCCESS) { vm_map_unlock(map); return kr; } u_entry->object.vm_object = k_entry->object.vm_object; vm_object_reference(k_entry->object.vm_object); u_entry->offset = kernel_addr - k_entry->vme_start + k_entry->offset; u_entry->projected_on = k_entry; /*Creates coupling with kernel mapping of the buffer, and also guarantees that user cannot directly manipulate buffer VM entry*/ u_entry->protection = protection; u_entry->max_protection = protection; u_entry->inheritance = inheritance; u_entry->wired_count = k_entry->wired_count; vm_map_unlock(map); *user_p = user_addr; /* Set up physical mappings for user pmap */ pmap_pageable(map->pmap, user_addr, user_addr + size, !k_entry->wired_count); for (r_size = 0; r_size < size; r_size += PAGE_SIZE) { physical_addr = pmap_extract(kernel_pmap, kernel_addr + r_size); pmap_enter(map->pmap, user_addr + r_size, physical_addr, protection, k_entry->wired_count); } return(KERN_SUCCESS); }
/* * vm_contig_pg_kmap: * * Map previously allocated (vm_contig_pg_alloc) range of pages from * vm_page_array[] into the KVA. Once mapped, the pages are part of * the Kernel, and are to be freed with kmem_free(&kernel_map, addr, size). * * No requirements. */ vm_offset_t vm_contig_pg_kmap(int start, u_long size, vm_map_t map, int flags) { vm_offset_t addr, tmp_addr; vm_page_t pga = vm_page_array; int i, count; size = round_page(size); if (size == 0) panic("vm_contig_pg_kmap: size must not be 0"); crit_enter(); lwkt_gettoken(&vm_token); /* * We've found a contiguous chunk that meets our requirements. * Allocate KVM, and assign phys pages and return a kernel VM * pointer. */ count = vm_map_entry_reserve(MAP_RESERVE_COUNT); vm_map_lock(map); if (vm_map_findspace(map, vm_map_min(map), size, PAGE_SIZE, 0, &addr) != KERN_SUCCESS) { /* * XXX We almost never run out of kernel virtual * space, so we don't make the allocated memory * above available. */ vm_map_unlock(map); vm_map_entry_release(count); lwkt_reltoken(&vm_token); crit_exit(); return (0); } /* * kernel_object maps 1:1 to kernel_map. */ vm_object_hold(&kernel_object); vm_object_reference(&kernel_object); vm_map_insert(map, &count, &kernel_object, addr, addr, addr + size, VM_MAPTYPE_NORMAL, VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); vm_map_entry_release(count); tmp_addr = addr; for (i = start; i < (start + size / PAGE_SIZE); i++) { vm_page_t m = &pga[i]; vm_page_insert(m, &kernel_object, OFF_TO_IDX(tmp_addr)); if ((flags & M_ZERO) && !(m->flags & PG_ZERO)) pmap_zero_page(VM_PAGE_TO_PHYS(m)); m->flags = 0; tmp_addr += PAGE_SIZE; } vm_map_wire(map, addr, addr + size, 0); vm_object_drop(&kernel_object); lwkt_reltoken(&vm_token); crit_exit(); return (addr); }
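/*
 * A minimal sketch of the life cycle the comment above prescribes,
 * assuming `start` is a page index previously handed back by
 * vm_contig_pg_alloc(); the wrapper name, the M_ZERO choice and the
 * immediate free are illustrative only.
 */
static int
contig_kmap_example(int start, u_long bytes)
{
	vm_offset_t va;

	/* Map the already-allocated physical pages into kernel VA. */
	va = vm_contig_pg_kmap(start, bytes, &kernel_map, M_ZERO);
	if (va == 0)
		return (ENOMEM);

	/* ... use the wired, zeroed buffer at va ... */

	/* Release the KVA and pages as the comment above directs. */
	kmem_free(&kernel_map, va, round_page(bytes));
	return (0);
}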
/* * Destroy old address space, and allocate a new stack. * The new stack is only sgrowsiz large because it is grown * automatically on a page fault. */ int exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv) { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_object_t obj; struct rlimit rlim_stack; vm_offset_t sv_minuser, stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* May be called with Giant held */ EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser && cpu_exec_vmspace_reuse(p, map)) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); /* * An exec terminates mlockall(MCL_FUTURE), ASLR state * must be re-evaluated. */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE | MAP_ASLR | MAP_ASLR_IGNSTART); vm_map_unlock(map); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } map->flags |= imgp->map_flags; /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj != NULL) { vm_object_reference(obj); error = vm_map_fixed(map, obj, 0, sv->sv_shared_page_base, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); if (error != KERN_SUCCESS) { vm_object_deallocate(obj); return (vm_mmap_to_errno(error)); } } /* Allocate a new stack */ if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error != KERN_SUCCESS) return (vm_mmap_to_errno(error)); /* * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they * are still used to enforce the stack rlimit on the process stack. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)stack_addr; return (0); }
kern_return_t projected_buffer_allocate( vm_map_t map, vm_size_t size, int persistence, vm_offset_t *kernel_p, vm_offset_t *user_p, vm_prot_t protection, vm_inherit_t inheritance) /*Currently only VM_INHERIT_NONE supported*/ { vm_object_t object; vm_map_entry_t u_entry, k_entry; vm_offset_t addr; vm_size_t r_size; kern_return_t kr; if (map == VM_MAP_NULL || map == kernel_map) return(KERN_INVALID_ARGUMENT); /* * Allocate a new object. */ size = round_page(size); object = vm_object_allocate(size); vm_map_lock(kernel_map); kr = vm_map_find_entry(kernel_map, &addr, size, (vm_offset_t) 0, VM_OBJECT_NULL, &k_entry); if (kr != KERN_SUCCESS) { vm_map_unlock(kernel_map); vm_object_deallocate(object); return kr; } k_entry->object.vm_object = object; if (!persistence) k_entry->projected_on = (vm_map_entry_t) -1; /*Mark entry so as to automatically deallocate it when last corresponding user entry is deallocated*/ vm_map_unlock(kernel_map); *kernel_p = addr; vm_map_lock(map); kr = vm_map_find_entry(map, &addr, size, (vm_offset_t) 0, VM_OBJECT_NULL, &u_entry); if (kr != KERN_SUCCESS) { vm_map_unlock(map); vm_map_lock(kernel_map); vm_map_entry_delete(kernel_map, k_entry); vm_map_unlock(kernel_map); vm_object_deallocate(object); return kr; } u_entry->object.vm_object = object; vm_object_reference(object); u_entry->projected_on = k_entry; /*Creates coupling with kernel mapping of the buffer, and also guarantees that user cannot directly manipulate buffer VM entry*/ u_entry->protection = protection; u_entry->max_protection = protection; u_entry->inheritance = inheritance; vm_map_unlock(map); *user_p = addr; /* * Allocate wired-down memory in the object, * and enter it in the kernel pmap. */ kmem_alloc_pages(object, 0, *kernel_p, *kernel_p + size, VM_PROT_READ | VM_PROT_WRITE); memset((void*) *kernel_p, 0, size); /*Zero fill*/ /* Set up physical mappings for user pmap */ pmap_pageable(map->pmap, *user_p, *user_p + size, FALSE); for (r_size = 0; r_size < size; r_size += PAGE_SIZE) { addr = pmap_extract(kernel_pmap, *kernel_p + r_size); pmap_enter(map->pmap, *user_p + r_size, addr, protection, TRUE); } return(KERN_SUCCESS); }
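/*
 * A minimal sketch pairing projected_buffer_allocate() above with
 * projected_buffer_deallocate() shown earlier: a non-persistent buffer
 * shared between the kernel and a user task, torn down from the user
 * side.  `task_map`, `len` and the wrapper name are assumptions for
 * illustration.
 */
static kern_return_t
projected_buffer_example(vm_map_t task_map, vm_size_t len)
{
	vm_offset_t kaddr, uaddr;
	kern_return_t kr;

	kr = projected_buffer_allocate(task_map, len, 0 /* not persistent */,
				       &kaddr, &uaddr,
				       VM_PROT_READ | VM_PROT_WRITE,
				       VM_INHERIT_NONE);
	if (kr != KERN_SUCCESS)
		return kr;

	/* ... kernel writes at kaddr, the user task reads at uaddr ... */

	/*
	 * Deallocating the user mapping of a non-persistent buffer also
	 * removes the kernel-side entry once it holds the last reference
	 * (see projected_buffer_deallocate() earlier in this section).
	 */
	return projected_buffer_deallocate(task_map, uaddr,
					   uaddr + round_page(len));
}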
int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { vm_prot_t prot; long ahead, behind; int alloc_req, era, faultcount, nera, reqpage, result; boolean_t growstack, is_first_object_locked, wired; int map_generation; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ_MAX]; int hardfault; struct faultstate fs; struct vnode *vp; int locked, error; hardfault = 0; growstack = TRUE; PCPU_INC(cnt.v_vm_faults); fs.vp = NULL; fs.vfslocked = 0; faultcount = reqpage = 0; RetryFault:; /* * Find the backing store object and offset into it to begin the * search. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { if (growstack && result == KERN_INVALID_ADDRESS && map != kernel_map) { result = vm_map_growstack(curproc, vaddr); if (result != KERN_SUCCESS) return (KERN_FAILURE); growstack = FALSE; goto RetryFault; } return (result); } map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { if ((curthread->td_pflags & TDP_DEVMEMIO) != 0) { vm_map_unlock_read(fs.map); return (KERN_FAILURE); } panic("vm_fault: fault on nofault entry, addr: %lx", (u_long)vaddr); } if (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION && fs.entry->wiring_thread != curthread) { vm_map_unlock_read(fs.map); vm_map_lock(fs.map); if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) && (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) { fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; vm_map_unlock_and_wait(fs.map, 0); } else vm_map_unlock(fs.map); goto RetryFault; } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. This must be done after * obtaining the vnode lock in order to avoid possible deadlocks. */ VM_OBJECT_LOCK(fs.first_object); vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.lookup_still_valid = TRUE; if (wired) fault_type = prot | (fault_type & VM_PROT_COPY); fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { /* * If the object is dead, we stop here */ if (fs.object->flags & OBJ_DEAD) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { /* * check for page-based copy on write. * We check fs.object == fs.first_object so * as to ensure the legacy COW mechanism is * used when the page in question is part of * a shadow object. Otherwise, vm_page_cowfault() * removes the page from the backing object, * which is not what we want. */ vm_page_lock(fs.m); if ((fs.m->cow) && (fault_type & VM_PROT_WRITE) && (fs.object == fs.first_object)) { vm_page_cowfault(fs.m); unlock_and_deallocate(&fs); goto RetryFault; } /* * Wait/Retry if the page is busy. We have to do this * if the page is busy via either VPO_BUSY or * vm_page_t->busy because the vm_pager may be using * vm_page_t->busy for pageouts ( and even pageins if * it is the vnode pager ), and we could end up trying * to pagein and pageout the same page simultaneously. 
* * We can theoretically allow the busy case on a read * fault if the page is marked valid, but since such * pages are typically already pmap'd, putting that * special case in might be more effort then it is * worth. We cannot under any circumstances mess * around with a vm_page_t->busy page except, perhaps, * to pmap it. */ if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(fs.m, PGA_REFERENCED); vm_page_unlock(fs.m); if (fs.object != fs.first_object) { if (!VM_OBJECT_TRYLOCK( fs.first_object)) { VM_OBJECT_UNLOCK(fs.object); VM_OBJECT_LOCK(fs.first_object); VM_OBJECT_LOCK(fs.object); } vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); vm_object_pip_wakeup(fs.first_object); VM_OBJECT_UNLOCK(fs.first_object); fs.first_m = NULL; } unlock_map(&fs); if (fs.m == vm_page_lookup(fs.object, fs.pindex)) { vm_page_sleep_if_busy(fs.m, TRUE, "vmpfw"); } vm_object_pip_wakeup(fs.object); VM_OBJECT_UNLOCK(fs.object); PCPU_INC(cnt.v_intrans); vm_object_deallocate(fs.first_object); goto RetryFault; } vm_pageq_remove(fs.m); vm_page_unlock(fs.m); /* * Mark page busy for other processes, and the * pagedaemon. If it still isn't completely valid * (readable), jump to readrest, else break-out ( we * found the page ). */ vm_page_busy(fs.m); if (fs.m->valid != VM_PAGE_BITS_ALL) goto readrest; break; } /* * Page is not resident, If this is the search termination * or the pager might contain the page, allocate a new page. */ if (TRYPAGER || fs.object == fs.first_object) { if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At * worst, the P_KILLED might be not observed * there, and allocation can fail, causing * restart and new reading of the p_flag. */ fs.m = NULL; if (!vm_page_count_severe() || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 if ((fs.object->flags & OBJ_COLORED) == 0) { fs.object->flags |= OBJ_COLORED; fs.object->pg_color = atop(vaddr) - fs.pindex; } #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs.object->type != OBJT_VNODE && fs.object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; fs.m = vm_page_alloc(fs.object, fs.pindex, alloc_req); } if (fs.m == NULL) { unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; } else if (fs.m->valid == VM_PAGE_BITS_ALL) break; } readrest: /* * We have found a valid page or we have allocated a new page. * The page thus may not be valid or may not be entirely * valid. * * Attempt to fault-in the page if there is a chance that the * pager has it, and potentially fault in additional pages * at the same time. */ if (TRYPAGER) { int rv; u_char behavior = vm_map_entry_behavior(fs.entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { behind = 0; ahead = atop(fs.entry->end - vaddr) - 1; if (ahead > VM_FAULT_READ_AHEAD_MAX) ahead = VM_FAULT_READ_AHEAD_MAX; if (fs.pindex == fs.entry->next_read) vm_fault_cache_behind(&fs, VM_FAULT_READ_MAX); } else { /* * If this is a sequential page fault, then * arithmetically increase the number of pages * in the read-ahead window. Otherwise, reset * the read-ahead window to its smallest size. 
*/ behind = atop(vaddr - fs.entry->start); if (behind > VM_FAULT_READ_BEHIND) behind = VM_FAULT_READ_BEHIND; ahead = atop(fs.entry->end - vaddr) - 1; era = fs.entry->read_ahead; if (fs.pindex == fs.entry->next_read) { nera = era + behind; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; behind = 0; if (ahead > nera) ahead = nera; if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_cache_behind(&fs, VM_FAULT_CACHE_BEHIND); } else if (ahead > VM_FAULT_READ_AHEAD_MIN) ahead = VM_FAULT_READ_AHEAD_MIN; if (era != ahead) fs.entry->read_ahead = ahead; } /* * Call the pager to retrieve the data, if any, after * releasing the lock on the map. We hold a ref on * fs.object and the pages are VPO_BUSY'd. */ unlock_map(&fs); vnode_lock: if (fs.object->type == OBJT_VNODE) { vp = fs.object->handle; if (vp == fs.vp) goto vnode_locked; else if (fs.vp != NULL) { vput(fs.vp); fs.vp = NULL; } locked = VOP_ISLOCKED(vp); if (VFS_NEEDSGIANT(vp->v_mount) && !fs.vfslocked) { fs.vfslocked = 1; if (!mtx_trylock(&Giant)) { VM_OBJECT_UNLOCK(fs.object); mtx_lock(&Giant); VM_OBJECT_LOCK(fs.object); goto vnode_lock; } } if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* Do not sleep for vnode lock while fs.m is busy */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error != 0) { int vfslocked; vfslocked = fs.vfslocked; fs.vfslocked = 0; /* Keep Giant */ vhold(vp); release_page(&fs); unlock_and_deallocate(&fs); error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); vdrop(vp); fs.vp = vp; fs.vfslocked = vfslocked; KASSERT(error == 0, ("vm_fault: vget failed")); goto RetryFault; } fs.vp = vp; } vnode_locked: KASSERT(fs.vp == NULL || !fs.map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * now we find out if any other pages should be paged * in at this time this routine checks to see if the * pages surrounding this fault reside in the same * object as the page for this fault. If they do, * then they are faulted in also into the object. The * array "marray" returned contains an array of * vm_page_t structs where one of them is the * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. * * fs.m plus the additional pages are VPO_BUSY'd. */ faultcount = vm_fault_additional_pages( fs.m, behind, ahead, marray, &reqpage); rv = faultcount ? vm_pager_get_pages(fs.object, marray, faultcount, reqpage) : VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. Leave it busy while we play * with it. */ /* * Relookup in case pager changed page. Pager * is responsible for disposition of old page * if moved. */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (!fs.m) { unlock_and_deallocate(&fs); goto RetryFault; } hardfault++; break; /* break to PAGE HAS BEEN FOUND */ } /* * Remove the bogus page (which does not exist at this * object/offset); before doing so, we must get back * our object lock to preserve our invariant. * * Also wake up any other process that may want to bring * in this page. * * If this is the top-level object, we must leave the * busy page to prevent another process from rushing * past us, and inserting the page in that object at * the same time that we are. */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * Data outside the range of the pager or an I/O error */ /* * XXX - the check for kernel_map is a kludge to work * around having the machine panic on a kernel space * fault w/ I/O error. 
*/ if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE); } if (fs.object != fs.first_object) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! */ } } /* * We get here if the object has default pager (or unwiring) * or the pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); VM_OBJECT_UNLOCK(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; VM_OBJECT_LOCK(fs.object); } fs.first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs.m->flags & PG_ZERO) == 0) { pmap_zero_page(fs.m); } else { PCPU_INC(cnt.v_ozfod); } PCPU_INC(cnt.v_zfod); fs.m->valid = VM_PAGE_BITS_ALL; break; /* break to PAGE HAS BEEN FOUND */ } else { KASSERT(fs.object != next_object, ("object loop %p", next_object)); VM_OBJECT_LOCK(next_object); vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); VM_OBJECT_UNLOCK(fs.object); fs.object = next_object; } } KASSERT((fs.m->oflags & VPO_BUSY) != 0, ("vm_fault: not busy after main loop")); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { /* * This allows pages to be virtually copied from a * backing_object into the first_object, where the * backing object has no other refs to it, and cannot * gain any more refs. Instead of a bcopy, we just * move the page from the backing object to the * first object. Note that we must mark the page * dirty in the first object so that it will go out * to swap when needed. */ is_first_object_locked = FALSE; if ( /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * No one else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ ((fs.object->type == OBJT_DEFAULT) || (fs.object->type == OBJT_SWAP)) && (is_first_object_locked = VM_OBJECT_TRYLOCK(fs.first_object)) && /* * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { /* * get rid of the unnecessary page */ vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); /* * grab the page and put it into the * process'es object. The page is * automatically made dirty. */ vm_page_lock(fs.m); vm_page_rename(fs.m, fs.first_object, fs.first_pindex); vm_page_unlock(fs.m); vm_page_busy(fs.m); fs.first_m = fs.m; fs.m = NULL; PCPU_INC(cnt.v_cow_optim); } else { /* * Oh, well, lets copy it. 
*/ pmap_copy_page(fs.m, fs.first_m); fs.first_m->valid = VM_PAGE_BITS_ALL; if (wired && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) { vm_page_lock(fs.first_m); vm_page_wire(fs.first_m); vm_page_unlock(fs.first_m); vm_page_lock(fs.m); vm_page_unwire(fs.m, FALSE); vm_page_unlock(fs.m); } /* * We no longer need the old page or object. */ release_page(&fs); } /* * fs.object != fs.first_object due to above * conditional */ vm_object_pip_wakeup(fs.object); VM_OBJECT_UNLOCK(fs.object); /* * Only use the new page below... */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; if (!is_first_object_locked) VM_OBJECT_LOCK(fs.object); PCPU_INC(cnt.v_cow_faults); curthread->td_cow++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } fs.lookup_still_valid = TRUE; if (fs.map->timestamp != map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); /* * If we don't need the page any longer, put it on the inactive * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) goto RetryFault; return (result); } if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } } /* * If the page was filled by a pager, update the map entry's * last read offset. Since the pager does not return the * actual set of pages that it read, this update is based on * the requested set. Typically, the requested and actual * sets are the same. * * XXX The following assignment modifies the map * without holding a write lock on it. */ if (hardfault) fs.entry->next_read = fs.pindex + faultcount - reqpage; if ((prot & VM_PROT_WRITE) != 0 || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_object_set_writeable_dirty(fs.object); /* * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if (fs.entry->eflags & MAP_ENTRY_NOSYNC) { if (fs.m->dirty == 0) fs.m->oflags |= VPO_NOSYNC; } else { fs.m->oflags &= ~VPO_NOSYNC; } /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also tell the backing pager, if any, that it should remove * any swap backing since the page is now dirty. 
*/ if (((fault_type & VM_PROT_WRITE) != 0 && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_page_dirty(fs.m); vm_pager_page_unswapped(fs.m); } } /* * Page had better still be busy */ KASSERT(fs.m->oflags & VPO_BUSY, ("vm_fault: page %p not busy!", fs.m)); /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ KASSERT(fs.m->valid == VM_PAGE_BITS_ALL, ("vm_fault: page %p partially invalid", fs.m)); VM_OBJECT_UNLOCK(fs.object); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired); if ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 && wired == 0) vm_fault_prefault(fs.map->pmap, vaddr, fs.entry); VM_OBJECT_LOCK(fs.object); vm_page_lock(fs.m); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if (fault_flags & VM_FAULT_CHANGE_WIRING) { if (wired) vm_page_wire(fs.m); else vm_page_unwire(fs.m, 1); } else vm_page_activate(fs.m); if (m_hold != NULL) { *m_hold = fs.m; vm_page_hold(fs.m); } vm_page_unlock(fs.m); vm_page_wakeup(fs.m); /* * Unlock everything, and return */ unlock_and_deallocate(&fs); if (hardfault) curthread->td_ru.ru_majflt++; else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); }
static void mac_proc_vm_revoke_recurse(struct thread *td, struct ucred *cred, struct vm_map *map) { vm_map_entry_t vme; int vfslocked, result; vm_prot_t revokeperms; vm_object_t backing_object, object; vm_ooffset_t offset; struct vnode *vp; struct mount *mp; if (!mac_mmap_revocation) return; vm_map_lock(map); for (vme = map->header.next; vme != &map->header; vme = vme->next) { if (vme->eflags & MAP_ENTRY_IS_SUB_MAP) { mac_proc_vm_revoke_recurse(td, cred, vme->object.sub_map); continue; } /* * Skip over entries that obviously are not shared. */ if (vme->eflags & (MAP_ENTRY_COW | MAP_ENTRY_NOSYNC) || !vme->max_protection) continue; /* * Drill down to the deepest backing object. */ offset = vme->offset; object = vme->object.vm_object; if (object == NULL) continue; VM_OBJECT_LOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_LOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_UNLOCK(object); object = backing_object; } VM_OBJECT_UNLOCK(object); /* * At the moment, vm_maps and objects aren't considered by * the MAC system, so only things with backing by a normal * object (read: vnodes) are checked. */ if (object->type != OBJT_VNODE) continue; vp = (struct vnode *)object->handle; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); result = vme->max_protection; mac_vnode_check_mmap_downgrade(cred, vp, &result); VOP_UNLOCK(vp, 0); /* * Find out what maximum protection we may be allowing now * but a policy needs to get removed. */ revokeperms = vme->max_protection & ~result; if (!revokeperms) { VFS_UNLOCK_GIANT(vfslocked); continue; } printf("pid %ld: revoking %s perms from %#lx:%ld " "(max %s/cur %s)\n", (long)td->td_proc->p_pid, prot2str(revokeperms), (u_long)vme->start, (long)(vme->end - vme->start), prot2str(vme->max_protection), prot2str(vme->protection)); /* * This is the really simple case: if a map has more * max_protection than is allowed, but it's not being * actually used (that is, the current protection is still * allowed), we can just wipe it out and do nothing more. */ if ((vme->protection & revokeperms) == 0) { vme->max_protection -= revokeperms; } else { if (revokeperms & VM_PROT_WRITE) { /* * In the more complicated case, flush out all * pending changes to the object then turn it * copy-on-write. */ vm_object_reference(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VM_OBJECT_LOCK(object); vm_object_page_clean(object, offset, offset + vme->end - vme->start, OBJPC_SYNC); VM_OBJECT_UNLOCK(object); VOP_UNLOCK(vp, 0); vn_finished_write(mp); vm_object_deallocate(object); /* * Why bother if there's no read permissions * anymore? For the rest, we need to leave * the write permissions on for COW, or * remove them entirely if configured to. */ if (!mac_mmap_revocation_via_cow) { vme->max_protection &= ~VM_PROT_WRITE; vme->protection &= ~VM_PROT_WRITE; } if ((revokeperms & VM_PROT_READ) == 0) vme->eflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY; } if (revokeperms & VM_PROT_EXECUTE) { vme->max_protection &= ~VM_PROT_EXECUTE; vme->protection &= ~VM_PROT_EXECUTE; } if (revokeperms & VM_PROT_READ) { vme->max_protection = 0; vme->protection = 0; } pmap_protect(map->pmap, vme->start, vme->end, vme->protection & ~revokeperms); vm_map_simplify_entry(map, vme); } VFS_UNLOCK_GIANT(vfslocked); } vm_map_unlock(map); }