/*
 * non-anonymous, non-stack descriptor mappings only!
 *
 * This routine is mostly snarfed from vm/vm_mmap.c
 */
int
fp_mmap(void *addr_arg, size_t size, int prot, int flags, struct file *fp,
        off_t pos, void **resp)
{
        struct thread *td = curthread;
        struct proc *p = td->td_proc;
        vm_size_t pageoff;
        vm_prot_t maxprot;
        vm_offset_t addr;
        void *handle;
        int error;
        vm_object_t obj;
        struct vmspace *vms = p->p_vmspace;
        struct vnode *vp;

        prot &= VM_PROT_ALL;

        if ((ssize_t)size < 0 || (flags & MAP_ANON))
                return (EINVAL);

        pageoff = (pos & PAGE_MASK);
        pos -= pageoff;

        /* Adjust size for rounding (on both ends). */
        size += pageoff;                        /* low end... */
        size = (vm_size_t)round_page(size);     /* hi end */
        addr = (vm_offset_t)addr_arg;

        /*
         * Check for illegal addresses.  Watch out for address wrap...  Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        if (flags & MAP_FIXED) {
                /*
                 * The specified address must have the same remainder
                 * as the file offset taken modulo PAGE_SIZE, so it
                 * should be aligned after adjustment by pageoff.
                 */
                addr -= pageoff;
                if (addr & PAGE_MASK)
                        return (EINVAL);

                /* Address range must be all in user VM space. */
                if (VM_MAX_USER_ADDRESS > 0 &&
                    addr + size > VM_MAX_USER_ADDRESS)
                        return (EINVAL);
                if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
                        return (EINVAL);
                if (addr + size < addr)
                        return (EINVAL);
        } else if (addr == 0 ||
                   (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
                    addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz))) {
                /*
                 * XXX for non-fixed mappings where no hint is provided or
                 * the hint would fall in the potential heap space,
                 * place it after the end of the largest possible heap.
                 *
                 * There should really be a pmap call to determine a
                 * reasonable location.
                 */
                addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);
        }

        /*
         * Mapping file, get fp for validation.  Obtain vnode and make
         * sure it is of appropriate type.
         */
        if (fp->f_type != DTYPE_VNODE)
                return (EINVAL);

        /*
         * POSIX shared-memory objects are defined to have
         * kernel persistence, and are not defined to support
         * read(2)/write(2) -- or even open(2).  Thus, we can
         * use MAP_ASYNC to trade on-disk coherence for speed.
         * The shm_open(3) library routine turns on the FPOSIXSHM
         * flag to request this behavior.
         */
        if (fp->f_flag & FPOSIXSHM)
                flags |= MAP_NOSYNC;
        vp = (struct vnode *)fp->f_data;
        if (vp->v_type != VREG && vp->v_type != VCHR)
                return (EINVAL);

        /*
         * Get the proper underlying object.
         */
        if (vp->v_type == VREG) {
                if ((obj = vp->v_object) == NULL)
                        return (EINVAL);
                KKASSERT(vp == (struct vnode *)obj->handle);
        }

        /*
         * XXX hack to handle use of /dev/zero to map anon memory (ala
         * SunOS).
         */
        if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
                handle = NULL;
                maxprot = VM_PROT_ALL;
                flags |= MAP_ANON;
                pos = 0;
        } else {
                /*
                 * cdevs do not provide private mappings of any kind.
                 */
                if (vp->v_type == VCHR &&
                    (flags & (MAP_PRIVATE|MAP_COPY))) {
                        error = EINVAL;
                        goto done;
                }

                /*
                 * Ensure that file and memory protections are
                 * compatible.  Note that we only worry about
                 * writability if mapping is shared; in this case,
                 * current and max prot are dictated by the open file.
                 * XXX use the vnode instead?  Problem is: what
                 * credentials do we use for determination?  What if
                 * proc does a setuid?
                 */
                maxprot = VM_PROT_EXECUTE;      /* ??? */
                if (fp->f_flag & FREAD) {
                        maxprot |= VM_PROT_READ;
                } else if (prot & PROT_READ) {
                        error = EACCES;
                        goto done;
                }

                /*
                 * If we are sharing potential changes (either via
                 * MAP_SHARED or via the implicit sharing of character
                 * device mappings), and we are trying to get write
                 * permission although we opened it without asking
                 * for it, bail out.
                 */
                if ((flags & MAP_SHARED) != 0 || vp->v_type == VCHR) {
                        if ((fp->f_flag & FWRITE) != 0) {
                                struct vattr va;

                                if ((error = VOP_GETATTR(vp, &va))) {
                                        goto done;
                                }
                                if ((va.va_flags & (IMMUTABLE|APPEND)) == 0) {
                                        maxprot |= VM_PROT_WRITE;
                                } else if (prot & PROT_WRITE) {
                                        error = EPERM;
                                        goto done;
                                }
                        } else if ((prot & PROT_WRITE) != 0) {
                                error = EACCES;
                                goto done;
                        }
                } else {
                        maxprot |= VM_PROT_WRITE;
                }
                handle = (void *)vp;
        }

        error = vm_mmap(&vms->vm_map, &addr, size, prot,
                        maxprot, flags, handle, pos);
        if (error == 0 && addr_arg)
                *resp = (void *)addr;
done:
        return (error);
}
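
/*
 * Illustrative sketch, not part of the original source: the pageoff/size
 * adjustment performed by fp_mmap() above, written as standalone userland C.
 * EX_PAGE_SIZE is assumed to be 4096 purely for the example; the kernel uses
 * the architecture's real PAGE_MASK and the round_page() macro instead.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SIZE    4096ULL
#define EX_PAGE_MASK    (EX_PAGE_SIZE - 1)
#define ex_round_page(x)        (((x) + EX_PAGE_MASK) & ~EX_PAGE_MASK)

int
main(void)
{
        uint64_t pos = 0x1234;                  /* unaligned file offset */
        uint64_t size = 100;                    /* requested mapping length */
        uint64_t pageoff;

        pageoff = pos & EX_PAGE_MASK;           /* 0x234, offset within page */
        pos -= pageoff;                         /* 0x1000, page-aligned offset */
        size += pageoff;                        /* grow the low end */
        size = ex_round_page(size);             /* round the high end up */

        /* prints: pos=0x1000 size=0x1000 pageoff=0x234 */
        printf("pos=%#llx size=%#llx pageoff=%#llx\n",
            (unsigned long long)pos, (unsigned long long)size,
            (unsigned long long)pageoff);
        return (0);
}
#endif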

/*
 * mmap_args(void *addr, size_t len, int prot, int flags, int fd,
 *           long pad, off_t pos)
 *
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
 *
 * No requirements
 */
int
kern_mmap(struct vmspace *vms, caddr_t uaddr, size_t ulen, int uprot,
          int uflags, int fd, off_t upos, void **res)
{
        struct thread *td = curthread;
        struct proc *p = td->td_proc;
        struct file *fp = NULL;
        struct vnode *vp;
        vm_offset_t addr;
        vm_offset_t tmpaddr;
        vm_size_t size, pageoff;
        vm_prot_t prot, maxprot;
        void *handle;
        int flags, error;
        off_t pos;
        vm_object_t obj;

        KKASSERT(p);

        addr = (vm_offset_t) uaddr;
        size = ulen;
        prot = uprot & VM_PROT_ALL;
        flags = uflags;
        pos = upos;

        /*
         * Make sure mapping fits into numeric range etc.
         *
         * NOTE: We support the full unsigned range for size now.
         */
        if (((flags & MAP_ANON) && (fd != -1 || pos != 0)))
                return (EINVAL);

        if (size == 0)
                return (EINVAL);

        if (flags & MAP_STACK) {
                if ((fd != -1) ||
                    ((prot & (PROT_READ|PROT_WRITE)) !=
                     (PROT_READ|PROT_WRITE)))
                        return (EINVAL);
                flags |= MAP_ANON;
                pos = 0;
        }

        /*
         * Virtual page tables cannot be used with MAP_STACK.  Apart from
         * it not making any sense, the aux union is used by both
         * types.
         *
         * Because the virtual page table is stored in the backing object
         * and might be updated by the kernel, the mapping must be R+W.
         */
        if (flags & MAP_VPAGETABLE) {
                if (vkernel_enable == 0)
                        return (EOPNOTSUPP);
                if (flags & MAP_STACK)
                        return (EINVAL);
                if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
                        return (EINVAL);
        }

        /*
         * Align the file position to a page boundary,
         * and save its page offset component.
         */
        pageoff = (pos & PAGE_MASK);
        pos -= pageoff;

        /* Adjust size for rounding (on both ends). */
        size += pageoff;                        /* low end... */
        size = (vm_size_t)round_page(size);     /* hi end */
        if (size < ulen)                        /* wrap */
                return (EINVAL);

        /*
         * Check for illegal addresses.  Watch out for address wrap...  Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        if (flags & (MAP_FIXED | MAP_TRYFIXED)) {
                /*
                 * The specified address must have the same remainder
                 * as the file offset taken modulo PAGE_SIZE, so it
                 * should be aligned after adjustment by pageoff.
                 */
                addr -= pageoff;
                if (addr & PAGE_MASK)
                        return (EINVAL);

                /*
                 * Address range must be all in user VM space and not wrap.
                 */
                tmpaddr = addr + size;
                if (tmpaddr < addr)
                        return (EINVAL);
                if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
                        return (EINVAL);
                if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
                        return (EINVAL);
        } else {
                /*
                 * Get a hint of where to map.  It also provides mmap offset
                 * randomization if enabled.
                 */
                addr = vm_map_hint(p, addr, prot);
        }

        if (flags & MAP_ANON) {
                /*
                 * Mapping blank space is trivial.
                 */
                handle = NULL;
                maxprot = VM_PROT_ALL;
        } else {
                /*
                 * Mapping file, get fp for validation.  Obtain vnode and
                 * make sure it is of appropriate type.
                 */
                fp = holdfp(p->p_fd, fd, -1);
                if (fp == NULL)
                        return (EBADF);
                if (fp->f_type != DTYPE_VNODE) {
                        error = EINVAL;
                        goto done;
                }

                /*
                 * POSIX shared-memory objects are defined to have
                 * kernel persistence, and are not defined to support
                 * read(2)/write(2) -- or even open(2).  Thus, we can
                 * use MAP_ASYNC to trade on-disk coherence for speed.
                 * The shm_open(3) library routine turns on the FPOSIXSHM
                 * flag to request this behavior.
                 */
                if (fp->f_flag & FPOSIXSHM)
                        flags |= MAP_NOSYNC;
                vp = (struct vnode *)fp->f_data;

                /*
                 * Validate the vnode for the operation.
                 */
                switch (vp->v_type) {
                case VREG:
                        /*
                         * Get the proper underlying object.
                         */
                        if ((obj = vp->v_object) == NULL) {
                                error = EINVAL;
                                goto done;
                        }
                        KKASSERT((struct vnode *)obj->handle == vp);
                        break;
                case VCHR:
                        /*
                         * Make sure a device has not been revoked.
                         * Mappability is handled by the device layer.
                         */
                        if (vp->v_rdev == NULL) {
                                error = EBADF;
                                goto done;
                        }
                        break;
                default:
                        /*
                         * Nothing else is mappable.
                         */
                        error = EINVAL;
                        goto done;
                }

                /*
                 * XXX hack to handle use of /dev/zero to map anon memory
                 * (ala SunOS).
                 */
                if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
                        handle = NULL;
                        maxprot = VM_PROT_ALL;
                        flags |= MAP_ANON;
                        pos = 0;
                } else {
                        /*
                         * cdevs do not provide private mappings of any kind.
                         */
                        if (vp->v_type == VCHR &&
                            (flags & (MAP_PRIVATE|MAP_COPY))) {
                                error = EINVAL;
                                goto done;
                        }

                        /*
                         * Ensure that file and memory protections are
                         * compatible.  Note that we only worry about
                         * writability if mapping is shared; in this case,
                         * current and max prot are dictated by the open file.
                         * XXX use the vnode instead?  Problem is: what
                         * credentials do we use for determination?  What if
                         * proc does a setuid?
                         */
                        maxprot = VM_PROT_EXECUTE;      /* ??? */
                        if (fp->f_flag & FREAD) {
                                maxprot |= VM_PROT_READ;
                        } else if (prot & PROT_READ) {
                                error = EACCES;
                                goto done;
                        }

                        /*
                         * If we are sharing potential changes (either via
                         * MAP_SHARED or via the implicit sharing of character
                         * device mappings), and we are trying to get write
                         * permission although we opened it without asking
                         * for it, bail out.  Check for superuser, only if
                         * we're at securelevel < 1, to allow the XIG X server
                         * to continue to work.
                         */
                        if ((flags & MAP_SHARED) != 0 || vp->v_type == VCHR) {
                                if ((fp->f_flag & FWRITE) != 0) {
                                        struct vattr va;

                                        if ((error = VOP_GETATTR(vp, &va))) {
                                                goto done;
                                        }
                                        if ((va.va_flags &
                                            (IMMUTABLE|APPEND)) == 0) {
                                                maxprot |= VM_PROT_WRITE;
                                        } else if (prot & PROT_WRITE) {
                                                error = EPERM;
                                                goto done;
                                        }
                                } else if ((prot & PROT_WRITE) != 0) {
                                        error = EACCES;
                                        goto done;
                                }
                        } else {
                                maxprot |= VM_PROT_WRITE;
                        }
                        handle = (void *)vp;
                }
        }

        lwkt_gettoken(&vms->vm_map.token);

        /*
         * Do not allow more than a certain number of vm_map_entry structures
         * per process.  Scale with the number of rforks sharing the map
         * to make the limit reasonable for threads.
         */
        if (max_proc_mmap &&
            vms->vm_map.nentries >= max_proc_mmap * vmspace_getrefs(vms)) {
                error = ENOMEM;
                lwkt_reltoken(&vms->vm_map.token);
                goto done;
        }

        error = vm_mmap(&vms->vm_map, &addr, size, prot,
                        maxprot, flags, handle, pos);

        if (error == 0)
                *res = (void *)(addr + pageoff);

        lwkt_reltoken(&vms->vm_map.token);

done:
        if (fp)
                fdrop(fp);

        return (error);
}
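
/*
 * Illustrative userland sketch, not part of this kernel source.  It shows the
 * behavior documented in the comment above kern_mmap(): an unaligned file
 * offset is accepted for non-fixed mappings, the kernel maps from
 * trunc_page(offset), and the address handed back to the caller already has
 * the page offset added, so it points at the requested byte.  The file name
 * "example.dat" is an assumption for the example only; strictly POSIX systems
 * may reject an unaligned offset with EINVAL, so portable code should
 * page-align it.
 */
#if 0
#include <sys/mman.h>

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        long pagesz = sysconf(_SC_PAGESIZE);
        off_t offset = 100;                     /* deliberately unaligned */
        size_t len = 4096;
        char *p;
        int fd;

        fd = open("example.dat", O_RDONLY);
        if (fd < 0)
                return (1);

        p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, offset);
        if (p == MAP_FAILED) {
                close(fd);
                return (1);
        }

        /*
         * p and offset share the same remainder modulo the page size;
         * p[0] is byte `offset` of the file.
         */
        printf("p %% pagesz = %ld, offset %% pagesz = %ld\n",
            (long)((uintptr_t)p % (uintptr_t)pagesz), (long)(offset % pagesz));

        close(fd);
        return (0);
}
#endif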

int
sys_mmap(struct proc *p, void *v, register_t *retval)
{
        struct sys_mmap_args /* {
                syscallarg(void *) addr;
                syscallarg(size_t) len;
                syscallarg(int) prot;
                syscallarg(int) flags;
                syscallarg(int) fd;
                syscallarg(long) pad;
                syscallarg(off_t) pos;
        } */ *uap = v;
        vaddr_t addr;
        struct vattr va;
        off_t pos;
        vsize_t size, pageoff;
        vm_prot_t prot, maxprot;
        int flags, fd;
        vaddr_t vm_min_address = VM_MIN_ADDRESS;
        struct filedesc *fdp = p->p_fd;
        struct file *fp = NULL;
        struct vnode *vp;
        caddr_t handle;
        int error;

        /*
         * first, extract syscall args from the uap.
         */
        addr = (vaddr_t) SCARG(uap, addr);
        size = (vsize_t) SCARG(uap, len);
        prot = SCARG(uap, prot);
        flags = SCARG(uap, flags);
        fd = SCARG(uap, fd);
        pos = SCARG(uap, pos);

        /*
         * Fix up the old deprecated MAP_COPY into MAP_PRIVATE, and
         * validate the flags.
         */
        if ((prot & VM_PROT_ALL) != prot)
                return (EINVAL);
        if ((flags & MAP_FLAGMASK) != flags)
                return (EINVAL);
        if (flags & MAP_COPY)
                flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
        if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
                return (EINVAL);
        if (flags & MAP_DENYWRITE)
                return (EINVAL);

        /*
         * align file position and save offset.  adjust size.
         */
        ALIGN_ADDR(pos, size, pageoff);

        /*
         * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
         */
        if (flags & MAP_FIXED) {
                /* adjust address by the same amount as we did the offset */
                addr -= pageoff;
                if (addr & PAGE_MASK)
                        return (EINVAL);        /* not page aligned */
                if (addr > SIZE_MAX - size)
                        return (EINVAL);        /* no wrapping! */
                if (VM_MAXUSER_ADDRESS > 0 &&
                    (addr + size) > VM_MAXUSER_ADDRESS)
                        return (EINVAL);
                if (vm_min_address > 0 && addr < vm_min_address)
                        return (EINVAL);
        } else {
                /*
                 * not fixed: make sure we skip over the largest possible
                 * heap.  we will refine our guess later (e.g. to account
                 * for VAC, etc)
                 */
                if (addr == 0)
                        addr = uvm_map_hint(p, prot);
                else if (!(flags & MAP_TRYFIXED) &&
                    addr < (vaddr_t)p->p_vmspace->vm_daddr)
                        addr = uvm_map_hint(p, prot);
        }

        /*
         * check for file mappings (i.e. not anonymous) and verify file.
         */
        if ((flags & MAP_ANON) == 0) {
                if ((fp = fd_getfile(fdp, fd)) == NULL)
                        return (EBADF);

                FREF(fp);

                if (fp->f_type != DTYPE_VNODE) {
                        error = ENODEV;         /* only mmap vnodes! */
                        goto out;
                }
                vp = (struct vnode *)fp->f_data;        /* convert to vnode */

                if (vp->v_type != VREG && vp->v_type != VCHR &&
                    vp->v_type != VBLK) {
                        error = ENODEV; /* only REG/CHR/BLK support mmap */
                        goto out;
                }

                if (vp->v_type == VREG && (pos + size) < pos) {
                        error = EINVAL;         /* no offset wrapping */
                        goto out;
                }

                /* special case: catch SunOS style /dev/zero */
                if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
                        flags |= MAP_ANON;
                        FRELE(fp);
                        fp = NULL;
                        goto is_anon;
                }

                /*
                 * Old programs may not select a specific sharing type, so
                 * default to an appropriate one.
                 *
                 * XXX: how does MAP_ANON fit in the picture?
                 */
                if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
                        printf("WARNING: defaulted mmap() share type to "
                            "%s (pid %d comm %s)\n",
                            vp->v_type == VCHR ? "MAP_SHARED" : "MAP_PRIVATE",
                            p->p_pid, p->p_comm);
#endif
                        if (vp->v_type == VCHR)
                                flags |= MAP_SHARED;    /* for a device */
                        else
                                flags |= MAP_PRIVATE;   /* for a file */
                }

                /*
                 * MAP_PRIVATE device mappings don't make sense (and aren't
                 * supported anyway).  However, some programs rely on this,
                 * so just change it to MAP_SHARED.
                 */
                if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
                        flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
                }

#ifdef ANOUBIS
                /* Force DENYWRITE mappings if file->denywrite is set. */
                if (fp->denywrite)
                        flags |= MAP_DENYWRITE;
#endif

                /*
                 * now check protection
                 */

                /*
                 * Don't allow the file to be mapped into executable memory if
                 * the underlying file system is marked as 'noexec'.
                 */
                if (prot & PROT_EXEC && vp->v_mount->mnt_flag & MNT_NOEXEC) {
                        error = EACCES;
                        goto out;
                }

                maxprot = VM_PROT_EXECUTE;

                /* check read access */
                if (fp->f_flag & FREAD)
                        maxprot |= VM_PROT_READ;
                else if (prot & PROT_READ) {
                        error = EACCES;
                        goto out;
                }

                /* PROT_EXEC only makes sense if the descriptor is readable. */
                if (!(fp->f_flag & FREAD) && prot & PROT_EXEC) {
                        error = EACCES;
                        goto out;
                }

                /* check write access, shared case first */
                if (flags & MAP_SHARED) {
                        /*
                         * if the file is writable, only add PROT_WRITE to
                         * maxprot if the file is not immutable or append-only.
                         * otherwise, if we have asked for PROT_WRITE, return
                         * EPERM.
                         */
                        if (fp->f_flag & FWRITE) {
                                if ((error = VOP_GETATTR(vp, &va,
                                    p->p_ucred, p)))
                                        goto out;
                                if ((va.va_flags & (IMMUTABLE|APPEND)) == 0)
                                        maxprot |= VM_PROT_WRITE;
                                else if (prot & PROT_WRITE) {
                                        error = EPERM;
                                        goto out;
                                }
                        } else if (prot & PROT_WRITE) {
                                error = EACCES;
                                goto out;
                        }
                } else {
                        /* MAP_PRIVATE mappings can always be written to */
                        maxprot |= VM_PROT_WRITE;
                }

#ifdef MAC
                error = mac_vnode_check_mmap(p->p_ucred, vp, prot, flags);
                if (error)
                        goto out;
#endif

                vfs_mark_atime(vp, p->p_ucred);

                /*
                 * set handle to vnode
                 */
                handle = (caddr_t)vp;
        } else {                /* MAP_ANON case */
                /*
                 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
                 */
                if (fd != -1) {
                        error = EINVAL;
                        goto out;
                }

is_anon:        /* label for SunOS style /dev/zero */
                handle = NULL;
                maxprot = VM_PROT_ALL;
                pos = 0;
        }

        if ((flags & MAP_ANON) != 0 ||
            ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
                if (size >
                    (p->p_rlimit[RLIMIT_DATA].rlim_cur -
                     ptoa(p->p_vmspace->vm_dused))) {
                        error = ENOMEM;
                        goto out;
                }
        }

        /*
         * now let kernel internal function uvm_mmap do the work.
         */
        error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
            flags, handle, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur, p);

        if (error == 0)
                /* remember to add offset */
                *retval = (register_t)(addr + pageoff);

out:
        if (fp)
                FRELE(fp);
        return (error);
}
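
/*
 * Illustrative sketch, not part of the original source: the maxprot
 * derivation that fp_mmap(), kern_mmap() and sys_mmap() above all share,
 * reduced to a pure function.  The name ex_file_maxprot() and its boolean
 * parameters are invented for the example; the kernel derives them from
 * fp->f_flag (FREAD/FWRITE), the vnode type and VOP_GETATTR() as shown above.
 */
#if 0
#include <sys/mman.h>

#include <errno.h>

/*
 * Compute the maximum protection for a descriptor-backed mapping, or return
 * -1 and set *errp when the requested protection cannot be granted.
 */
static int
ex_file_maxprot(int prot, int shared, int can_read, int can_write,
    int immutable_or_append, int *errp)
{
        int maxprot = PROT_EXEC;

        if (can_read) {
                maxprot |= PROT_READ;
        } else if (prot & PROT_READ) {
                *errp = EACCES;         /* read wanted, descriptor unreadable */
                return (-1);
        }

        if (shared) {
                if (can_write) {
                        if (!immutable_or_append)
                                maxprot |= PROT_WRITE;
                        else if (prot & PROT_WRITE) {
                                *errp = EPERM;  /* immutable/append-only file */
                                return (-1);
                        }
                } else if (prot & PROT_WRITE) {
                        *errp = EACCES; /* write wanted, read-only descriptor */
                        return (-1);
                }
        } else {
                /* private (copy-on-write) mappings may always be written */
                maxprot |= PROT_WRITE;
        }
        return (maxprot);
}
#endif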