/*
 * Copy a binary buffer from kernel space to user space.
 *
 * Returns 0 on success, EFAULT on failure.
 */
int
copyout(const void *kaddr, void *udaddr, size_t len)
{
        struct vmspace *vm = curproc->p_vmspace;
        struct lwbuf *lwb;
        struct lwbuf lwb_cache;
        vm_page_t m;
        int error;
        size_t n;

        error = 0;
        while (len) {
                m = vm_fault_page(&vm->vm_map,
                                  trunc_page((vm_offset_t)udaddr),
                                  VM_PROT_READ|VM_PROT_WRITE,
                                  VM_FAULT_NORMAL, &error);
                if (error)
                        break;
                n = PAGE_SIZE - ((vm_offset_t)udaddr & PAGE_MASK);
                if (n > len)
                        n = len;
                lwb = lwbuf_alloc(m, &lwb_cache);
                bcopy(kaddr, (char *)lwbuf_kva(lwb) +
                             ((vm_offset_t)udaddr & PAGE_MASK), n);
                len -= n;
                udaddr = (char *)udaddr + n;
                kaddr = (const char *)kaddr + n;
                vm_page_dirty(m);
                lwbuf_free(lwb);
                vm_page_unhold(m);
        }
        return (error);
}
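/*
 * Example (illustrative sketch, not part of the original source): a
 * hypothetical handler that returns a kernel structure to a caller-supplied
 * user pointer.  The example_stats structure and the handler name are
 * invented for the example; only the copyout() call and its EFAULT-style
 * error return follow the function above.
 */
struct example_stats {
        long    es_calls;
        long    es_errors;
};

static int
example_return_stats(void *udata, const struct example_stats *stats)
{
        int error;

        /* Copy the kernel-resident structure out to user space. */
        error = copyout(stats, udata, sizeof(*stats));

        /*
         * A non-zero return (typically EFAULT) means the user address
         * range could not be faulted in writable; propagate it.
         */
        return (error);
}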
/*
 * Copy the specified number of bytes from userland to the xio.
 * Return an error code or 0 on success.
 *
 * uoffset is the abstracted starting offset in the XIO, not the actual
 * offset, and usually starts at 0.
 *
 * Data in pages backing the XIO will be modified.
 */
int
xio_copy_utox(xio_t xio, int uoffset, const void *uptr, int bytes)
{
        int i;
        int n;
        int error;
        int offset;
        vm_page_t m;
        struct lwbuf *lwb;
        struct lwbuf lwb_cache;

        if (uoffset + bytes > xio->xio_bytes)
                return(EFAULT);

        offset = (xio->xio_offset + uoffset) & PAGE_MASK;
        if ((n = PAGE_SIZE - offset) > bytes)
                n = bytes;

        error = 0;
        for (i = (xio->xio_offset + uoffset) >> PAGE_SHIFT;
             i < xio->xio_npages;
             ++i
        ) {
                m = xio->xio_pages[i];
                lwb = lwbuf_alloc(m, &lwb_cache);
                error = copyin(uptr, (char *)lwbuf_kva(lwb) + offset, n);
                lwbuf_free(lwb);
                if (error)
                        break;
                bytes -= n;
                uptr = (const char *)uptr + n;
                if (bytes == 0)
                        break;
                if ((n = bytes) > PAGE_SIZE)
                        n = PAGE_SIZE;
                offset = 0;
        }
        return(error);
}
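/*
 * Example (illustrative sketch): stage a user buffer into the pages backing
 * an XIO by describing a kernel bounce buffer with xio_init_kbuf() and then
 * filling it with xio_copy_utox().  The xio_init_kbuf()/xio_release()
 * helpers are assumed to be the usual kern_xio.c interfaces, and the
 * bounce-buffer approach itself is only an assumption for the example.
 */
static int
example_stage_user_data(void *uptr, int bytes, char *kbounce)
{
        struct xio xio;
        int error;

        /* Describe the kernel bounce buffer with an XIO. */
        error = xio_init_kbuf(&xio, kbounce, bytes);
        if (error)
                return (error);

        /* Pull the user data into the pages backing the XIO. */
        error = xio_copy_utox(&xio, 0, uptr, bytes);

        xio_release(&xio);
        return (error);
}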
/*
 * Copy the specified number of bytes from the xio to a kernel
 * buffer.  Return an error code or 0 on success.
 *
 * uoffset is the abstracted starting offset in the XIO, not the actual
 * offset, and usually starts at 0.
 *
 * The XIO is not modified.
 */
int
xio_copy_xtok(xio_t xio, int uoffset, void *kptr, int bytes)
{
        int i;
        int n;
        int error;
        int offset;
        vm_page_t m;
        struct lwbuf *lwb;
        struct lwbuf lwb_cache;

        if (bytes + uoffset > xio->xio_bytes)
                return(EFAULT);

        offset = (xio->xio_offset + uoffset) & PAGE_MASK;
        if ((n = PAGE_SIZE - offset) > bytes)
                n = bytes;

        error = 0;
        for (i = (xio->xio_offset + uoffset) >> PAGE_SHIFT;
             i < xio->xio_npages;
             ++i
        ) {
                m = xio->xio_pages[i];
                lwb = lwbuf_alloc(m, &lwb_cache);
                bcopy((char *)lwbuf_kva(lwb) + offset, kptr, n);
                lwbuf_free(lwb);
                bytes -= n;
                kptr = (char *)kptr + n;
                if (bytes == 0)
                        break;
                if ((n = bytes) > PAGE_SIZE)
                        n = PAGE_SIZE;
                offset = 0;
        }
        return(error);
}
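/*
 * Example (illustrative sketch): drain the beginning of an already
 * initialized XIO into a private kernel buffer.  The helper name and the
 * use of M_TEMP/kmalloc are assumptions made for the example; only the
 * xio_copy_xtok() usage mirrors the function above.
 */
static int
example_peek_xio(xio_t xio, int bytes, void **bufp)
{
        char *buf;
        int error;

        if (bytes > xio->xio_bytes)
                return (EINVAL);

        buf = kmalloc(bytes, M_TEMP, M_WAITOK);

        /* Copy from the XIO's pages; the XIO itself is not modified. */
        error = xio_copy_xtok(xio, 0, buf, bytes);
        if (error) {
                kfree(buf, M_TEMP);
                return (error);
        }
        *bufp = buf;
        return (0);
}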
/*
 * Implement uiomove(9) from physical memory using lwbuf's to reduce
 * the creation and destruction of ephemeral mappings.
 */
int
uiomove_fromphys(vm_page_t *ma, vm_offset_t offset, size_t n, struct uio *uio)
{
        struct lwbuf lwb_cache;
        struct lwbuf *lwb;
        struct thread *td = curthread;
        struct iovec *iov;
        void *cp;
        vm_offset_t page_offset;
        vm_page_t m;
        size_t cnt;
        int error = 0;
        int save = 0;

        KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
                ("uiomove_fromphys: mode"));
        KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
                ("uiomove_fromphys proc"));

        crit_enter();
        save = td->td_flags & TDF_DEADLKTREAT;
        td->td_flags |= TDF_DEADLKTREAT;
        crit_exit();

        while (n > 0 && uio->uio_resid) {
                iov = uio->uio_iov;
                cnt = iov->iov_len;
                if (cnt == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        continue;
                }
                if (cnt > n)
                        cnt = n;
                page_offset = offset & PAGE_MASK;
                cnt = min(cnt, PAGE_SIZE - page_offset);
                m = ma[offset >> PAGE_SHIFT];
                lwb = lwbuf_alloc(m, &lwb_cache);
                cp = (char *)lwbuf_kva(lwb) + page_offset;

                switch (uio->uio_segflg) {
                case UIO_USERSPACE:
                        /*
                         * note: removed uioyield (it was the wrong place to
                         * put it).
                         */
                        if (uio->uio_rw == UIO_READ)
                                error = copyout(cp, iov->iov_base, cnt);
                        else
                                error = copyin(iov->iov_base, cp, cnt);
                        if (error) {
                                lwbuf_free(lwb);
                                goto out;
                        }
                        break;
                case UIO_SYSSPACE:
                        if (uio->uio_rw == UIO_READ)
                                bcopy(cp, iov->iov_base, cnt);
                        else
                                bcopy(iov->iov_base, cp, cnt);
                        break;
                case UIO_NOCOPY:
                        break;
                }
                lwbuf_free(lwb);
                iov->iov_base = (char *)iov->iov_base + cnt;
                iov->iov_len -= cnt;
                uio->uio_resid -= cnt;
                uio->uio_offset += cnt;
                offset += cnt;
                n -= cnt;
        }
out:
        if (save == 0) {
                crit_enter();
                td->td_flags &= ~TDF_DEADLKTREAT;
                crit_exit();
        }
        return (error);
}
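/*
 * Example (illustrative sketch): a hypothetical device read routine that
 * exposes a wired, physically backed page array through uiomove_fromphys().
 * The example_softc layout and field names are invented for illustration;
 * only the uiomove_fromphys() call reflects the code above.
 */
struct example_softc {
        vm_page_t       *sc_pages;      /* wired backing pages */
        size_t          sc_size;        /* total size in bytes */
};

static int
example_dev_read(struct example_softc *sc, struct uio *uio)
{
        size_t n;

        if ((size_t)uio->uio_offset >= sc->sc_size)
                return (0);             /* EOF */
        n = sc->sc_size - (size_t)uio->uio_offset;
        if (n > uio->uio_resid)
                n = uio->uio_resid;

        /*
         * uiomove_fromphys() walks the page array starting at the byte
         * offset and handles the per-page lwbuf mappings internally.
         */
        return (uiomove_fromphys(sc->sc_pages, (vm_offset_t)uio->uio_offset,
                                 n, uio));
}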
/*
 * Lets the VM system know about a change in size for a file.
 * We adjust our own internal size and flush any cached pages in
 * the associated object that are affected by the size change.
 *
 * NOTE: This routine may be invoked as a result of a pager put
 * operation (possibly at object termination time), so we must be careful.
 *
 * NOTE: vp->v_filesize is initialized to NOOFFSET (-1), be sure that
 * we do not blow up on the case.  nsize will always be >= 0, however.
 */
void
vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize)
{
        vm_pindex_t nobjsize;
        vm_pindex_t oobjsize;
        vm_object_t object;

        object = vp->v_object;
        if (object == NULL)
                return;
        vm_object_hold(object);
        KKASSERT(vp->v_object == object);

        /*
         * Hasn't changed size
         */
        if (nsize == vp->v_filesize) {
                vm_object_drop(object);
                return;
        }

        /*
         * Has changed size.  Adjust the VM object's size and v_filesize
         * before we start scanning pages to prevent new pages from being
         * allocated during the scan.
         */
        nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
        oobjsize = object->size;
        object->size = nobjsize;

        /*
         * File has shrunk.  Toss any cached pages beyond the new EOF.
         */
        if (nsize < vp->v_filesize) {
                vp->v_filesize = nsize;
                if (nobjsize < oobjsize) {
                        vm_object_page_remove(object, nobjsize, oobjsize,
                                              FALSE);
                }
                /*
                 * This gets rid of garbage at the end of a page that is now
                 * only partially backed by the vnode.  Since we are setting
                 * the entire page valid & clean after we are done we have
                 * to be sure that the portion of the page within the file
                 * bounds is already valid.  If it isn't then making it
                 * valid would create a corrupt block.
                 */
                if (nsize & PAGE_MASK) {
                        vm_offset_t kva;
                        vm_page_t m;

                        m = vm_page_lookup_busy_wait(object, OFF_TO_IDX(nsize),
                                                     TRUE, "vsetsz");

                        if (m && m->valid) {
                                int base = (int)nsize & PAGE_MASK;
                                int size = PAGE_SIZE - base;
                                struct lwbuf *lwb;
                                struct lwbuf lwb_cache;

                                /*
                                 * Clear out partial-page garbage in case
                                 * the page has been mapped.
                                 *
                                 * This is byte aligned.
                                 */
                                lwb = lwbuf_alloc(m, &lwb_cache);
                                kva = lwbuf_kva(lwb);
                                bzero((caddr_t)kva + base, size);
                                lwbuf_free(lwb);

                                /*
                                 * XXX work around SMP data integrity race
                                 * by unmapping the page from user processes.
                                 * The garbage we just cleared may be mapped
                                 * to a user process running on another cpu
                                 * and this code is not running through normal
                                 * I/O channels which handle SMP issues for
                                 * us, so unmap page to synchronize all cpus.
                                 *
                                 * XXX should vm_pager_unmap_page() have
                                 * dealt with this?
                                 */
                                vm_page_protect(m, VM_PROT_NONE);

                                /*
                                 * Clear out partial-page dirty bits.  This
                                 * has the side effect of setting the valid
                                 * bits, but that is ok.  There are a bunch
                                 * of places in the VM system where we expected
                                 * m->dirty == VM_PAGE_BITS_ALL.  The file EOF
                                 * case is one of them.  If the page is still
                                 * partially dirty, make it fully dirty.
                                 *
                                 * NOTE: We do not clear out the valid
                                 * bits.  This would prevent bogus_page
                                 * replacement from working properly.
                                 *
                                 * NOTE: We do not want to clear the dirty
                                 * bit for a partial DEV_BSIZE'd truncation!
                                 * This is DEV_BSIZE aligned!
                                 */
                                vm_page_clear_dirty_beg_nonincl(m, base, size);
                                if (m->dirty != 0)
                                        m->dirty = VM_PAGE_BITS_ALL;
                                vm_page_wakeup(m);
                        } else if (m) {
                                vm_page_wakeup(m);
                        }
                }
        } else {
                vp->v_filesize = nsize;
        }
        vm_object_drop(object);
}
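/*
 * Example (illustrative sketch): how a filesystem's truncate path might
 * notify the VM system after shrinking a file.  The example_fs_truncate()
 * name and the elided metadata update are assumptions; the point is that
 * the filesystem adjusts its own notion of the size first and then calls
 * vnode_pager_setsize() with the new length so cached pages and partial-page
 * garbage beyond the new EOF are cleaned up.
 */
static int
example_fs_truncate(struct vnode *vp, off_t newsize)
{
        /*
         * ... filesystem-specific metadata update would go here,
         * e.g. adjusting the in-core inode size and freeing blocks ...
         */

        /* Tell the VM system about the new EOF. */
        vnode_pager_setsize(vp, newsize);
        return (0);
}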
/*
 * A VFS can call this function to try to dispose of a read request
 * directly from the VM system, pretty much bypassing almost all VFS
 * overhead except for atime updates.
 *
 * If 0 is returned some or all of the uio was handled.  The caller must
 * check the uio and handle the remainder.
 *
 * The caller must fail on a non-zero error.
 */
int
vop_helper_read_shortcut(struct vop_read_args *ap)
{
        struct vnode *vp;
        struct uio *uio;
        struct lwbuf *lwb;
        struct lwbuf lwb_cache;
        vm_object_t obj;
        vm_page_t m;
        int offset;
        int n;
        int error;

        vp = ap->a_vp;
        uio = ap->a_uio;

        /*
         * We can't short-cut if there is no VM object or this is a special
         * UIO_NOCOPY read (typically from VOP_STRATEGY()).  We also can't
         * do this if we cannot extract the filesize from the vnode.
         */
        if (vm_read_shortcut_enable == 0)
                return(0);
        if (vp->v_object == NULL || uio->uio_segflg == UIO_NOCOPY)
                return(0);
        if (vp->v_filesize == NOOFFSET)
                return(0);
        if (uio->uio_resid == 0)
                return(0);

        /*
         * Iterate the uio on a page-by-page basis
         *
         * XXX can we leave the object held shared during the uiomove()?
         */
        ++vm_read_shortcut_count;
        obj = vp->v_object;
        vm_object_hold_shared(obj);

        error = 0;
        while (uio->uio_resid && error == 0) {
                offset = (int)uio->uio_offset & PAGE_MASK;
                n = PAGE_SIZE - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (vp->v_filesize < uio->uio_offset)
                        break;
                if (uio->uio_offset + n > vp->v_filesize)
                        n = vp->v_filesize - uio->uio_offset;
                if (n == 0)
                        break;  /* hit EOF */

                m = vm_page_lookup_busy_try(obj, OFF_TO_IDX(uio->uio_offset),
                                            FALSE, &error);
                if (error || m == NULL) {
                        ++vm_read_shortcut_failed;
                        error = 0;
                        break;
                }
                if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
                        ++vm_read_shortcut_failed;
                        vm_page_wakeup(m);
                        break;
                }
                lwb = lwbuf_alloc(m, &lwb_cache);

                /*
                 * Use a no-fault uiomove() to avoid deadlocking against
                 * our VM object (which could livelock on the same object
                 * due to shared-vs-exclusive), or deadlocking against
                 * our busied page.  Returns EFAULT on any fault which
                 * winds up diving a vnode.
                 */
                error = uiomove_nofault((char *)lwbuf_kva(lwb) + offset,
                                        n, uio);

                vm_page_flag_set(m, PG_REFERENCED);
                lwbuf_free(lwb);
                vm_page_wakeup(m);
        }
        vm_object_drop(obj);

        /*
         * Ignore EFAULT since we used uiomove_nofault(), causes caller
         * to fall-back to normal code for this case.
         */
        if (error == EFAULT)
                error = 0;

        return (error);
}
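/*
 * Example (illustrative sketch): a filesystem VOP_READ implementation that
 * tries the VM shortcut first and only falls back to its normal
 * buffer-cache path for whatever the shortcut did not satisfy.  The
 * example_fs_read_bio() fallback is a made-up name; the call pattern
 * (fail on error, then check uio_resid) follows the comments above.
 */
static int example_fs_read_bio(struct vop_read_args *ap);  /* hypothetical */

static int
example_fs_read(struct vop_read_args *ap)
{
        int error;

        /* Try to satisfy the read straight from the VM page cache. */
        error = vop_helper_read_shortcut(ap);
        if (error)
                return (error);
        if (ap->a_uio->uio_resid == 0)
                return (0);

        /*
         * The shortcut handled part (or none) of the request; finish it
         * through the regular strategy/buffer-cache path.
         */
        return (example_fs_read_bio(ap));
}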
/*
 * MPSAFE thread
 */
static void
vm_pagezero(void *arg)
{
        vm_page_t m = NULL;
        struct lwbuf *lwb = NULL;
        struct lwbuf lwb_cache;
        enum zeroidle_state state = STATE_IDLE;
        char *pg = NULL;
        int npages = 0;
        int sleep_time;
        int i = 0;
        int cpu = (int)(intptr_t)arg;
        int zero_state = 0;

        /*
         * Adjust thread parameters before entering our loop.  The thread
         * is started with the MP lock held and with normal kernel thread
         * priority.
         *
         * Also put us on the last cpu for now.
         *
         * For now leave the MP lock held, the VM routines cannot be called
         * with it released until tokenization is finished.
         */
        lwkt_setpri_self(TDPRI_IDLE_WORK);
        lwkt_setcpu_self(globaldata_find(cpu));
        sleep_time = DEFAULT_SLEEP_TIME;

        /*
         * Loop forever
         */
        for (;;) {
                int zero_count;

                switch(state) {
                case STATE_IDLE:
                        /*
                         * Wait for work.
                         */
                        tsleep(&zero_state, 0, "pgzero", sleep_time);
                        if (vm_page_zero_check(&zero_count, &zero_state))
                                npages = idlezero_rate / 10;
                        sleep_time = vm_page_zero_time(zero_count);
                        if (npages)
                                state = STATE_GET_PAGE; /* Fallthrough */
                        break;
                case STATE_GET_PAGE:
                        /*
                         * Acquire page to zero
                         */
                        if (--npages == 0) {
                                state = STATE_IDLE;
                        } else {
                                m = vm_page_free_fromq_fast();
                                if (m == NULL) {
                                        state = STATE_IDLE;
                                } else {
                                        state = STATE_ZERO_PAGE;
                                        lwb = lwbuf_alloc(m, &lwb_cache);
                                        pg = (char *)lwbuf_kva(lwb);
                                        i = 0;
                                }
                        }
                        break;
                case STATE_ZERO_PAGE:
                        /*
                         * Zero-out the page
                         */
                        while (i < PAGE_SIZE) {
                                if (idlezero_nocache == 1)
                                        bzeront(&pg[i], IDLEZERO_RUN);
                                else
                                        bzero(&pg[i], IDLEZERO_RUN);
                                i += IDLEZERO_RUN;
                                lwkt_yield();
                        }
                        state = STATE_RELEASE_PAGE;
                        break;
                case STATE_RELEASE_PAGE:
                        lwbuf_free(lwb);
                        vm_page_flag_set(m, PG_ZERO);
                        vm_page_free_toq(m);
                        state = STATE_GET_PAGE;
                        ++idlezero_count;       /* non-locked, SMP race ok */
                        break;
                }
                lwkt_yield();
        }
}
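/*
 * Example (illustrative sketch): launching one vm_pagezero() worker per
 * cpu.  This assumes the usual DragonFly kthread_create() interface; the
 * example_start_pagezero() wrapper, its thread name format, and the lack
 * of error handling are invented for illustration.
 */
static void
example_start_pagezero(void)
{
        int i;

        for (i = 0; i < ncpus; ++i) {
                /*
                 * Pass the target cpu as the thread argument; the thread
                 * migrates itself with lwkt_setcpu_self() above.
                 */
                kthread_create(vm_pagezero, (void *)(intptr_t)i,
                               NULL, "pagezero/%d", i);
        }
}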