/* ARGSUSED */ int sys_mincore(struct lwp *l, const struct sys_mincore_args *uap, register_t *retval) { /* { syscallarg(void *) addr; syscallarg(size_t) len; syscallarg(char *) vec; } */ struct proc *p = l->l_proc; struct vm_page *pg; char *vec, pgi; struct uvm_object *uobj; struct vm_amap *amap; struct vm_anon *anon; struct vm_map_entry *entry; vaddr_t start, end, lim; struct vm_map *map; vsize_t len; int error = 0, npgs; map = &p->p_vmspace->vm_map; start = (vaddr_t)SCARG(uap, addr); len = SCARG(uap, len); vec = SCARG(uap, vec); if (start & PAGE_MASK) return EINVAL; len = round_page(len); end = start + len; if (end <= start) return EINVAL; /* * Lock down vec, so our returned status isn't outdated by * storing the status byte for a page. */ npgs = len >> PAGE_SHIFT; error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE); if (error) { return error; } vm_map_lock_read(map); if (uvm_map_lookup_entry(map, start, &entry) == false) { error = ENOMEM; goto out; } for (/* nothing */; entry != &map->header && entry->start < end; entry = entry->next) { KASSERT(!UVM_ET_ISSUBMAP(entry)); KASSERT(start >= entry->start); /* Make sure there are no holes. */ if (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end)) { error = ENOMEM; goto out; } lim = end < entry->end ? end : entry->end; /* * Special case for objects with no "real" pages. Those * are always considered resident (mapped devices). */ if (UVM_ET_ISOBJ(entry)) { KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)); if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) { for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) subyte(vec, 1); continue; } } amap = entry->aref.ar_amap; /* upper layer */ uobj = entry->object.uvm_obj; /* lower layer */ if (amap != NULL) amap_lock(amap); if (uobj != NULL) mutex_enter(uobj->vmobjlock); for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) { pgi = 0; if (amap != NULL) { /* Check the upper layer first. */ anon = amap_lookup(&entry->aref, start - entry->start); /* Don't need to lock anon here. */ if (anon != NULL && anon->an_page != NULL) { /* * Anon has the page for this entry * offset. */ pgi = 1; } } if (uobj != NULL && pgi == 0) { /* Check the lower layer. */ pg = uvm_pagelookup(uobj, entry->offset + (start - entry->start)); if (pg != NULL) { /* * Object has the page for this entry * offset. */ pgi = 1; } } (void) subyte(vec, pgi); } if (uobj != NULL) mutex_exit(uobj->vmobjlock); if (amap != NULL) amap_unlock(amap); } out: vm_map_unlock_read(map); uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs); return error; }
/* * Do "physical I/O" on behalf of a user. "Physical I/O" is I/O directly * from the raw device to user buffers, and bypasses the buffer cache. * * Comments in brackets are from Leffler, et al.'s pseudo-code implementation. */ int physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags, void (*min_phys)(struct buf *), struct uio *uio) { struct iovec *iovp; struct lwp *l = curlwp; struct proc *p = l->l_proc; int i, error; struct buf *bp = NULL; struct physio_stat *ps; int concurrency = PHYSIO_CONCURRENCY - 1; error = RUN_ONCE(&physio_initialized, physio_init); if (__predict_false(error != 0)) { return error; } DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n", __func__, uio->uio_offset, uio->uio_resid)); flags &= B_READ | B_WRITE; if ((ps = kmem_zalloc(sizeof(*ps), KM_SLEEP)) == NULL) return ENOMEM; /* ps->ps_running = 0; */ /* ps->ps_error = 0; */ /* ps->ps_failed = 0; */ ps->ps_orig_bp = obp; ps->ps_endoffset = -1; mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE); cv_init(&ps->ps_cv, "physio"); /* Make sure we have a buffer, creating one if necessary. */ if (obp != NULL) { /* [raise the processor priority level to splbio;] */ mutex_enter(&bufcache_lock); /* Mark it busy, so nobody else will use it. */ while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH) ; mutex_exit(&bufcache_lock); concurrency = 0; /* see "XXXkludge" comment below */ } uvm_lwp_hold(l); for (i = 0; i < uio->uio_iovcnt; i++) { bool sync = true; iovp = &uio->uio_iov[i]; while (iovp->iov_len > 0) { size_t todo; vaddr_t endp; mutex_enter(&ps->ps_lock); if (ps->ps_failed != 0) { goto done_locked; } physio_wait(ps, sync ? 0 : concurrency); mutex_exit(&ps->ps_lock); if (obp != NULL) { /* * XXXkludge * some drivers use "obp" as an identifier. */ bp = obp; } else { bp = getiobuf(NULL, true); bp->b_cflags = BC_BUSY; } bp->b_dev = dev; bp->b_proc = p; bp->b_private = ps; /* * [mark the buffer busy for physical I/O] * (i.e. set B_PHYS (because it's an I/O to user * memory, and B_RAW, because B_RAW is to be * "Set by physio for raw transfers.", in addition * to the "busy" and read/write flag.) */ bp->b_oflags = 0; bp->b_cflags = BC_BUSY; bp->b_flags = flags | B_PHYS | B_RAW; bp->b_iodone = physio_biodone; /* [set up the buffer for a maximum-sized transfer] */ bp->b_blkno = btodb(uio->uio_offset); if (dbtob(bp->b_blkno) != uio->uio_offset) { error = EINVAL; goto done; } bp->b_bcount = MIN(MAXPHYS, iovp->iov_len); bp->b_data = iovp->iov_base; /* * [call minphys to bound the transfer size] * and remember the amount of data to transfer, * for later comparison. */ (*min_phys)(bp); todo = bp->b_bufsize = bp->b_bcount; #if defined(DIAGNOSTIC) if (todo > MAXPHYS) panic("todo(%zu) > MAXPHYS; minphys broken", todo); #endif /* defined(DIAGNOSTIC) */ sync = false; endp = (vaddr_t)bp->b_data + todo; if (trunc_page(endp) != endp) { /* * following requests can overlap. * note that uvm_vslock does round_page. */ sync = true; } /* * [lock the part of the user address space involved * in the transfer] * Beware vmapbuf(); it clobbers b_data and * saves it in b_saveaddr. However, vunmapbuf() * restores it. */ error = uvm_vslock(p->p_vmspace, bp->b_data, todo, (flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ); if (error) { goto done; } vmapbuf(bp, todo); BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); mutex_enter(&ps->ps_lock); ps->ps_running++; mutex_exit(&ps->ps_lock); /* [call strategy to start the transfer] */ (*strategy)(bp); bp = NULL; iovp->iov_len -= todo; iovp->iov_base = (char *)iovp->iov_base + todo; uio->uio_offset += todo; uio->uio_resid -= todo; } } done: mutex_enter(&ps->ps_lock); done_locked: physio_wait(ps, 0); mutex_exit(&ps->ps_lock); if (ps->ps_failed != 0) { off_t delta; delta = uio->uio_offset - ps->ps_endoffset; KASSERT(delta > 0); uio->uio_resid += delta; /* uio->uio_offset = ps->ps_endoffset; */ } else { KASSERT(ps->ps_endoffset == -1); } if (bp != NULL && bp != obp) { putiobuf(bp); } if (error == 0) { error = ps->ps_error; } mutex_destroy(&ps->ps_lock); cv_destroy(&ps->ps_cv); kmem_free(ps, sizeof(*ps)); /* * [clean up the state of the buffer] * Remember if somebody wants it, so we can wake them up below. * Also, if we had to steal it, give it back. */ if (obp != NULL) { KASSERT((obp->b_cflags & BC_BUSY) != 0); /* * [if another process is waiting for the raw I/O buffer, * wake up processes waiting to do physical I/O; */ mutex_enter(&bufcache_lock); obp->b_cflags &= ~(BC_BUSY | BC_WANTED); obp->b_flags &= ~(B_PHYS | B_RAW); obp->b_iodone = NULL; cv_broadcast(&obp->b_busy); mutex_exit(&bufcache_lock); } uvm_lwp_rele(l); DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n", __func__, uio->uio_offset, uio->uio_resid)); return error; }
/* * Do "physical I/O" on behalf of a user. "Physical I/O" is I/O directly * from the raw device to user buffers, and bypasses the buffer cache. * * Comments in brackets are from Leffler, et al.'s pseudo-code implementation. */ int physio(void (*strategy)(struct buf *), struct buf *bp, dev_t dev, int flags, void (*minphys)(struct buf *), struct uio *uio) { struct iovec *iovp; struct proc *p = curproc; int error, done, i, nobuf, s, todo; error = 0; flags &= B_READ | B_WRITE; /* Make sure we have a buffer, creating one if necessary. */ if ((nobuf = (bp == NULL)) != 0) bp = getphysbuf(); /* [raise the processor priority level to splbio;] */ s = splbio(); /* [while the buffer is marked busy] */ while (bp->b_flags & B_BUSY) { /* [mark the buffer wanted] */ bp->b_flags |= B_WANTED; /* [wait until the buffer is available] */ tsleep(bp, PRIBIO+1, "physbuf", 0); } /* Mark it busy, so nobody else will use it. */ bp->b_flags |= B_BUSY; /* [lower the priority level] */ splx(s); /* [set up the fixed part of the buffer for a transfer] */ bp->b_dev = dev; bp->b_error = 0; bp->b_proc = p; LIST_INIT(&bp->b_dep); /* * [while there are data to transfer and no I/O error] * Note that I/O errors are handled with a 'goto' at the bottom * of the 'while' loop. */ for (i = 0; i < uio->uio_iovcnt; i++) { iovp = &uio->uio_iov[i]; while (iovp->iov_len > 0) { /* * [mark the buffer busy for physical I/O] * (i.e. set B_PHYS (because it's an I/O to user * memory), and B_RAW, because B_RAW is to be * "Set by physio for raw transfers.", in addition * to the "busy" and read/write flag.) */ bp->b_flags = B_BUSY | B_PHYS | B_RAW | flags; /* [set up the buffer for a maximum-sized transfer] */ bp->b_blkno = btodb(uio->uio_offset); bp->b_data = iovp->iov_base; /* * Because iov_len is unsigned but b_bcount is signed, * an overflow is possible. Therefore bound to MAXPHYS * before calling minphys. */ if (iovp->iov_len > MAXPHYS) bp->b_bcount = MAXPHYS; else bp->b_bcount = iovp->iov_len; /* * [call minphys to bound the transfer size] * and remember the amount of data to transfer, * for later comparison. */ (*minphys)(bp); todo = bp->b_bcount; #ifdef DIAGNOSTIC if (todo < 0) panic("todo < 0; minphys broken"); if (todo > MAXPHYS) panic("todo > MAXPHYS; minphys broken"); #endif /* * [lock the part of the user address space involved * in the transfer] * Beware vmapbuf(); it clobbers b_data and * saves it in b_saveaddr. However, vunmapbuf() * restores it. */ error = uvm_vslock(p, bp->b_data, todo, (flags & B_READ) ? VM_PROT_READ | VM_PROT_WRITE : VM_PROT_READ); if (error) { bp->b_flags |= B_ERROR; bp->b_error = error; goto after_unlock; } vmapbuf(bp, todo); /* [call strategy to start the transfer] */ (*strategy)(bp); /* * Note that the raise/wait/lower/get error * steps below would be done by biowait(), but * we want to unlock the address space before * we lower the priority. * * [raise the priority level to splbio] */ s = splbio(); /* [wait for the transfer to complete] */ while ((bp->b_flags & B_DONE) == 0) tsleep(bp, PRIBIO + 1, "physio", 0); /* Mark it busy again, so nobody else will use it. */ bp->b_flags |= B_BUSY; /* [lower the priority level] */ splx(s); /* * [unlock the part of the address space previously * locked] */ vunmapbuf(bp, todo); uvm_vsunlock(p, bp->b_data, todo); after_unlock: /* remember error value (save a splbio/splx pair) */ if (bp->b_flags & B_ERROR) error = (bp->b_error ? bp->b_error : EIO); /* * [deduct the transfer size from the total number * of data to transfer] */ done = bp->b_bcount - bp->b_resid; #ifdef DIAGNOSTIC if (done < 0) panic("done < 0; strategy broken"); if (done > todo) panic("done > todo; strategy broken"); #endif iovp->iov_len -= done; iovp->iov_base = (caddr_t)iovp->iov_base + done; uio->uio_offset += done; uio->uio_resid -= done; /* * Now, check for an error. * Also, handle weird end-of-disk semantics. */ if (error || done < todo) goto done; } } done: /* * [clean up the state of the buffer] * Remember if somebody wants it, so we can wake them up below. * Also, if we had to steal it, give it back. */ s = splbio(); bp->b_flags &= ~(B_BUSY | B_PHYS | B_RAW); if (nobuf) putphysbuf(bp); else { /* * [if another process is waiting for the raw I/O buffer, * wake up processes waiting to do physical I/O] */ if (bp->b_flags & B_WANTED) { bp->b_flags &= ~B_WANTED; wakeup(bp); } } splx(s); return (error); }
/* ARGSUSED */ int sys_mincore(struct proc *p, void *v, register_t *retval) { struct sys_mincore_args /* { syscallarg(void *) addr; syscallarg(size_t) len; syscallarg(char *) vec; } */ *uap = v; vm_page_t m; char *vec, pgi; struct uvm_object *uobj; struct vm_amap *amap; struct vm_anon *anon; vm_map_entry_t entry; vaddr_t start, end, lim; vm_map_t map; vsize_t len, npgs; int error = 0; map = &p->p_vmspace->vm_map; start = (vaddr_t)SCARG(uap, addr); len = SCARG(uap, len); vec = SCARG(uap, vec); if (start & PAGE_MASK) return (EINVAL); len = round_page(len); end = start + len; if (end <= start) return (EINVAL); npgs = len >> PAGE_SHIFT; /* * Lock down vec, so our returned status isn't outdated by * storing the status byte for a page. */ if ((error = uvm_vslock(p, vec, npgs, VM_PROT_WRITE)) != 0) return (error); vm_map_lock_read(map); if (uvm_map_lookup_entry(map, start, &entry) == FALSE) { error = ENOMEM; goto out; } for (/* nothing */; entry != &map->header && entry->start < end; entry = entry->next) { KASSERT(!UVM_ET_ISSUBMAP(entry)); KASSERT(start >= entry->start); /* Make sure there are no holes. */ if (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end)) { error = ENOMEM; goto out; } lim = end < entry->end ? end : entry->end; /* * Special case for objects with no "real" pages. Those * are always considered resident (mapped devices). */ if (UVM_ET_ISOBJ(entry)) { KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)); if (entry->object.uvm_obj->pgops->pgo_releasepg == NULL) { pgi = 1; for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) copyout(&pgi, vec, sizeof(char)); continue; } } amap = entry->aref.ar_amap; /* top layer */ uobj = entry->object.uvm_obj; /* bottom layer */ if (uobj != NULL) simple_lock(&uobj->vmobjlock); for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) { pgi = 0; if (amap != NULL) { /* Check the top layer first. */ anon = amap_lookup(&entry->aref, start - entry->start); /* Don't need to lock anon here. */ if (anon != NULL && anon->an_page != NULL) { /* * Anon has the page for this entry * offset. */ pgi = 1; } } if (uobj != NULL && pgi == 0) { /* Check the bottom layer. */ m = uvm_pagelookup(uobj, entry->offset + (start - entry->start)); if (m != NULL) { /* * Object has the page for this entry * offset. */ pgi = 1; } } copyout(&pgi, vec, sizeof(char)); } if (uobj != NULL) simple_unlock(&uobj->vmobjlock); } out: vm_map_unlock_read(map); uvm_vsunlock(p, SCARG(uap, vec), npgs); return (error); }