/*
 * Set redzones and remember allocation backtrace.
 */
void *
redzone_setup(caddr_t raddr, u_long nsize)
{
    struct stack st;
    caddr_t haddr, faddr;

    atomic_add_long(&redzone_extra_mem, redzone_size_ntor(nsize) - nsize);

    haddr = raddr + redzone_roundup(nsize) - REDZONE_HSIZE;
    faddr = haddr + REDZONE_HSIZE + nsize;

    /* Redzone header. */
    stack_save(&st);
    bcopy(&st, haddr, sizeof(st));
    haddr += sizeof(st);
    bcopy(&nsize, haddr, sizeof(nsize));
    haddr += sizeof(nsize);
    memset(haddr, 0x42, REDZONE_CHSIZE);
    haddr += REDZONE_CHSIZE;

    /* Redzone footer. */
    memset(faddr, 0x42, REDZONE_CFSIZE);

    return (haddr);
}
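/*
 * A minimal sketch (not the stock redzone check routine) of how the 0x42
 * fill bytes written above could be verified when the allocation is
 * released.  It works back from the pointer returned to the caller, using
 * the layout established in redzone_setup(); the helper name and panic
 * messages are illustrative assumptions.
 */
static void
redzone_verify_sketch(caddr_t naddr)
{
    u_long nsize;
    caddr_t chk, faddr;
    int i;

    /* The header canary sits immediately below the user pointer. */
    chk = naddr - REDZONE_CHSIZE;
    /* The stored allocation size sits just below the header canary. */
    bcopy(chk - sizeof(nsize), &nsize, sizeof(nsize));
    faddr = naddr + nsize;

    for (i = 0; i < REDZONE_CHSIZE; i++)
        if (chk[i] != 0x42)
            panic("redzone: header canary smashed at %p", chk + i);
    for (i = 0; i < REDZONE_CFSIZE; i++)
        if (faddr[i] != 0x42)
            panic("redzone: footer canary smashed at %p", faddr + i);
}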
/*
 * Drop an inode reference, freeing the inode when the last reference goes
 * away.
 */
void
hammer2_inode_drop(hammer2_inode_t *ip)
{
    hammer2_pfs_t *pmp;
    u_int refs;

    while (ip) {
        if (hammer2_debug & 0x80000) {
            kprintf("INODE-1 %p (%d->%d)\n",
                ip, ip->refs, ip->refs - 1);
            print_backtrace(8);
        }
        refs = ip->refs;
        cpu_ccfence();
        if (refs == 1) {
            /*
             * Transition to zero, must interlock with
             * the inode inumber lookup tree (if applicable).
             * It should not be possible for anyone to race
             * the transition to 0.
             */
            pmp = ip->pmp;
            KKASSERT(pmp);
            hammer2_spin_ex(&pmp->inum_spin);

            if (atomic_cmpset_int(&ip->refs, 1, 0)) {
                KKASSERT(hammer2_mtx_refs(&ip->lock) == 0);
                if (ip->flags & HAMMER2_INODE_ONRBTREE) {
                    atomic_clear_int(&ip->flags,
                        HAMMER2_INODE_ONRBTREE);
                    RB_REMOVE(hammer2_inode_tree,
                        &pmp->inum_tree, ip);
                    --pmp->inum_count;
                }
                hammer2_spin_unex(&pmp->inum_spin);

                ip->pmp = NULL;

                /*
                 * Cleaning out ip->cluster isn't entirely
                 * trivial.
                 */
                hammer2_inode_repoint(ip, NULL, NULL);

                kfree(ip, pmp->minode);
                atomic_add_long(&pmp->inmem_inodes, -1);
                ip = NULL;      /* will terminate loop */
            } else {
                hammer2_spin_unex(&ip->pmp->inum_spin);
            }
        } else {
            /*
             * Non-zero transition
             */
            if (atomic_cmpset_int(&ip->refs, refs, refs - 1))
                break;
        }
    }
}
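/*
 * For contrast with the drop path above, the matching reference
 * acquisition is a single atomic increment; this sketch mirrors what
 * hammer2_inode_ref() amounts to, with the debug tracing omitted.
 */
static void
hammer2_inode_ref_sketch(hammer2_inode_t *ip)
{
    atomic_add_int(&ip->refs, 1);
}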
static void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
    int p;
    int cpu = mycpu->gd_cpuid;

    /*
     * Don't kprintf() anything if the pmap inval watchdog gets hit.
     * DRM can cause an occasional watchdog hit (at least with a 1/16
     * second watchdog), and attempting to kprintf to the KVM frame buffer
     * from Xinvltlb, which ignores critical sections, can implode the
     * system.
     */
    if (pmap_inval_watchdog_print == 0)
        return;

    cpu_lfence();
#ifdef LOOPRECOVER
    atomic_add_long(&smp_smurf_mask.ary[0], 0);
#endif
    kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx "
#ifdef LOOPRECOVER
        "s=%08jx "
#endif
#ifdef LOOPMASK_IN
        "in=%08jx "
#endif
#ifdef LOOPRECOVER
        "smurf=%08jx\n"
#endif
        , msg, cpu, info->mode,
        info->mask.ary[0],
        info->done.ary[0]
#ifdef LOOPRECOVER
        , info->sigmask.ary[0]
#endif
#ifdef LOOPMASK_IN
        , smp_in_mask.ary[0]
#endif
#ifdef LOOPRECOVER
        , smp_smurf_mask.ary[0]
#endif
        );
    kprintf("mdglob ");
    for (p = 0; p < ncpus; ++p)
        kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
    kprintf("\n");
}
int
chgsbsize(struct uidinfo *uip, u_long *hiwat, u_long to, rlim_t xmax)
{
    rlim_t nsb;
    const long diff = to - *hiwat;

    nsb = (rlim_t)atomic_add_long_nv((long *)&uip->ui_sbsize, diff);
    if (diff > 0 && nsb > xmax) {
        atomic_add_long((long *)&uip->ui_sbsize, -diff);
        return 0;
    }
    *hiwat = to;
    KASSERT(nsb >= 0);
    return 1;
}
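/*
 * Minimal caller sketch (the helper name, 'sb', 'uip', and the limit
 * argument are illustrative, not the stock socket-buffer reserve code):
 * chgsbsize() charges the uid first and refuses the change if the per-uid
 * sbsize limit would be exceeded, so the caller only commits its own state
 * on success.
 */
static int
sb_reserve_sketch(struct sockbuf *sb, u_long cc, struct uidinfo *uip,
    rlim_t limit)
{
    if (chgsbsize(uip, &sb->sb_hiwat, cc, limit) == 0)
        return 0;               /* uid would exceed its sbsize limit */
    sb->sb_mbmax = cc * 2;      /* illustrative mbuf-space cap */
    return 1;
}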
/*
 * balloon_alloc_pages()
 *	Allocate page_cnt mfns.  mfns storage provided by the caller.  Returns
 *	the number of pages allocated, which could be less than page_cnt, or
 *	a negative number if an error occurred.
 */
long
balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns)
{
    xen_memory_reservation_t memres;
    long rv;

    bzero(&memres, sizeof (memres));
    /*LINTED: constant in conditional context*/
    set_xen_guest_handle(memres.extent_start, mfns);
    memres.domid = DOMID_SELF;
    memres.nr_extents = page_cnt;

    rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
    if (rv > 0)
        atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv);
    return (rv);
}
static void
intr_stray_handler(void *cookie)
{
    struct intr_handler *ih;

    ih = (struct intr_handler *)cookie;

    if (intr_stray_count[ih->ih_irq] < MAX_STRAY_LOG) {
        printf("stray irq %d\n", ih->ih_irq);

        atomic_add_long(&intr_stray_count[ih->ih_irq], 1);
        if (intr_stray_count[ih->ih_irq] >= MAX_STRAY_LOG)
            printf("got %d stray irq %d's: not logging anymore\n",
                MAX_STRAY_LOG, ih->ih_irq);
    }
}
/*
 * Allocate zeroed memory if tmpfs_maxkmem has not been exceeded
 * or the 'musthave' flag is set.  'musthave' allocations should
 * always be subordinate to normal allocations so that tmpfs_maxkmem
 * can't be exceeded by more than a few KB.  Example: when creating
 * a new directory, the tmpnode is a normal allocation; if that
 * succeeds, the dirents for "." and ".." are 'musthave' allocations.
 */
void *
tmp_memalloc(size_t size, int musthave)
{
    static time_t last_warning;
    time_t now;

    if (atomic_add_long_nv(&tmp_kmemspace, size) < tmpfs_maxkmem ||
        musthave)
        return (kmem_zalloc(size, KM_SLEEP));

    atomic_add_long(&tmp_kmemspace, -size);
    now = gethrestime_sec();
    if (last_warning != now) {
        last_warning = now;
        cmn_err(CE_WARN, "tmp_memalloc: tmpfs over memory limit");
    }
    return (NULL);
}
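/*
 * The accounting idiom used above, isolated as a sketch: optimistically
 * charge the counter with atomic_add_long_nv(), then back the charge out
 * if the new total exceeds the limit.  'space_used' and 'space_limit' are
 * hypothetical counters, not tmpfs symbols.
 */
static int
charge_space_sketch(ulong_t *space_used, ulong_t space_limit, size_t size)
{
    if (atomic_add_long_nv(space_used, size) <= space_limit)
        return (1);                     /* charge accepted */
    atomic_add_long(space_used, -size); /* roll the charge back */
    return (0);
}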
int
px_fdvma_release(dev_info_t *dip, px_t *px_p, ddi_dma_impl_t *mp)
{
    px_mmu_t *mmu_p = px_p->px_mmu_p;
    size_t npages;
    fdvma_t *fdvma_p = (fdvma_t *)mp->dmai_fdvma;

    if (px_disable_fdvma)
        return (DDI_FAILURE);

    /* validate fdvma handle */
    if (!(mp->dmai_rflags & DMP_BYPASSNEXUS)) {
        DBG(DBG_DMA_CTL, dip, "DDI_DMA_RELEASE: not fast dma\n");
        return (DDI_FAILURE);
    }

    /* flush all reserved dvma addresses from mmu */
    px_mmu_unmap_window(mmu_p, mp);

    npages = mp->dmai_ndvmapages;
    vmem_xfree(mmu_p->mmu_dvma_map, (void *)mp->dmai_mapping,
        MMU_PTOB(npages));

    atomic_add_long(&mmu_p->mmu_dvma_reserve, npages);
    mp->dmai_ndvmapages = 0;

    /* see if there is anyone waiting for dvma space */
    if (mmu_p->mmu_dvma_clid != 0) {
        DBG(DBG_DMA_CTL, dip, "run dvma callback\n");
        ddi_run_callback(&mmu_p->mmu_dvma_clid);
    }

    /* free data structures */
    kmem_free(fdvma_p->pagecnt, npages * sizeof (uint_t));
    kmem_free(fdvma_p, sizeof (fdvma_t));
    kmem_free(mp, sizeof (px_dma_hdl_t));

    /* see if there is anyone waiting for kmem */
    if (px_kmem_clid != 0) {
        DBG(DBG_DMA_CTL, dip, "run handle callback\n");
        ddi_run_callback(&px_kmem_clid);
    }
    return (DDI_SUCCESS);
}
struct cdevsw *
devvn_refthread(struct vnode *vp, struct cdev **devp, int *ref)
{
    struct cdevsw *csw;
    struct cdev_priv *cdp;
    struct cdev *dev;

    mtx_assert(&devmtx, MA_NOTOWNED);
    if ((vp->v_vflag & VV_ETERNALDEV) != 0) {
        dev = vp->v_rdev;
        if (dev == NULL)
            return (NULL);
        KASSERT((dev->si_flags & SI_ETERNAL) != 0,
            ("Not eternal cdev"));
        *ref = 0;
        csw = dev->si_devsw;
        KASSERT(csw != NULL, ("Eternal cdev is destroyed"));
        *devp = dev;
        return (csw);
    }

    csw = NULL;
    dev_lock();
    dev = vp->v_rdev;
    if (dev == NULL) {
        dev_unlock();
        return (NULL);
    }
    cdp = cdev2priv(dev);
    if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0) {
        csw = dev->si_devsw;
        if (csw != NULL)
            atomic_add_long(&dev->si_threadcount, 1);
    }
    dev_unlock();
    if (csw != NULL) {
        *devp = dev;
        *ref = 1;
    }
    return (csw);
}
/*
 * This routine destroys all the resources of an rnode
 * and finally the rnode itself.
 */
static void
destroy_rnode4(rnode4_t *rp)
{
    vnode_t *vp;
    vfs_t *vfsp;

    ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);

    vp = RTOV4(rp);
    vfsp = vp->v_vfsp;

    uninit_rnode4(rp);
    atomic_add_long((ulong_t *)&rnode4_new, -1);
#ifdef DEBUG
    clstat4_debug.nrnode.value.ui64--;
#endif
    kmem_cache_free(rnode4_cache, rp);
    vn_invalid(vp);
    vn_free(vp);
    VFS_RELE(vfsp);
}
struct cdevsw *
dev_refthread(struct cdev *dev, int *ref)
{
    struct cdevsw *csw;
    struct cdev_priv *cdp;

    mtx_assert(&devmtx, MA_NOTOWNED);
    if ((dev->si_flags & SI_ETERNAL) != 0) {
        *ref = 0;
        return (dev->si_devsw);
    }
    dev_lock();
    csw = dev->si_devsw;
    if (csw != NULL) {
        cdp = cdev2priv(dev);
        if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0)
            atomic_add_long(&dev->si_threadcount, 1);
        else
            csw = NULL;
    }
    dev_unlock();
    *ref = 1;
    return (csw);
}
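/*
 * The *ref cookie returned above tells the caller whether a thread
 * reference on the cdev was actually taken.  FreeBSD pairs this with
 * dev_relthread(); the sketch below is a simplified rendering of that
 * release step, not the verbatim implementation.
 */
static void
dev_relthread_sketch(struct cdev *dev, int ref)
{
    mtx_assert(&devmtx, MA_NOTOWNED);
    if (!ref)
        return;     /* eternal cdev: no reference was taken */
    KASSERT(dev->si_threadcount > 0,
        ("%s threadcount is wrong", dev->si_name));
    atomic_subtract_rel_long(&dev->si_threadcount, 1);
}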
/* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_GETPAGES. */ int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg) { vm_object_t object; struct bufobj *bo; struct buf *bp; off_t foff; #ifdef INVARIANTS off_t blkno0; #endif int bsize, pagesperblock, *freecnt; int error, before, after, rbehind, rahead, poff, i; int bytecount, secmask; KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("%s does not support devices", __func__)); if (vp->v_iflag & VI_DOOMED) return (VM_PAGER_BAD); object = vp->v_object; foff = IDX_TO_OFF(m[0]->pindex); bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; KASSERT(foff < object->un_pager.vnp.vnp_size, ("%s: page %p offset beyond vp %p size", __func__, m[0], vp)); KASSERT(count <= sizeof(bp->b_pages), ("%s: requested %d pages", __func__, count)); /* * The last page has valid blocks. Invalid part can only * exist at the end of file, and the page is made fully valid * by zeroing in vm_pager_get_pages(). */ if (m[count - 1]->valid != 0 && --count == 0) { if (iodone != NULL) iodone(arg, m, 1, 0); return (VM_PAGER_OK); } /* * Synchronous and asynchronous paging operations use different * free pbuf counters. This is done to avoid asynchronous requests * to consume all pbufs. * Allocate the pbuf at the very beginning of the function, so that * if we are low on certain kind of pbufs don't even proceed to BMAP, * but sleep. */ freecnt = iodone != NULL ? &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt; bp = getpbuf(freecnt); /* * Get the underlying device blocks for the file with VOP_BMAP(). * If the file system doesn't support VOP_BMAP, use old way of * getting pages via VOP_READ. */ error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before); if (error == EOPNOTSUPP) { relpbuf(bp, freecnt); VM_OBJECT_WLOCK(object); for (i = 0; i < count; i++) { PCPU_INC(cnt.v_vnodein); PCPU_INC(cnt.v_vnodepgsin); error = vnode_pager_input_old(object, m[i]); if (error) break; } VM_OBJECT_WUNLOCK(object); return (error); } else if (error != 0) { relpbuf(bp, freecnt); return (VM_PAGER_ERROR); } /* * If the file system supports BMAP, but blocksize is smaller * than a page size, then use special small filesystem code. */ if (pagesperblock == 0) { relpbuf(bp, freecnt); for (i = 0; i < count; i++) { PCPU_INC(cnt.v_vnodein); PCPU_INC(cnt.v_vnodepgsin); error = vnode_pager_input_smlfs(object, m[i]); if (error) break; } return (error); } /* * A sparse file can be encountered only for a single page request, * which may not be preceded by call to vm_pager_haspage(). */ if (bp->b_blkno == -1) { KASSERT(count == 1, ("%s: array[%d] request to a sparse file %p", __func__, count, vp)); relpbuf(bp, freecnt); pmap_zero_page(m[0]); KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty", __func__, m[0])); VM_OBJECT_WLOCK(object); m[0]->valid = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(object); return (VM_PAGER_OK); } #ifdef INVARIANTS blkno0 = bp->b_blkno; #endif bp->b_blkno += (foff % bsize) / DEV_BSIZE; /* Recalculate blocks available after/before to pages. 
*/ poff = (foff % bsize) / PAGE_SIZE; before *= pagesperblock; before += poff; after *= pagesperblock; after += pagesperblock - (poff + 1); if (m[0]->pindex + after >= object->size) after = object->size - 1 - m[0]->pindex; KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d", __func__, count, after + 1)); after -= count - 1; /* Trim requested rbehind/rahead to possible values. */ rbehind = a_rbehind ? *a_rbehind : 0; rahead = a_rahead ? *a_rahead : 0; rbehind = min(rbehind, before); rbehind = min(rbehind, m[0]->pindex); rahead = min(rahead, after); rahead = min(rahead, object->size - m[count - 1]->pindex); /* * Check that total amount of pages fit into buf. Trim rbehind and * rahead evenly if not. */ if (rbehind + rahead + count > nitems(bp->b_pages)) { int trim, sum; trim = rbehind + rahead + count - nitems(bp->b_pages) + 1; sum = rbehind + rahead; if (rbehind == before) { /* Roundup rbehind trim to block size. */ rbehind -= roundup(trim * rbehind / sum, pagesperblock); if (rbehind < 0) rbehind = 0; } else rbehind -= trim * rbehind / sum; rahead -= trim * rahead / sum; } KASSERT(rbehind + rahead + count <= nitems(bp->b_pages), ("%s: behind %d ahead %d count %d", __func__, rbehind, rahead, count)); /* * Fill in the bp->b_pages[] array with requested and optional * read behind or read ahead pages. Read behind pages are looked * up in a backward direction, down to a first cached page. Same * for read ahead pages, but there is no need to shift the array * in case of encountering a cached page. */ i = bp->b_npages = 0; if (rbehind) { vm_pindex_t startpindex, tpindex; vm_page_t p; VM_OBJECT_WLOCK(object); startpindex = m[0]->pindex - rbehind; if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL && p->pindex >= startpindex) startpindex = p->pindex + 1; /* tpindex is unsigned; beware of numeric underflow. */ for (tpindex = m[0]->pindex - 1; tpindex >= startpindex && tpindex < m[0]->pindex; tpindex--, i++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) { /* Shift the array. */ for (int j = 0; j < i; j++) bp->b_pages[j] = bp->b_pages[j + tpindex + 1 - startpindex]; break; } bp->b_pages[tpindex - startpindex] = p; } bp->b_pgbefore = i; bp->b_npages += i; bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE; } else bp->b_pgbefore = 0; /* Requested pages. */ for (int j = 0; j < count; j++, i++) bp->b_pages[i] = m[j]; bp->b_npages += count; if (rahead) { vm_pindex_t endpindex, tpindex; vm_page_t p; if (!VM_OBJECT_WOWNED(object)) VM_OBJECT_WLOCK(object); endpindex = m[count - 1]->pindex + rahead + 1; if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL && p->pindex < endpindex) endpindex = p->pindex; if (endpindex > object->size) endpindex = object->size; for (tpindex = m[count - 1]->pindex + 1; tpindex < endpindex; i++, tpindex++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) break; bp->b_pages[i] = p; } bp->b_pgafter = i - bp->b_npages; bp->b_npages = i; } else bp->b_pgafter = 0; if (VM_OBJECT_WOWNED(object)) VM_OBJECT_WUNLOCK(object); /* Report back actual behind/ahead read. */ if (a_rbehind) *a_rbehind = bp->b_pgbefore; if (a_rahead) *a_rahead = bp->b_pgafter; #ifdef INVARIANTS KASSERT(bp->b_npages <= nitems(bp->b_pages), ("%s: buf %p overflowed", __func__, bp)); for (int j = 1; j < bp->b_npages; j++) KASSERT(bp->b_pages[j]->pindex - 1 == bp->b_pages[j - 1]->pindex, ("%s: pages array not consecutive, bp %p", __func__, bp)); #endif /* * Recalculate first offset and bytecount with regards to read behind. 
* Truncate bytecount to vnode real size and round up physical size * for real devices. */ foff = IDX_TO_OFF(bp->b_pages[0]->pindex); bytecount = bp->b_npages << PAGE_SHIFT; if ((foff + bytecount) > object->un_pager.vnp.vnp_size) bytecount = object->un_pager.vnp.vnp_size - foff; secmask = bo->bo_bsize - 1; KASSERT(secmask < PAGE_SIZE && secmask > 0, ("%s: sector size %d too large", __func__, secmask + 1)); bytecount = (bytecount + secmask) & ~secmask; /* * And map the pages to be read into the kva, if the filesystem * requires mapped buffers. */ if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } /* Build a minimal buffer header. */ bp->b_iocmd = BIO_READ; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount; bp->b_iooffset = dbtob(bp->b_blkno); KASSERT(IDX_TO_OFF(m[0]->pindex - bp->b_pages[0]->pindex) == (blkno0 - bp->b_blkno) * DEV_BSIZE + IDX_TO_OFF(m[0]->pindex) % bsize, ("wrong offsets bsize %d m[0] %ju b_pages[0] %ju " "blkno0 %ju b_blkno %ju", bsize, (uintmax_t)m[0]->pindex, (uintmax_t)bp->b_pages[0]->pindex, (uintmax_t)blkno0, (uintmax_t)bp->b_blkno)); atomic_add_long(&runningbufspace, bp->b_runningbufspace); PCPU_INC(cnt.v_vnodein); PCPU_ADD(cnt.v_vnodepgsin, bp->b_npages); if (iodone != NULL) { /* async */ bp->b_pgiodone = iodone; bp->b_caller1 = arg; bp->b_iodone = vnode_pager_generic_getpages_done_async; bp->b_flags |= B_ASYNC; BUF_KERNPROC(bp); bstrategy(bp); return (VM_PAGER_OK); } else { bp->b_iodone = bdone; bstrategy(bp); bwait(bp, PVM, "vnread"); error = vnode_pager_generic_getpages_done(bp); for (i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } }
void
tmp_memfree(void *cp, size_t size)
{
    kmem_free(cp, size);
    atomic_add_long(&tmp_kmemspace, -size);
}
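/*
 * Usage sketch (hypothetical caller, not part of tmpfs): memory charged
 * through tmp_memalloc() must be released with tmp_memfree() so that
 * tmp_kmemspace stays balanced; a NULL return means a normal
 * (non-'musthave') request was refused because tmpfs_maxkmem would have
 * been exceeded.
 */
static int
tmp_alloc_example(size_t size)
{
    void *buf;

    buf = tmp_memalloc(size, 0);    /* normal (non-musthave) request */
    if (buf == NULL)
        return (ENOMEM);
    /* ... use buf ... */
    tmp_memfree(buf, size);         /* undoes the tmp_kmemspace charge */
    return (0);
}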
/* * small block filesystem vnode pager input */ static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m) { struct vnode *vp; struct bufobj *bo; struct buf *bp; struct sf_buf *sf; daddr_t fileaddr; vm_offset_t bsize; vm_page_bits_t bits; int error, i; error = 0; vp = object->handle; if (vp->v_iflag & VI_DOOMED) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; VOP_BMAP(vp, 0, &bo, 0, NULL, NULL); sf = sf_buf_alloc(m, 0); for (i = 0; i < PAGE_SIZE / bsize; i++) { vm_ooffset_t address; bits = vm_page_bits(i * bsize, bsize); if (m->valid & bits) continue; address = IDX_TO_OFF(m->pindex) + i * bsize; if (address >= object->un_pager.vnp.vnp_size) { fileaddr = -1; } else { error = vnode_pager_addr(vp, address, &fileaddr, NULL); if (error) break; } if (fileaddr != -1) { bp = getpbuf(&vnode_pbuf_freecnt); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; bp->b_iodone = bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize; bp->b_blkno = fileaddr; pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bsize; bp->b_bufsize = bsize; bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); /* do the input */ bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); bwait(bp, PVM, "vnsrd"); if ((bp->b_ioflags & BIO_ERROR) != 0) error = EIO; /* * free the buffer header back to the swap buffer pool */ bp->b_vp = NULL; pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); if (error) break; } else bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize); KASSERT((m->dirty & bits) == 0, ("vnode_pager_input_smlfs: page %p is dirty", m)); VM_OBJECT_WLOCK(object); m->valid |= bits; VM_OBJECT_WUNLOCK(object); } sf_buf_free(sf); if (error) { return VM_PAGER_ERROR; } return VM_PAGER_OK; }
/* * Attempt to acquire a shared or exclusive token. Returns TRUE on success, * FALSE on failure. * * If TOK_EXCLUSIVE is set in mode we are attempting to get an exclusive * token, otherwise are attempting to get a shared token. * * If TOK_EXCLREQ is set in mode this is a blocking operation, otherwise * it is a non-blocking operation (for both exclusive or shared acquisions). */ static __inline int _lwkt_trytokref(lwkt_tokref_t ref, thread_t td, long mode) { lwkt_token_t tok; lwkt_tokref_t oref; long count; tok = ref->tr_tok; KASSERT(((mode & TOK_EXCLREQ) == 0 || /* non blocking */ td->td_gd->gd_intr_nesting_level == 0 || panic_cpu_gd == mycpu), ("Attempt to acquire token %p not already " "held in hard code section", tok)); if (mode & TOK_EXCLUSIVE) { /* * Attempt to get an exclusive token */ for (;;) { count = tok->t_count; oref = tok->t_ref; /* can be NULL */ cpu_ccfence(); if ((count & ~TOK_EXCLREQ) == 0) { /* * It is possible to get the exclusive bit. * We must clear TOK_EXCLREQ on successful * acquisition. */ if (atomic_cmpset_long(&tok->t_count, count, (count & ~TOK_EXCLREQ) | TOK_EXCLUSIVE)) { KKASSERT(tok->t_ref == NULL); tok->t_ref = ref; return TRUE; } /* retry */ } else if ((count & TOK_EXCLUSIVE) && oref >= &td->td_toks_base && oref < td->td_toks_stop) { /* * Our thread already holds the exclusive * bit, we treat this tokref as a shared * token (sorta) to make the token release * code easier. * * NOTE: oref cannot race above if it * happens to be ours, so we're good. * But we must still have a stable * variable for both parts of the * comparison. * * NOTE: Since we already have an exclusive * lock and don't need to check EXCLREQ * we can just use an atomic_add here */ atomic_add_long(&tok->t_count, TOK_INCR); ref->tr_count &= ~TOK_EXCLUSIVE; return TRUE; } else if ((mode & TOK_EXCLREQ) && (count & TOK_EXCLREQ) == 0) { /* * Unable to get the exclusive bit but being * asked to set the exclusive-request bit. * Since we are going to retry anyway just * set the bit unconditionally. */ atomic_set_long(&tok->t_count, TOK_EXCLREQ); return FALSE; } else { /* * Unable to get the exclusive bit and not * being asked to set the exclusive-request * (aka lwkt_trytoken()), or EXCLREQ was * already set. */ cpu_pause(); return FALSE; } /* retry */ } } else { /* * Attempt to get a shared token. Note that TOK_EXCLREQ * for shared tokens simply means the caller intends to * block. We never actually set the bit in tok->t_count. */ for (;;) { count = tok->t_count; oref = tok->t_ref; /* can be NULL */ cpu_ccfence(); if ((count & (TOK_EXCLUSIVE/*|TOK_EXCLREQ*/)) == 0) { /* * It may be possible to get the token shared. */ if ((atomic_fetchadd_long(&tok->t_count, TOK_INCR) & TOK_EXCLUSIVE) == 0) { return TRUE; } atomic_fetchadd_long(&tok->t_count, -TOK_INCR); /* retry */ } else if ((count & TOK_EXCLUSIVE) && oref >= &td->td_toks_base && oref < td->td_toks_stop) { /* * We own the exclusive bit on the token so * we can in fact also get it shared. */ atomic_add_long(&tok->t_count, TOK_INCR); return TRUE; } else { /* * We failed to get the token shared */ return FALSE; } /* retry */ } } }
/* * Disable logging */ int lqfs_disable(vnode_t *vp, struct fiolog *flp) { int error = 0; inode_t *ip = VTOI(vp); qfsvfs_t *qfsvfsp = ip->i_qfsvfs; fs_lqfs_common_t *fs = VFS_FS_PTR(qfsvfsp); #ifdef LUFS struct lockfs lf; struct ulockfs *ulp; #else /* QFS doesn't really support LOCKFS. */ #endif /* LUFS */ flp->error = FIOLOG_ENONE; /* * Logging is already disabled; done */ if (LQFS_GET_LOGBNO(fs) == 0 || LQFS_GET_LOGP(qfsvfsp) == NULL || !LQFS_CAPABLE(qfsvfsp)) { vfs_setmntopt(qfsvfsp->vfs_vfs, MNTOPT_NOLOGGING, NULL, 0); error = 0; goto out; } #ifdef LUFS /* * File system must be write locked to disable logging */ error = qfs_fiolfss(vp, &lf); if (error) { goto out; } if (!LOCKFS_IS_ULOCK(&lf)) { flp->error = FIOLOG_EULOCK; error = 0; goto out; } lf.lf_lock = LOCKFS_WLOCK; lf.lf_flags = 0; lf.lf_comment = NULL; error = qfs_fiolfs(vp, &lf, 1); if (error) { flp->error = FIOLOG_EWLOCK; error = 0; goto out; } #else /* QFS doesn't really support LOCKFS. */ #endif /* LUFS */ if (LQFS_GET_LOGP(qfsvfsp) == NULL || LQFS_GET_LOGBNO(fs) == 0) { goto errout; } /* * WE ARE COMMITTED TO DISABLING LOGGING PAST THIS POINT */ /* * Disable logging: * Suspend the reclaim thread and force the delete thread to exit. * When a nologging mount has completed there may still be * work for reclaim to do so just suspend this thread until * it's [deadlock-] safe for it to continue. The delete * thread won't be needed as qfs_iinactive() calls * qfs_delete() when logging is disabled. * Freeze and drain reader ops. * Commit any outstanding reader transactions (lqfs_flush). * Set the ``unmounted'' bit in the qfstrans struct. * If debug, remove metadata from matamap. * Disable matamap processing. * NULL the trans ops table. * Free all of the incore structs related to logging. * Allow reader ops. */ #ifdef LUFS qfs_thread_suspend(&qfsvfsp->vfs_reclaim); qfs_thread_exit(&qfsvfsp->vfs_delete); #else /* QFS doesn't have file reclaim nor i-node delete threads. */ #endif /* LUFS */ vfs_lock_wait(qfsvfsp->vfs_vfs); #ifdef LQFS_TODO_LOCKFS ulp = &qfsvfsp->vfs_ulockfs; mutex_enter(&ulp->ul_lock); (void) qfs_quiesce(ulp); #else /* QFS doesn't really support LOCKFS. */ #endif /* LQFS_TODO_LOCKFS */ #ifdef LQFS_TODO (void) qfs_flush(qfsvfsp->vfs_vfs); #else (void) lqfs_flush(qfsvfsp); if (LQFS_GET_LOGP(qfsvfsp)) { logmap_start_roll(LQFS_GET_LOGP(qfsvfsp)); } #endif /* LQFS_TODO */ TRANS_MATA_UMOUNT(qfsvfsp); LQFS_SET_DOMATAMAP(qfsvfsp, 0); /* * Free all of the incore structs * Aquire the ufs_scan_lock before de-linking the mtm data * structure so that we keep ufs_sync() and ufs_update() away * when they execute the ufs_scan_inodes() run while we're in * progress of enabling/disabling logging. */ mutex_enter(&qfs_scan_lock); (void) lqfs_unsnarf(qfsvfsp); mutex_exit(&qfs_scan_lock); #ifdef LQFS_TODO_LOCKFS atomic_add_long(&ufs_quiesce_pend, -1); mutex_exit(&ulp->ul_lock); #else /* QFS doesn't do this yet. */ #endif /* LQFS_TODO_LOCKFS */ vfs_setmntopt(qfsvfsp->vfs_vfs, MNTOPT_NOLOGGING, NULL, 0); vfs_unlock(qfsvfsp->vfs_vfs); LQFS_SET_FS_ROLLED(fs, FS_ALL_ROLLED); LQFS_SET_NOLOG_SI(qfsvfsp, 0); /* * Free the log space and mark the superblock as FSACTIVE */ (void) lqfs_free(qfsvfsp); #ifdef LUFS /* * Allow the reclaim thread to continue. */ qfs_thread_continue(&qfsvfsp->vfs_reclaim); #else /* QFS doesn't have a file reclaim thread. 
*/ #endif /* LUFS */ #ifdef LQFS_TODO_LOCKFS /* * Unlock the file system */ lf.lf_lock = LOCKFS_ULOCK; lf.lf_flags = 0; error = qfs_fiolfs(vp, &lf, 1); if (error) { flp->error = FIOLOG_ENOULOCK; } #else /* QFS doesn't really support LOCKFS. */ #endif /* LQFS_LOCKFS */ error = 0; goto out; errout: #ifdef LQFS_LOCKFS lf.lf_lock = LOCKFS_ULOCK; lf.lf_flags = 0; (void) qfs_fiolfs(vp, &lf, 1); #else /* QFS doesn't really support LOCKFS. */ #endif /* LQFS_LOCKFS */ out: mutex_enter(&ip->mp->ms.m_waitwr_mutex); ip->mp->mt.fi_status |= FS_LOGSTATE_KNOWN; mutex_exit(&ip->mp->ms.m_waitwr_mutex); return (error); }
/* * Resizes the aobj associated to the regular file pointed to by vp to * the size newsize. 'vp' must point to a vnode that represents a regular * file. 'newsize' must be positive. * * pass trivial as 1 when buf content will be overwritten, otherwise set 0 * to be zero filled. * * Returns zero on success or an appropriate error code on failure. * * Caller must hold the node exclusively locked. */ int tmpfs_reg_resize(struct vnode *vp, off_t newsize, int trivial) { int error; vm_pindex_t newpages, oldpages; struct tmpfs_mount *tmp; struct tmpfs_node *node; off_t oldsize; #ifdef INVARIANTS KKASSERT(vp->v_type == VREG); KKASSERT(newsize >= 0); #endif node = VP_TO_TMPFS_NODE(vp); tmp = VFS_TO_TMPFS(vp->v_mount); /* * Convert the old and new sizes to the number of pages needed to * store them. It may happen that we do not need to do anything * because the last allocated page can accommodate the change on * its own. */ oldsize = node->tn_size; oldpages = round_page64(oldsize) / PAGE_SIZE; KKASSERT(oldpages == node->tn_reg.tn_aobj_pages); newpages = round_page64(newsize) / PAGE_SIZE; if (newpages > oldpages && tmp->tm_pages_used + newpages - oldpages > tmp->tm_pages_max) { error = ENOSPC; goto out; } node->tn_reg.tn_aobj_pages = newpages; node->tn_size = newsize; if (newpages != oldpages) atomic_add_long(&tmp->tm_pages_used, (newpages - oldpages)); /* * When adjusting the vnode filesize and its VM object we must * also adjust our backing VM object (aobj). The blocksize * used must match the block sized we use for the buffer cache. * * The backing VM object may contain VM pages as well as swap * assignments if we previously renamed main object pages into * it during deactivation. */ if (newsize < oldsize) { vm_pindex_t osize; vm_pindex_t nsize; vm_object_t aobj; error = nvtruncbuf(vp, newsize, TMPFS_BLKSIZE, -1, 0); aobj = node->tn_reg.tn_aobj; if (aobj) { osize = aobj->size; nsize = vp->v_object->size; if (nsize < osize) { aobj->size = osize; swap_pager_freespace(aobj, nsize, osize - nsize); vm_object_page_remove(aobj, nsize, osize, FALSE); } } } else { vm_object_t aobj; error = nvextendbuf(vp, oldsize, newsize, TMPFS_BLKSIZE, TMPFS_BLKSIZE, -1, -1, trivial); aobj = node->tn_reg.tn_aobj; if (aobj) aobj->size = vp->v_object->size; } out: return error; }
/* * This does the real work of segkp allocation. * Return to client base addr. len must be page-aligned. A null value is * returned if there are no more vm resources (e.g. pages, swap). The len * and base recorded in the private data structure include the redzone * and the redzone length (if applicable). If the user requests a redzone * either the first or last page is left unmapped depending whether stacks * grow to low or high memory. * * The client may also specify a no-wait flag. If that is set then the * request will choose a non-blocking path when requesting resources. * The default is make the client wait. */ static caddr_t segkp_get_internal( struct seg *seg, size_t len, uint_t flags, struct segkp_data **tkpd, struct anon_map *amp) { struct segkp_segdata *kpsd = (struct segkp_segdata *)seg->s_data; struct segkp_data *kpd; caddr_t vbase = NULL; /* always first virtual, may not be mapped */ pgcnt_t np = 0; /* number of pages in the resource */ pgcnt_t segkpindex; long i; caddr_t va; pgcnt_t pages = 0; ulong_t anon_idx = 0; int kmflag = (flags & KPD_NOWAIT) ? KM_NOSLEEP : KM_SLEEP; caddr_t s_base = (segkp_fromheap) ? kvseg.s_base : seg->s_base; if (len & PAGEOFFSET) { panic("segkp_get: len is not page-aligned"); /*NOTREACHED*/ } ASSERT(((flags & KPD_HASAMP) == 0) == (amp == NULL)); /* Only allow KPD_NO_ANON if we are going to lock it down */ if ((flags & (KPD_LOCKED|KPD_NO_ANON)) == KPD_NO_ANON) return (NULL); if ((kpd = kmem_zalloc(sizeof (struct segkp_data), kmflag)) == NULL) return (NULL); /* * Fix up the len to reflect the REDZONE if applicable */ if (flags & KPD_HASREDZONE) len += PAGESIZE; np = btop(len); vbase = vmem_alloc(SEGKP_VMEM(seg), len, kmflag | VM_BESTFIT); if (vbase == NULL) { kmem_free(kpd, sizeof (struct segkp_data)); return (NULL); } /* If locking, reserve physical memory */ if (flags & KPD_LOCKED) { pages = btop(SEGKP_MAPLEN(len, flags)); if (page_resv(pages, kmflag) == 0) { vmem_free(SEGKP_VMEM(seg), vbase, len); kmem_free(kpd, sizeof (struct segkp_data)); return (NULL); } if ((flags & KPD_NO_ANON) == 0) atomic_add_long(&anon_segkp_pages_locked, pages); } /* * Reserve sufficient swap space for this vm resource. We'll * actually allocate it in the loop below, but reserving it * here allows us to back out more gracefully than if we * had an allocation failure in the body of the loop. * * Note that we don't need swap space for the red zone page. */ if (amp != NULL) { /* * The swap reservation has been done, if required, and the * anon_hdr is separate. */ anon_idx = 0; kpd->kp_anon_idx = anon_idx; kpd->kp_anon = amp->ahp; TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u", kpd, vbase, len, flags, 1); } else if ((flags & KPD_NO_ANON) == 0) { if (anon_resv_zone(SEGKP_MAPLEN(len, flags), NULL) == 0) { if (flags & KPD_LOCKED) { atomic_add_long(&anon_segkp_pages_locked, -pages); page_unresv(pages); } vmem_free(SEGKP_VMEM(seg), vbase, len); kmem_free(kpd, sizeof (struct segkp_data)); return (NULL); } atomic_add_long(&anon_segkp_pages_resv, btop(SEGKP_MAPLEN(len, flags))); anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT; kpd->kp_anon_idx = anon_idx; kpd->kp_anon = kpsd->kpsd_anon; TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u", kpd, vbase, len, flags, 1); } else {
/* * The passed-in chain must be locked and the returned inode will also be * locked. This routine typically locates or allocates the inode, assigns * ip->chain (adding a ref to chain if necessary), and returns the inode. * * The hammer2_inode structure regulates the interface between the high level * kernel VNOPS API and the filesystem backend (the chains). * * WARNING! This routine sucks up the chain's lock (makes it part of the * inode lock from the point of view of the inode lock API), * so callers need to be careful. * * WARNING! The mount code is allowed to pass dip == NULL for iroot and * is allowed to pass pmp == NULL and dip == NULL for sroot. */ hammer2_inode_t * hammer2_inode_get(hammer2_pfsmount_t *pmp, hammer2_inode_t *dip, hammer2_chain_t *chain) { hammer2_inode_t *nip; KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INODE); /* * Interlocked lookup/ref of the inode. This code is only needed * when looking up inodes with nlinks != 0 (TODO: optimize out * otherwise and test for duplicates). */ again: for (;;) { nip = hammer2_inode_lookup(pmp, chain->data->ipdata.inum); if (nip == NULL) break; ccms_thread_lock(&nip->topo_cst, CCMS_STATE_EXCLUSIVE); if ((nip->flags & HAMMER2_INODE_ONRBTREE) == 0) { /* race */ ccms_thread_unlock(&nip->topo_cst); hammer2_inode_drop(nip); continue; } if (nip->chain != chain) hammer2_inode_repoint(nip, NULL, chain); /* * Consolidated nip/nip->chain is locked (chain locked * by caller). */ return nip; } /* * We couldn't find the inode number, create a new inode. */ if (pmp) { nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO); atomic_add_long(&pmp->inmem_inodes, 1); hammer2_chain_memory_inc(pmp); hammer2_chain_memory_wakeup(pmp); } else { nip = kmalloc(sizeof(*nip), M_HAMMER2, M_WAITOK | M_ZERO); nip->flags = HAMMER2_INODE_SROOT; } nip->inum = chain->data->ipdata.inum; nip->size = chain->data->ipdata.size; nip->mtime = chain->data->ipdata.mtime; hammer2_inode_repoint(nip, NULL, chain); nip->pip = dip; /* can be NULL */ if (dip) hammer2_inode_ref(dip); /* ref dip for nip->pip */ nip->pmp = pmp; /* * ref and lock on nip gives it state compatible to after a * hammer2_inode_lock_ex() call. */ nip->refs = 1; ccms_cst_init(&nip->topo_cst, &nip->chain); ccms_thread_lock(&nip->topo_cst, CCMS_STATE_EXCLUSIVE); /* combination of thread lock and chain lock == inode lock */ /* * Attempt to add the inode. If it fails we raced another inode * get. Undo all the work and try again. */ if (pmp) { spin_lock(&pmp->inum_spin); if (RB_INSERT(hammer2_inode_tree, &pmp->inum_tree, nip)) { spin_unlock(&pmp->inum_spin); ccms_thread_unlock(&nip->topo_cst); hammer2_inode_drop(nip); goto again; } atomic_set_int(&nip->flags, HAMMER2_INODE_ONRBTREE); spin_unlock(&pmp->inum_spin); } return (nip); }
static vnode_t * make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp, struct vnodeops *vops, int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *), int *newnode, cred_t *cr) { rnode4_t *rp; rnode4_t *trp; vnode_t *vp; mntinfo4_t *mi; ASSERT(RW_READ_HELD(&rhtp->r_lock)); mi = VFTOMI4(vfsp); start: if ((rp = r4find(rhtp, fh, vfsp)) != NULL) { vp = RTOV4(rp); *newnode = 0; return (vp); } rw_exit(&rhtp->r_lock); mutex_enter(&rp4freelist_lock); if (rp4freelist != NULL && rnode4_new >= nrnode) { rp = rp4freelist; rp4_rmfree(rp); mutex_exit(&rp4freelist_lock); vp = RTOV4(rp); if (rp->r_flags & R4HASHED) { rw_enter(&rp->r_hashq->r_lock, RW_WRITER); mutex_enter(&vp->v_lock); if (vp->v_count > 1) { vp->v_count--; mutex_exit(&vp->v_lock); rw_exit(&rp->r_hashq->r_lock); rw_enter(&rhtp->r_lock, RW_READER); goto start; } mutex_exit(&vp->v_lock); rp4_rmhash_locked(rp); rw_exit(&rp->r_hashq->r_lock); } r4inactive(rp, cr); mutex_enter(&vp->v_lock); if (vp->v_count > 1) { vp->v_count--; mutex_exit(&vp->v_lock); rw_enter(&rhtp->r_lock, RW_READER); goto start; } mutex_exit(&vp->v_lock); vn_invalid(vp); /* * destroy old locks before bzero'ing and * recreating the locks below. */ uninit_rnode4(rp); /* * Make sure that if rnode is recycled then * VFS count is decremented properly before * reuse. */ VFS_RELE(vp->v_vfsp); vn_reinit(vp); } else { vnode_t *new_vp; mutex_exit(&rp4freelist_lock); rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP); new_vp = vn_alloc(KM_SLEEP); atomic_add_long((ulong_t *)&rnode4_new, 1); #ifdef DEBUG clstat4_debug.nrnode.value.ui64++; #endif vp = new_vp; } bzero(rp, sizeof (*rp)); rp->r_vnode = vp; nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL); nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL); mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL); rp->created_v4 = 0; list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t), offsetof(nfs4_open_stream_t, os_node)); rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head; rp->r_lo_head.lo_next_rnode = &rp->r_lo_head; cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL); cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL); rp->r_flags = R4READDIRWATTR; rp->r_fh = fh; rp->r_hashq = rhtp; sfh4_hold(rp->r_fh); rp->r_server = mi->mi_curr_serv; rp->r_deleg_type = OPEN_DELEGATE_NONE; rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE; nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL); rddir4_cache_create(rp); rp->r_putapage = putapage; vn_setops(vp, vops); vp->v_data = (caddr_t)rp; vp->v_vfsp = vfsp; VFS_HOLD(vfsp); vp->v_type = VNON; if (isrootfh(fh, rp)) vp->v_flag = VROOT; vn_exists(vp); /* * There is a race condition if someone else * alloc's the rnode while no locks are held, so we * check again and recover if found. */ rw_enter(&rhtp->r_lock, RW_WRITER); if ((trp = r4find(rhtp, fh, vfsp)) != NULL) { vp = RTOV4(trp); *newnode = 0; rw_exit(&rhtp->r_lock); rp4_addfree(rp, cr); rw_enter(&rhtp->r_lock, RW_READER); return (vp); } rp4_addhash(rp); *newnode = 1; return (vp); }
/* * Destroys the node pointed to by node from the file system 'tmp'. * If the node does not belong to the given mount point, the results are * unpredicted. * * If the node references a directory; no entries are allowed because * their removal could need a recursive algorithm, something forbidden in * kernel space. Furthermore, there is not need to provide such * functionality (recursive removal) because the only primitives offered * to the user are the removal of empty directories and the deletion of * individual files. * * Note that nodes are not really deleted; in fact, when a node has been * allocated, it cannot be deleted during the whole life of the file * system. Instead, they are moved to the available list and remain there * until reused. * * A caller must have TMPFS_NODE_LOCK(node) and this function unlocks it. */ void tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) { vm_pindex_t pages = 0; #ifdef INVARIANTS TMPFS_ASSERT_ELOCKED(node); KKASSERT(node->tn_vnode == NULL); KKASSERT((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0); #endif TMPFS_LOCK(tmp); LIST_REMOVE(node, tn_entries); tmp->tm_nodes_inuse--; TMPFS_UNLOCK(tmp); TMPFS_NODE_UNLOCK(node); /* Caller has this lock */ switch (node->tn_type) { case VNON: /* Do not do anything. VNON is provided to let the * allocation routine clean itself easily by avoiding * duplicating code in it. */ /* FALLTHROUGH */ case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ break; case VDIR: /* * The parent link can be NULL if this is the root * node or if it is a directory node that was rmdir'd. * * XXX what if node is a directory which still contains * directory entries (e.g. due to a forced umount) ? */ node->tn_size = 0; KKASSERT(node->tn_dir.tn_parent == NULL); /* * If the root node is being destroyed don't leave a * dangling pointer in tmpfs_mount. */ if (node == tmp->tm_root) tmp->tm_root = NULL; break; case VFIFO: /* FALLTHROUGH */ case VSOCK: break; case VLNK: kfree(node->tn_link, tmp->tm_name_zone); node->tn_link = NULL; node->tn_size = 0; break; case VREG: if (node->tn_reg.tn_aobj != NULL) vm_object_deallocate(node->tn_reg.tn_aobj); node->tn_reg.tn_aobj = NULL; pages = node->tn_reg.tn_aobj_pages; break; default: panic("tmpfs_free_node: type %p %d", node, (int)node->tn_type); } /* * Clean up fields for the next allocation. The objcache only ctors * new allocations. */ tmpfs_node_ctor(node, NULL, 0); objcache_put(tmp->tm_node_pool, node); /* node is now invalid */ if (pages) atomic_add_long(&tmp->tm_pages_used, -(long)pages); }
void
balloon_drv_subtracted(int64_t delta)
{
    atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta);
}
/* * XXX this API needs a rewrite. It needs to be split into a * hammer2_inode_alloc() and hammer2_inode_build() to allow us to get * rid of the inode/chain lock reversal fudge. * * Returns the inode associated with the passed-in cluster, allocating a new * hammer2_inode structure if necessary, then synchronizing it to the passed * xop cluster. When synchronizing, if idx >= 0, only cluster index (idx) * is synchronized. Otherwise the whole cluster is synchronized. inum will * be extracted from the passed-in xop and the inum argument will be ignored. * * If xop is passed as NULL then a new hammer2_inode is allocated with the * specified inum, and returned. For normal inodes, the inode will be * indexed in memory and if it already exists the existing ip will be * returned instead of allocating a new one. The superroot and PFS inodes * are not indexed in memory. * * The passed-in cluster must be locked and will remain locked on return. * The returned inode will be locked and the caller may dispose of both * via hammer2_inode_unlock() + hammer2_inode_drop(). However, if the caller * needs to resolve a hardlink it must ref/unlock/relock/drop the inode. * * The hammer2_inode structure regulates the interface between the high level * kernel VNOPS API and the filesystem backend (the chains). * * On return the inode is locked with the supplied cluster. */ hammer2_inode_t * hammer2_inode_get(hammer2_pfs_t *pmp, hammer2_xop_head_t *xop, hammer2_tid_t inum, int idx) { hammer2_inode_t *nip; const hammer2_inode_data_t *iptmp; const hammer2_inode_data_t *nipdata; KKASSERT(xop == NULL || hammer2_cluster_type(&xop->cluster) == HAMMER2_BREF_TYPE_INODE); KKASSERT(pmp); /* * Interlocked lookup/ref of the inode. This code is only needed * when looking up inodes with nlinks != 0 (TODO: optimize out * otherwise and test for duplicates). * * Cluster can be NULL during the initial pfs allocation. */ if (xop) { iptmp = &hammer2_xop_gdata(xop)->ipdata; inum = iptmp->meta.inum; hammer2_xop_pdata(xop); } again: nip = hammer2_inode_lookup(pmp, inum); if (nip) { /* * We may have to unhold the cluster to avoid a deadlock * against vnlru (and possibly other XOPs). */ if (xop) { if (hammer2_mtx_ex_try(&nip->lock) != 0) { hammer2_cluster_unhold(&xop->cluster); hammer2_mtx_ex(&nip->lock); hammer2_cluster_rehold(&xop->cluster); } } else { hammer2_mtx_ex(&nip->lock); } /* * Handle SMP race (not applicable to the super-root spmp * which can't index inodes due to duplicative inode numbers). */ if (pmp->spmp_hmp == NULL && (nip->flags & HAMMER2_INODE_ONRBTREE) == 0) { hammer2_mtx_unlock(&nip->lock); hammer2_inode_drop(nip); goto again; } if (xop) { if (idx >= 0) hammer2_inode_repoint_one(nip, &xop->cluster, idx); else hammer2_inode_repoint(nip, NULL, &xop->cluster); } return nip; } /* * We couldn't find the inode number, create a new inode and try to * insert it, handle insertion races. */ nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO); spin_init(&nip->cluster_spin, "h2clspin"); atomic_add_long(&pmp->inmem_inodes, 1); if (pmp->spmp_hmp) nip->flags = HAMMER2_INODE_SROOT; /* * Initialize nip's cluster. A cluster is provided for normal * inodes but typically not for the super-root or PFS inodes. 
*/ nip->cluster.refs = 1; nip->cluster.pmp = pmp; nip->cluster.flags |= HAMMER2_CLUSTER_INODE; if (xop) { nipdata = &hammer2_xop_gdata(xop)->ipdata; nip->meta = nipdata->meta; hammer2_xop_pdata(xop); atomic_set_int(&nip->flags, HAMMER2_INODE_METAGOOD); hammer2_inode_repoint(nip, NULL, &xop->cluster); } else { nip->meta.inum = inum; /* PFS inum is always 1 XXX */ /* mtime will be updated when a cluster is available */ atomic_set_int(&nip->flags, HAMMER2_INODE_METAGOOD); /*XXX*/ } nip->pmp = pmp; /* * ref and lock on nip gives it state compatible to after a * hammer2_inode_lock() call. */ nip->refs = 1; hammer2_mtx_init(&nip->lock, "h2inode"); hammer2_mtx_init(&nip->truncate_lock, "h2trunc"); hammer2_mtx_ex(&nip->lock); TAILQ_INIT(&nip->depend_static.sideq); /* combination of thread lock and chain lock == inode lock */ /* * Attempt to add the inode. If it fails we raced another inode * get. Undo all the work and try again. */ if (pmp->spmp_hmp == NULL) { hammer2_spin_ex(&pmp->inum_spin); if (RB_INSERT(hammer2_inode_tree, &pmp->inum_tree, nip)) { hammer2_spin_unex(&pmp->inum_spin); hammer2_mtx_unlock(&nip->lock); hammer2_inode_drop(nip); goto again; } atomic_set_int(&nip->flags, HAMMER2_INODE_ONRBTREE); ++pmp->inum_count; hammer2_spin_unex(&pmp->inum_spin); } return (nip); }
/* * Drop an inode reference, freeing the inode when the last reference goes * away. */ void hammer2_inode_drop(hammer2_inode_t *ip) { hammer2_pfsmount_t *pmp; hammer2_inode_t *pip; u_int refs; while (ip) { refs = ip->refs; cpu_ccfence(); if (refs == 1) { /* * Transition to zero, must interlock with * the inode inumber lookup tree (if applicable). * * NOTE: The super-root inode has no pmp. */ pmp = ip->pmp; if (pmp) spin_lock(&pmp->inum_spin); if (atomic_cmpset_int(&ip->refs, 1, 0)) { KKASSERT(ip->topo_cst.count == 0); if (ip->flags & HAMMER2_INODE_ONRBTREE) { atomic_clear_int(&ip->flags, HAMMER2_INODE_ONRBTREE); RB_REMOVE(hammer2_inode_tree, &pmp->inum_tree, ip); } if (pmp) spin_unlock(&pmp->inum_spin); pip = ip->pip; ip->pip = NULL; ip->pmp = NULL; /* * Cleaning out ip->chain isn't entirely * trivial. */ hammer2_inode_repoint(ip, NULL, NULL); /* * We have to drop pip (if non-NULL) to * dispose of our implied reference from * ip->pip. We can simply loop on it. */ if (pmp) { KKASSERT((ip->flags & HAMMER2_INODE_SROOT) == 0); kfree(ip, pmp->minode); atomic_add_long(&pmp->inmem_inodes, -1); } else { KKASSERT(ip->flags & HAMMER2_INODE_SROOT); kfree(ip, M_HAMMER2); } ip = pip; /* continue with pip (can be NULL) */ } else { if (pmp) spin_unlock(&ip->pmp->inum_spin); } } else { /* * Non zero transition */ if (atomic_cmpset_int(&ip->refs, refs, refs - 1)) break; } } }
/* * balloon_replace_pages() * Try to replace nextexts blocks of 2^order pages. addr_bits specifies * how many bits of address the pages must be within (i.e. 16 would mean * that the pages cannot have an address > 64k). The constrints are on * what the hypervisor gives us -- we are free to give any pages in * exchange. The array pp is the pages we are giving away. The caller * provides storage space for mfns, which hold the new physical pages. */ long balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits, uint_t order, mfn_t *mfns) { xen_memory_reservation_t memres; long fallback_cnt; long cnt; uint_t i, j, page_cnt, extlen; long e; int locked; /* * we shouldn't be allocating constrained pages on a guest. It doesn't * make any sense. They won't be constrained after a migration. */ ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); extlen = 1 << order; page_cnt = nextents * extlen; /* Give back the current pages to the hypervisor */ for (i = 0; i < page_cnt; i++) { cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum); if (cnt != 1) { cmn_err(CE_PANIC, "balloon: unable to give a page back " "to the hypervisor.\n"); } } /* * try to allocate the new pages using addr_bits and order. If we can't * get all of the pages, try to get the remaining pages with no * constraints and, if that was successful, return the number of * constrained pages we did allocate. */ bzero(&memres, sizeof (memres)); /*LINTED: constant in conditional context*/ set_xen_guest_handle(memres.extent_start, mfns); memres.domid = DOMID_SELF; memres.nr_extents = nextents; memres.mem_flags = XENMEMF_address_bits(addr_bits); memres.extent_order = order; cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres); /* assign the new MFNs to the current PFNs */ locked = balloon_lock_contig_pfnlist(cnt * extlen); for (i = 0; i < cnt; i++) { for (j = 0; j < extlen; j++) { reassign_pfn(pp[i * extlen + j]->p_pagenum, mfns[i] + j); } } if (locked) unlock_contig_pfnlist(); if (cnt != nextents) { if (cnt < 0) { cnt = 0; } /* * We couldn't get enough memory to satisfy our requirements. * The above loop will assign the parts of the request that * were successful (this part may be 0). We need to fill * in the rest. The bzero below clears out extent_order and * address_bits, so we'll take anything from the hypervisor * to replace the pages we gave away. */ fallback_cnt = page_cnt - cnt * extlen; bzero(&memres, sizeof (memres)); /*LINTED: constant in conditional context*/ set_xen_guest_handle(memres.extent_start, mfns); memres.domid = DOMID_SELF; memres.nr_extents = fallback_cnt; e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres); if (e != fallback_cnt) { cmn_err(CE_PANIC, "balloon: unable to recover from " "failed increase_reservation.\n"); } locked = balloon_lock_contig_pfnlist(fallback_cnt); for (i = 0; i < fallback_cnt; i++) { uint_t offset = page_cnt - fallback_cnt; /* * We already used pp[0...(cnt * extlen)] before, * so start at the next entry in the pp array. */ reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]); } if (locked) unlock_contig_pfnlist(); } /* * balloon_free_pages increments our counter. Decrement it here. */ atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt); /* * return the number of extents we were able to replace. If we got * this far, we know all the pp's are valid. */ return (cnt); }
int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { const struct xsave_limits *limits; uint64_t cr4; int error, enable_invpcid; unsigned int func, regs[4]; enum x2apic_state x2apic_state; /* * Requests for invalid CPUID levels should map to the highest * available level instead. */ if (cpu_exthigh != 0 && *eax >= 0x80000000) { if (*eax > cpu_exthigh) *eax = cpu_exthigh; } else if (*eax >= 0x40000000) { if (*eax > CPUID_VM_HIGH) *eax = CPUID_VM_HIGH; } else if (*eax > cpu_high) { *eax = cpu_high; } func = *eax; /* * In general the approach used for CPU topology is to * advertise a flat topology where all CPUs are packages with * no multi-core or SMT. */ switch (func) { /* * Pass these through to the guest */ case CPUID_0000_0000: case CPUID_0000_0002: case CPUID_0000_0003: case CPUID_8000_0000: case CPUID_8000_0002: case CPUID_8000_0003: case CPUID_8000_0004: case CPUID_8000_0006: case CPUID_8000_0008: cpuid_count(*eax, *ecx, regs); break; case CPUID_8000_0001: /* * Hide rdtscp/ia32_tsc_aux until we know how * to deal with them. */ cpuid_count(*eax, *ecx, regs); regs[3] &= ~AMDID_RDTSCP; break; case CPUID_8000_0007: cpuid_count(*eax, *ecx, regs); /* * If the host TSCs are not synchronized across * physical cpus then we cannot advertise an * invariant tsc to a vcpu. * * XXX This still falls short because the vcpu * can observe the TSC moving backwards as it * migrates across physical cpus. But at least * it should discourage the guest from using the * TSC to keep track of time. */ if (!smp_tsc) regs[3] &= ~AMDPM_TSC_INVARIANT; break; case CPUID_0000_0001: do_cpuid(1, regs); error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state); if (error) { panic("x86_emulate_cpuid: error %d " "fetching x2apic state", error); } /* * Override the APIC ID only in ebx */ regs[1] &= ~(CPUID_LOCAL_APIC_ID); regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); /* * Don't expose VMX, SpeedStep or TME capability. * Advertise x2APIC capability and Hypervisor guest. */ regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); regs[2] |= CPUID2_HV; if (x2apic_state != X2APIC_DISABLED) regs[2] |= CPUID2_X2APIC; else regs[2] &= ~CPUID2_X2APIC; /* * Only advertise CPUID2_XSAVE in the guest if * the host is using XSAVE. */ if (!(regs[2] & CPUID2_OSXSAVE)) regs[2] &= ~CPUID2_XSAVE; /* * If CPUID2_XSAVE is being advertised and the * guest has set CR4_XSAVE, set * CPUID2_OSXSAVE. */ regs[2] &= ~CPUID2_OSXSAVE; if (regs[2] & CPUID2_XSAVE) { error = vm_get_register(vm, vcpu_id, VM_REG_GUEST_CR4, &cr4); if (error) panic("x86_emulate_cpuid: error %d " "fetching %%cr4", error); if (cr4 & CR4_XSAVE) regs[2] |= CPUID2_OSXSAVE; } /* * Hide monitor/mwait until we know how to deal with * these instructions. */ regs[2] &= ~CPUID2_MON; /* * Hide the performance and debug features. */ regs[2] &= ~CPUID2_PDCM; /* * No TSC deadline support in the APIC yet */ regs[2] &= ~CPUID2_TSCDLT; /* * Hide thermal monitoring */ regs[3] &= ~(CPUID_ACPI | CPUID_TM); /* * Machine check handling is done in the host. * Hide MTRR capability. */ regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR); /* * Hide the debug store capability. */ regs[3] &= ~CPUID_DS; /* * Disable multi-core. */ regs[1] &= ~CPUID_HTT_CORES; regs[3] &= ~CPUID_HTT; break; case CPUID_0000_0004: do_cpuid(4, regs); /* * Do not expose topology. 
*/ regs[0] &= 0xffff8000; regs[0] |= 0x04008000; break; case CPUID_0000_0007: regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; /* leaf 0 */ if (*ecx == 0) { error = vm_get_capability(vm, vcpu_id, VM_CAP_ENABLE_INVPCID, &enable_invpcid); if (error == 0 && enable_invpcid) regs[1] |= CPUID_STDEXT_INVPCID; } break; case CPUID_0000_0006: case CPUID_0000_000A: /* * Handle the access, but report 0 for * all options */ regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; break; case CPUID_0000_000B: /* * Processor topology enumeration */ regs[0] = 0; regs[1] = 0; regs[2] = *ecx & 0xff; regs[3] = vcpu_id; break; case CPUID_0000_000D: limits = vmm_get_xsave_limits(); if (!limits->xsave_enabled) { regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; break; } cpuid_count(*eax, *ecx, regs); switch (*ecx) { case 0: /* * Only permit the guest to use bits * that are active in the host in * %xcr0. Also, claim that the * maximum save area size is * equivalent to the host's current * save area size. Since this runs * "inside" of vmrun(), it runs with * the guest's xcr0, so the current * save area size is correct as-is. */ regs[0] &= limits->xcr0_allowed; regs[2] = limits->xsave_max_size; regs[3] &= (limits->xcr0_allowed >> 32); break; case 1: /* Only permit XSAVEOPT. */ regs[0] &= CPUID_EXTSTATE_XSAVEOPT; regs[1] = 0; regs[2] = 0; regs[3] = 0; break; default: /* * If the leaf is for a permitted feature, * pass through as-is, otherwise return * all zeroes. */ if (!(limits->xcr0_allowed & (1ul << *ecx))) { regs[0] = 0; regs[1] = 0; regs[2] = 0; regs[3] = 0; } break; } break; case 0x40000000: regs[0] = CPUID_VM_HIGH; bcopy(bhyve_id, ®s[1], 4); bcopy(bhyve_id + 4, ®s[2], 4); bcopy(bhyve_id + 8, ®s[3], 4); break; default: /* * The leaf value has already been clamped so * simply pass this through, keeping count of * how many unhandled leaf values have been seen. */ atomic_add_long(&bhyve_xcpuids, 1); cpuid_count(*eax, *ecx, regs); break; } *eax = regs[0]; *ebx = regs[1]; *ecx = regs[2]; *edx = regs[3]; return (1); }
int px_fdvma_reserve(dev_info_t *dip, dev_info_t *rdip, px_t *px_p, ddi_dma_req_t *dmareq, ddi_dma_handle_t *handlep) { fdvma_t *fdvma_p; px_dvma_addr_t dvma_pg; px_mmu_t *mmu_p = px_p->px_mmu_p; size_t npages; ddi_dma_impl_t *mp; ddi_dma_lim_t *lim_p = dmareq->dmar_limits; ulong_t hi = lim_p->dlim_addr_hi; ulong_t lo = lim_p->dlim_addr_lo; size_t counter_max = (lim_p->dlim_cntr_max + 1) & MMU_PAGE_MASK; if (px_disable_fdvma) return (DDI_FAILURE); DBG(DBG_DMA_CTL, dip, "DDI_DMA_RESERVE: rdip=%s%d\n", ddi_driver_name(rdip), ddi_get_instance(rdip)); /* * Check the limit structure. */ if ((lo >= hi) || (hi < mmu_p->mmu_dvma_base)) return (DDI_DMA_BADLIMITS); /* * Allocate DVMA space from reserve. */ npages = dmareq->dmar_object.dmao_size; if ((long)atomic_add_long_nv(&mmu_p->mmu_dvma_reserve, -npages) < 0) { atomic_add_long(&mmu_p->mmu_dvma_reserve, npages); return (DDI_DMA_NORESOURCES); } /* * Allocate the dma handle. */ mp = kmem_zalloc(sizeof (px_dma_hdl_t), KM_SLEEP); /* * Get entries from dvma space map. * (vmem_t *vmp, * size_t size, size_t align, size_t phase, * size_t nocross, void *minaddr, void *maxaddr, int vmflag) */ dvma_pg = MMU_BTOP((ulong_t)vmem_xalloc(mmu_p->mmu_dvma_map, MMU_PTOB(npages), MMU_PAGE_SIZE, 0, counter_max, (void *)lo, (void *)(hi + 1), dmareq->dmar_fp == DDI_DMA_SLEEP ? VM_SLEEP : VM_NOSLEEP)); if (dvma_pg == 0) { atomic_add_long(&mmu_p->mmu_dvma_reserve, npages); kmem_free(mp, sizeof (px_dma_hdl_t)); return (DDI_DMA_NOMAPPING); } /* * Create the fast dvma request structure. */ fdvma_p = kmem_alloc(sizeof (fdvma_t), KM_SLEEP); fdvma_p->pagecnt = kmem_alloc(npages * sizeof (uint_t), KM_SLEEP); fdvma_p->ops = &fdvma_ops; fdvma_p->softsp = (caddr_t)px_p; fdvma_p->sync_flag = NULL; /* * Initialize the handle. */ mp->dmai_rdip = rdip; mp->dmai_rflags = DMP_BYPASSNEXUS | DDI_DMA_READ | DMP_NOSYNC; mp->dmai_burstsizes = dmareq->dmar_limits->dlim_burstsizes; mp->dmai_mapping = MMU_PTOB(dvma_pg); mp->dmai_ndvmapages = npages; mp->dmai_size = npages * MMU_PAGE_SIZE; mp->dmai_nwin = 0; mp->dmai_fdvma = (caddr_t)fdvma_p; /* * The bdf protection value is set to immediate child * at first. It gets modified by switch/bridge drivers * as the code traverses down the fabric topology. * * XXX No IOMMU protection for broken devices. */ ASSERT((intptr_t)ddi_get_parent_data(rdip) >> 1 == 0); mp->dmai_bdf = ((intptr_t)ddi_get_parent_data(rdip) == 1) ? PCIE_INVALID_BDF : pcie_get_bdf_for_dma_xfer(dip, rdip); DBG(DBG_DMA_CTL, dip, "DDI_DMA_RESERVE: mp=%p dvma=%x npages=%x private=%p\n", mp, mp->dmai_mapping, npages, fdvma_p); *handlep = (ddi_dma_handle_t)mp; return (DDI_SUCCESS); }
/*
 * balloon_free_pages()
 * free page_cnt pages, using any combination of mfns, pfns, and kva as long
 * as they refer to the same mapping.  If an array of mfns is passed in, we
 * assume they were already cleared.  Otherwise, we need to zero the pages
 * before giving them back to the hypervisor.  kva space is not free'd up in
 * case the caller wants to re-use it.
 */
long
balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
{
	xen_memory_reservation_t memdec;
	mfn_t mfn;
	pfn_t pfn;
	uint_t i;
	long e;

#if DEBUG
	/* make sure kva is page aligned and maps to first pfn */
	if (kva != NULL) {
		ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
		if (pfns != NULL) {
			ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
		}
	}
#endif

	/* if we have a kva, we can clean all pages with just one bzero */
	if ((kva != NULL) && balloon_zero_memory) {
		bzero(kva, (page_cnt * PAGESIZE));
	}

	/* if we were given a kva and/or a pfn */
	if ((kva != NULL) || (pfns != NULL)) {

		/*
		 * All the current callers only pass 1 page when using kva or
		 * pfns, and use mfns when passing multiple pages.  If that
		 * assumption is changed, the following code will need some
		 * work.  The following ASSERT() guarantees we're respecting
		 * the io locking quota.
		 */
		ASSERT(page_cnt < bln_contig_list_quota);

		/* go through all the pages */
		for (i = 0; i < page_cnt; i++) {

			/* get the next pfn */
			if (pfns == NULL) {
				pfn = hat_getpfnum(kas.a_hat,
				    (kva + (PAGESIZE * i)));
			} else {
				pfn = pfns[i];
			}

			/*
			 * if we didn't already zero this page, do it now. we
			 * need to do this *before* we give back the MFN
			 */
			if ((kva == NULL) && (balloon_zero_memory)) {
				pfnzero(pfn, 0, PAGESIZE);
			}

			/*
			 * unmap the pfn. We don't free up the kva vmem space
			 * so the caller can re-use it. The page must be
			 * unmapped before it is given back to the hypervisor.
			 */
			if (kva != NULL) {
				hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
				    PAGESIZE, HAT_UNLOAD_UNMAP);
			}

			/* grab the mfn before the pfn is marked as invalid */
			mfn = pfn_to_mfn(pfn);

			/* mark the pfn as invalid */
			reassign_pfn(pfn, MFN_INVALID);

			/*
			 * if we weren't given an array of MFNs, we need to
			 * free them up one at a time. Otherwise, we'll wait
			 * until later and do it in one hypercall
			 */
			if (mfns == NULL) {
				bzero(&memdec, sizeof (memdec));
				/*LINTED: constant in conditional context*/
				set_xen_guest_handle(memdec.extent_start,
				    &mfn);
				memdec.domid = DOMID_SELF;
				memdec.nr_extents = 1;
				e = HYPERVISOR_memory_op(
				    XENMEM_decrease_reservation, &memdec);
				if (e != 1) {
					cmn_err(CE_PANIC, "balloon: unable to "
					    "give a page back to the "
					    "hypervisor.\n");
				}
			}
		}
	}

	/*
	 * if we were passed in MFNs, we haven't free'd them up yet. We can
	 * do it with one call.
	 */
	if (mfns != NULL) {
		bzero(&memdec, sizeof (memdec));
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(memdec.extent_start, mfns);
		memdec.domid = DOMID_SELF;
		memdec.nr_extents = page_cnt;
		e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
		if (e != page_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to give pages back "
			    "to the hypervisor.\n");
		}
	}

	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
	return (page_cnt);
}
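As a usage note, a hypothetical caller that returns a single page by pfn could look like the sketch below. It is not part of the balloon driver, relies only on the balloon_free_pages() contract documented in the comment above, and the helper name is made up for illustration.

/*
 * Hypothetical caller sketch: give one zeroed, unmapped page back to the
 * hypervisor by pfn.  With no kva and no mfn array, balloon_free_pages()
 * zeroes the page, translates the pfn itself, and issues one hypercall.
 */
static void
give_back_one_page(pfn_t pfn)
{
	long freed;

	freed = balloon_free_pages(1, NULL, NULL, &pfn);
	ASSERT(freed == 1);
}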
/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_GETPAGES.
 */
int
vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int bytecount,
    int reqpage, vop_getpages_iodone_t iodone, void *arg)
{
	vm_object_t object;
	struct bufobj *bo;
	struct buf *bp;
	daddr_t firstaddr, reqblock;
	off_t foff, pib;
	int pbefore, pafter, i, size, bsize, first, last, *freecnt;
	int count, error, before, after, secmask;

	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
	    ("vnode_pager_generic_getpages does not support devices"));
	if (vp->v_iflag & VI_DOOMED)
		return (VM_PAGER_BAD);

	object = vp->v_object;
	count = bytecount / PAGE_SIZE;
	bsize = vp->v_mount->mnt_stat.f_iosize;

	/*
	 * Synchronous and asynchronous paging operations use different
	 * free pbuf counters.  This is done to avoid asynchronous requests
	 * to consume all pbufs.
	 * Allocate the pbuf at the very beginning of the function, so that
	 * if we are low on certain kind of pbufs don't even proceed to BMAP,
	 * but sleep.
	 */
	freecnt = iodone != NULL ?
	    &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt;
	bp = getpbuf(freecnt);

	/*
	 * Get the underlying device blocks for the file with VOP_BMAP().
	 * If the file system doesn't support VOP_BMAP, use old way of
	 * getting pages via VOP_READ.
	 */
	error = VOP_BMAP(vp, IDX_TO_OFF(m[reqpage]->pindex) / bsize, &bo,
	    &reqblock, &after, &before);
	if (error == EOPNOTSUPP) {
		relpbuf(bp, freecnt);
		VM_OBJECT_WLOCK(object);
		for (i = 0; i < count; i++)
			if (i != reqpage) {
				vm_page_lock(m[i]);
				vm_page_free(m[i]);
				vm_page_unlock(m[i]);
			}
		PCPU_INC(cnt.v_vnodein);
		PCPU_INC(cnt.v_vnodepgsin);
		error = vnode_pager_input_old(object, m[reqpage]);
		VM_OBJECT_WUNLOCK(object);
		return (error);
	} else if (error != 0) {
		relpbuf(bp, freecnt);
		vm_pager_free_nonreq(object, m, reqpage, count, FALSE);
		return (VM_PAGER_ERROR);

		/*
		 * If the blocksize is smaller than a page size, then use
		 * special small filesystem code.
		 */
	} else if ((PAGE_SIZE / bsize) > 1) {
		relpbuf(bp, freecnt);
		vm_pager_free_nonreq(object, m, reqpage, count, FALSE);
		PCPU_INC(cnt.v_vnodein);
		PCPU_INC(cnt.v_vnodepgsin);
		return (vnode_pager_input_smlfs(object, m[reqpage]));
	}

	/*
	 * Since the caller has busied the requested page, that page's valid
	 * field will not be changed by other threads.
	 */
	vm_page_assert_xbusied(m[reqpage]);

	/*
	 * If we have a completely valid page available to us, we can
	 * clean up and return.  Otherwise we have to re-read the
	 * media.
	 */
	if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
		relpbuf(bp, freecnt);
		vm_pager_free_nonreq(object, m, reqpage, count, FALSE);
		return (VM_PAGER_OK);
	} else if (reqblock == -1) {
		relpbuf(bp, freecnt);
		pmap_zero_page(m[reqpage]);
		KASSERT(m[reqpage]->dirty == 0,
		    ("vnode_pager_generic_getpages: page %p is dirty", m));
		VM_OBJECT_WLOCK(object);
		m[reqpage]->valid = VM_PAGE_BITS_ALL;
		vm_pager_free_nonreq(object, m, reqpage, count, TRUE);
		VM_OBJECT_WUNLOCK(object);
		return (VM_PAGER_OK);
	} else if (m[reqpage]->valid != 0) {
		VM_OBJECT_WLOCK(object);
		m[reqpage]->valid = 0;
		VM_OBJECT_WUNLOCK(object);
	}

	pib = IDX_TO_OFF(m[reqpage]->pindex) % bsize;
	pbefore = ((daddr_t)before * bsize + pib) / PAGE_SIZE;
	pafter = ((daddr_t)(after + 1) * bsize - pib) / PAGE_SIZE - 1;
	first = reqpage < pbefore ? 0 : reqpage - pbefore;
	last = reqpage + pafter >= count ? count - 1 : reqpage + pafter;
	if (first > 0 || last + 1 < count) {
		VM_OBJECT_WLOCK(object);
		for (i = 0; i < first; i++) {
			vm_page_lock(m[i]);
			vm_page_free(m[i]);
			vm_page_unlock(m[i]);
		}
		for (i = last + 1; i < count; i++) {
			vm_page_lock(m[i]);
			vm_page_free(m[i]);
			vm_page_unlock(m[i]);
		}
		VM_OBJECT_WUNLOCK(object);
	}

	/*
	 * here on direct device I/O
	 */
	firstaddr = reqblock;
	firstaddr += pib / DEV_BSIZE;
	firstaddr -= IDX_TO_OFF(reqpage - first) / DEV_BSIZE;

	/*
	 * The first and last page have been calculated now, move
	 * input pages to be zero based, and adjust the count.
	 */
	m += first;
	reqpage -= first;
	count = last - first + 1;

	/*
	 * calculate the file virtual address for the transfer
	 */
	foff = IDX_TO_OFF(m[0]->pindex);

	/*
	 * calculate the size of the transfer
	 */
	size = count * PAGE_SIZE;
	KASSERT(count > 0, ("zero count"));
	if ((foff + size) > object->un_pager.vnp.vnp_size)
		size = object->un_pager.vnp.vnp_size - foff;
	KASSERT(size > 0, ("zero size"));

	/*
	 * round up physical size for real devices.
	 */
	secmask = bo->bo_bsize - 1;
	KASSERT(secmask < PAGE_SIZE && secmask > 0,
	    ("vnode_pager_generic_getpages: sector size %d too large",
	    secmask + 1));
	size = (size + secmask) & ~secmask;

	/*
	 * and map the pages to be read into the kva, if the filesystem
	 * requires mapped buffers.
	 */
	if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
	    unmapped_buf_allowed) {
		bp->b_data = unmapped_buf;
		bp->b_offset = 0;
	} else {
		bp->b_data = bp->b_kvabase;
		pmap_qenter((vm_offset_t)bp->b_data, m, count);
	}

	/* build a minimal buffer header */
	bp->b_iocmd = BIO_READ;
	KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
	KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
	bp->b_rcred = crhold(curthread->td_ucred);
	bp->b_wcred = crhold(curthread->td_ucred);
	bp->b_blkno = firstaddr;
	pbgetbo(bo, bp);
	bp->b_vp = vp;
	bp->b_bcount = size;
	bp->b_bufsize = size;
	bp->b_runningbufspace = bp->b_bufsize;
	for (i = 0; i < count; i++)
		bp->b_pages[i] = m[i];
	bp->b_npages = count;
	bp->b_pager.pg_reqpage = reqpage;
	atomic_add_long(&runningbufspace, bp->b_runningbufspace);

	PCPU_INC(cnt.v_vnodein);
	PCPU_ADD(cnt.v_vnodepgsin, count);

	/* do the input */
	bp->b_iooffset = dbtob(bp->b_blkno);
	if (iodone != NULL) { /* async */
		bp->b_pager.pg_iodone = iodone;
		bp->b_caller1 = arg;
		bp->b_iodone = vnode_pager_generic_getpages_done_async;
		bp->b_flags |= B_ASYNC;
		BUF_KERNPROC(bp);
		bstrategy(bp);	/* Good bye! */
	} else {
		bp->b_iodone = bdone;
		bstrategy(bp);
		bwait(bp, PVM, "vnread");
		error = vnode_pager_generic_getpages_done(bp);
		for (i = 0; i < bp->b_npages; i++)
			bp->b_pages[i] = NULL;
		bp->b_vp = NULL;
		pbrelbo(bp);
		relpbuf(bp, &vnode_pbuf_freecnt);
	}

	return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
}
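The sector-size round-up near the end of the function is easy to check by hand. Below is a small stand-alone example, purely illustrative and assuming a 512-byte sector device (bo_bsize = 512); it is not part of the pager code.

/*
 * Worked example of the power-of-two round-up used above:
 * size = (size + secmask) & ~secmask rounds a byte count up to the
 * next multiple of the sector size.
 */
#include <assert.h>

int
main(void)
{
	int bo_bsize = 512;
	int secmask = bo_bsize - 1;		/* 0x1ff */
	int size = 3000;			/* bytes requested */

	size = (size + secmask) & ~secmask;	/* rounds up to 3072 */
	assert(size == 3072);
	return (0);
}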