Example #1
/*
 * Set redzones and remember allocation backtrace.
 */
void *
redzone_setup(caddr_t raddr, u_long nsize)
{
	struct stack st;
	caddr_t haddr, faddr;

	atomic_add_long(&redzone_extra_mem, redzone_size_ntor(nsize) - nsize);

	haddr = raddr + redzone_roundup(nsize) - REDZONE_HSIZE;
	faddr = haddr + REDZONE_HSIZE + nsize;

	/* Redzone header. */
	stack_save(&st);
	bcopy(&st, haddr, sizeof(st));
	haddr += sizeof(st);
	bcopy(&nsize, haddr, sizeof(nsize));
	haddr += sizeof(nsize);
	memset(haddr, 0x42, REDZONE_CHSIZE);
	haddr += REDZONE_CHSIZE;

	/* Redzone footer. */
	memset(faddr, 0x42, REDZONE_CFSIZE);

	return (haddr);
}
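The pattern above reserves extra space around the caller's region, stores the saved stack and the requested size in a header, fills guard bytes (0x42) before and after the user data, and returns the pointer just past the header. Below is a minimal user-space sketch of the same canary idea; the GUARD size, CANARY value and the guarded_alloc()/guarded_free() helpers are made up for illustration and are not the kernel's REDZONE_* constants or API.

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#define GUARD	16		/* hypothetical canary length, not REDZONE_CHSIZE */
#define CANARY	0x42

static void *
guarded_alloc(size_t nsize)
{
	/* layout: [header canary][size][user region][footer canary] */
	unsigned char *raw = malloc(GUARD + sizeof(size_t) + nsize + GUARD);

	if (raw == NULL)
		return (NULL);
	memset(raw, CANARY, GUARD);
	memcpy(raw + GUARD, &nsize, sizeof(size_t));
	memset(raw + GUARD + sizeof(size_t) + nsize, CANARY, GUARD);
	return (raw + GUARD + sizeof(size_t));
}

static void
guarded_free(void *p)
{
	unsigned char *user = p;
	unsigned char *head = user - sizeof(size_t) - GUARD;
	size_t nsize, i;

	memcpy(&nsize, user - sizeof(size_t), sizeof(size_t));
	for (i = 0; i < GUARD; i++) {
		assert(head[i] == CANARY);		/* underflow check */
		assert(user[nsize + i] == CANARY);	/* overflow check */
	}
	free(head);
}

int
main(void)
{
	char *p = guarded_alloc(100);

	memset(p, 0, 100);
	guarded_free(p);	/* an overflow would trip the asserts */
	return (0);
}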
Example #2
/*
 * Drop an inode reference, freeing the inode when the last reference goes
 * away.
 */
void
hammer2_inode_drop(hammer2_inode_t *ip)
{
	hammer2_pfs_t *pmp;
	u_int refs;

	while (ip) {
		if (hammer2_debug & 0x80000) {
			kprintf("INODE-1 %p (%d->%d)\n",
				ip, ip->refs, ip->refs - 1);
			print_backtrace(8);
		}
		refs = ip->refs;
		cpu_ccfence();
		if (refs == 1) {
			/*
			 * Transition to zero, must interlock with
			 * the inode inumber lookup tree (if applicable).
			 * It should not be possible for anyone to race
			 * the transition to 0.
			 */
			pmp = ip->pmp;
			KKASSERT(pmp);
			hammer2_spin_ex(&pmp->inum_spin);

			if (atomic_cmpset_int(&ip->refs, 1, 0)) {
				KKASSERT(hammer2_mtx_refs(&ip->lock) == 0);
				if (ip->flags & HAMMER2_INODE_ONRBTREE) {
					atomic_clear_int(&ip->flags,
						     HAMMER2_INODE_ONRBTREE);
					RB_REMOVE(hammer2_inode_tree,
						  &pmp->inum_tree, ip);
					--pmp->inum_count;
				}
				hammer2_spin_unex(&pmp->inum_spin);

				ip->pmp = NULL;

				/*
				 * Cleaning out ip->cluster isn't entirely
				 * trivial.
				 */
				hammer2_inode_repoint(ip, NULL, NULL);

				kfree(ip, pmp->minode);
				atomic_add_long(&pmp->inmem_inodes, -1);
				ip = NULL;	/* will terminate loop */
			} else {
				hammer2_spin_unex(&ip->pmp->inum_spin);
			}
		} else {
			/*
			 * Non zero transition
			 */
			if (atomic_cmpset_int(&ip->refs, refs, refs - 1))
				break;
		}
	}
}
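The interesting part of the drop path is that only the 1->0 transition needs the inum spinlock, so a concurrent lookup cannot hand out a new reference to an inode that is being torn down; every other transition is a plain compare-and-swap decrement. A minimal sketch of the same loop, assuming C11 atomics and a pthread mutex in place of the spinlock; struct obj, obj_drop() and remove_from_index() are hypothetical names, not the HAMMER2 structures.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_uint refs;
	/* ... payload ... */
};

static pthread_mutex_t lookup_lock = PTHREAD_MUTEX_INITIALIZER;

static void
remove_from_index(struct obj *o)
{
	/* would unhook o from whatever table the lookup side uses */
	(void)o;
}

static void
obj_drop(struct obj *o)
{
	unsigned int refs;

	while (o) {
		refs = atomic_load(&o->refs);
		if (refs == 1) {
			/*
			 * 1->0 transition: hold the lookup lock so a
			 * concurrent lookup cannot gain a new reference
			 * while the object is being torn down.
			 */
			pthread_mutex_lock(&lookup_lock);
			if (atomic_compare_exchange_strong(&o->refs,
							   &refs, 0)) {
				remove_from_index(o);
				pthread_mutex_unlock(&lookup_lock);
				free(o);
				o = NULL;	/* terminates the loop */
			} else {
				pthread_mutex_unlock(&lookup_lock);
			}
		} else {
			/* ordinary decrement, no interlock needed */
			if (atomic_compare_exchange_strong(&o->refs,
							   &refs, refs - 1))
				break;
		}
	}
}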
Example #3
static
void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
    int p;
    int cpu = mycpu->gd_cpuid;

    /*
     * Don't kprintf() anything if the pmap inval watchdog gets hit.
 * DRM can cause an occasional watchdog hit (at least with a 1/16
     * second watchdog), and attempting to kprintf to the KVM frame buffer
     * from Xinvltlb, which ignores critical sections, can implode the
     * system.
     */
    if (pmap_inval_watchdog_print == 0)
        return;

    cpu_lfence();
#ifdef LOOPRECOVER
    atomic_add_long(&smp_smurf_mask.ary[0], 0);
#endif
    kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx "
#ifdef LOOPRECOVER
            "s=%08jx "
#endif
#ifdef LOOPMASK_IN
            "in=%08jx "
#endif
#ifdef LOOPRECOVER
            "smurf=%08jx\n"
#endif
            , msg, cpu, info->mode,
            info->mask.ary[0],
            info->done.ary[0]
#ifdef LOOPRECOVER
            , info->sigmask.ary[0]
#endif
#ifdef LOOPMASK_IN
            , smp_in_mask.ary[0]
#endif
#ifdef LOOPRECOVER
            , smp_smurf_mask.ary[0]
#endif
           );
    kprintf("mdglob ");
    for (p = 0; p < ncpus; ++p)
        kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
    kprintf("\n");
}
Example #4
int
chgsbsize(struct uidinfo *uip, u_long *hiwat, u_long to, rlim_t xmax)
{
	rlim_t nsb;
	const long diff = to - *hiwat;

	nsb = (rlim_t)atomic_add_long_nv((long *)&uip->ui_sbsize, diff);
	if (diff > 0 && nsb > xmax) {
		atomic_add_long((long *)&uip->ui_sbsize, -diff);
		return 0;
	}
	*hiwat = to;
	KASSERT(nsb >= 0);
	return 1;
}
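chgsbsize() charges the delta optimistically with atomic_add_long_nv() (add and return the new value) and rolls the delta back if the new total would exceed the limit. Here is a minimal sketch of the same accept-or-roll-back accounting using C11 atomics; charge_bytes() and total_bytes are hypothetical names standing in for the ui_sbsize bookkeeping.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_long total_bytes;		/* stands in for uip->ui_sbsize */

/*
 * Try to move an accounted quantity from *hiwat to 'to', honoring 'max'.
 * Returns true and updates *hiwat on success; returns false and rolls the
 * delta back if the growth would push the shared total over the limit.
 */
static bool
charge_bytes(unsigned long *hiwat, unsigned long to, long max)
{
	long diff = (long)to - (long)*hiwat;
	long nsb = atomic_fetch_add(&total_bytes, diff) + diff;

	if (diff > 0 && nsb > max) {
		atomic_fetch_add(&total_bytes, -diff);	/* roll back */
		return (false);
	}
	*hiwat = to;
	return (true);
}

As in the original, the roll-back leaves a brief window where the shared total transiently exceeds the limit, which is acceptable for this kind of resource accounting.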
Example #5
/*
 * balloon_alloc_pages()
 *	Allocate page_cnt mfns.  mfns storage provided by the caller.  Returns
 *	the number of pages allocated, which could be less than page_cnt, or
 *	a negative number if an error occurred.
 */
long
balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns)
{
	xen_memory_reservation_t memres;
	long rv;

	bzero(&memres, sizeof (memres));
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(memres.extent_start, mfns);
	memres.domid = DOMID_SELF;
	memres.nr_extents = page_cnt;

	rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
	if (rv > 0)
		atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv);
	return (rv);
}
Example #6
static void
intr_stray_handler(void *cookie)
{
	struct	intr_handler *ih;

	ih = (struct intr_handler *)cookie;

	if (intr_stray_count[ih->ih_irq] < MAX_STRAY_LOG) {
		printf("stray irq %d\n", ih->ih_irq);

		atomic_add_long(&intr_stray_count[ih->ih_irq], 1);

		if (intr_stray_count[ih->ih_irq] >= MAX_STRAY_LOG)
			printf("got %d stray irq %d's: not logging anymore\n",
			    MAX_STRAY_LOG, ih->ih_irq);
	}
}
Example #7
/*
 * Allocate zeroed memory if tmpfs_maxkmem has not been exceeded
 * or the 'musthave' flag is set.  'musthave' allocations should
 * always be subordinate to normal allocations so that tmpfs_maxkmem
 * can't be exceeded by more than a few KB.  Example: when creating
 * a new directory, the tmpnode is a normal allocation; if that
 * succeeds, the dirents for "." and ".." are 'musthave' allocations.
 */
void *
tmp_memalloc(size_t size, int musthave)
{
	static time_t last_warning;
	time_t now;

	if (atomic_add_long_nv(&tmp_kmemspace, size) < tmpfs_maxkmem ||
	    musthave)
		return (kmem_zalloc(size, KM_SLEEP));

	atomic_add_long(&tmp_kmemspace, -size);
	now = gethrestime_sec();
	if (last_warning != now) {
		last_warning = now;
		cmn_err(CE_WARN, "tmp_memalloc: tmpfs over memory limit");
	}
	return (NULL);
}
Example #8
int
px_fdvma_release(dev_info_t *dip, px_t *px_p, ddi_dma_impl_t *mp)
{
	px_mmu_t *mmu_p = px_p->px_mmu_p;
	size_t npages;
	fdvma_t *fdvma_p = (fdvma_t *)mp->dmai_fdvma;

	if (px_disable_fdvma)
		return (DDI_FAILURE);

	/* validate fdvma handle */
	if (!(mp->dmai_rflags & DMP_BYPASSNEXUS)) {
		DBG(DBG_DMA_CTL, dip, "DDI_DMA_RELEASE: not fast dma\n");
		return (DDI_FAILURE);
	}

	/* flush all reserved dvma addresses from mmu */
	px_mmu_unmap_window(mmu_p, mp);

	npages = mp->dmai_ndvmapages;
	vmem_xfree(mmu_p->mmu_dvma_map, (void *)mp->dmai_mapping,
	    MMU_PTOB(npages));

	atomic_add_long(&mmu_p->mmu_dvma_reserve, npages);
	mp->dmai_ndvmapages = 0;

	/* see if there is anyone waiting for dvma space */
	if (mmu_p->mmu_dvma_clid != 0) {
		DBG(DBG_DMA_CTL, dip, "run dvma callback\n");
		ddi_run_callback(&mmu_p->mmu_dvma_clid);
	}

	/* free data structures */
	kmem_free(fdvma_p->pagecnt, npages * sizeof (uint_t));
	kmem_free(fdvma_p, sizeof (fdvma_t));
	kmem_free(mp, sizeof (px_dma_hdl_t));

	/* see if there is anyone waiting for kmem */
	if (px_kmem_clid != 0) {
		DBG(DBG_DMA_CTL, dip, "run handle callback\n");
		ddi_run_callback(&px_kmem_clid);
	}
	return (DDI_SUCCESS);
}
Example #9
struct cdevsw *
devvn_refthread(struct vnode *vp, struct cdev **devp, int *ref)
{
	struct cdevsw *csw;
	struct cdev_priv *cdp;
	struct cdev *dev;

	mtx_assert(&devmtx, MA_NOTOWNED);
	if ((vp->v_vflag & VV_ETERNALDEV) != 0) {
		dev = vp->v_rdev;
		if (dev == NULL)
			return (NULL);
		KASSERT((dev->si_flags & SI_ETERNAL) != 0,
		    ("Not eternal cdev"));
		*ref = 0;
		csw = dev->si_devsw;
		KASSERT(csw != NULL, ("Eternal cdev is destroyed"));
		*devp = dev;
		return (csw);
	}

	csw = NULL;
	dev_lock();
	dev = vp->v_rdev;
	if (dev == NULL) {
		dev_unlock();
		return (NULL);
	}
	cdp = cdev2priv(dev);
	if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0) {
		csw = dev->si_devsw;
		if (csw != NULL)
			atomic_add_long(&dev->si_threadcount, 1);
	}
	dev_unlock();
	if (csw != NULL) {
		*devp = dev;
		*ref = 1;
	}
	return (csw);
}
Example #10
/*
 * This routine destroys all the resources of an rnode
 * and finally the rnode itself.
 */
static void
destroy_rnode4(rnode4_t *rp)
{
	vnode_t *vp;
	vfs_t *vfsp;

	ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);

	vp = RTOV4(rp);
	vfsp = vp->v_vfsp;

	uninit_rnode4(rp);
	atomic_add_long((ulong_t *)&rnode4_new, -1);
#ifdef DEBUG
	clstat4_debug.nrnode.value.ui64--;
#endif
	kmem_cache_free(rnode4_cache, rp);
	vn_invalid(vp);
	vn_free(vp);
	VFS_RELE(vfsp);
}
Example #11
struct cdevsw *
dev_refthread(struct cdev *dev, int *ref)
{
	struct cdevsw *csw;
	struct cdev_priv *cdp;

	mtx_assert(&devmtx, MA_NOTOWNED);
	if ((dev->si_flags & SI_ETERNAL) != 0) {
		*ref = 0;
		return (dev->si_devsw);
	}
	dev_lock();
	csw = dev->si_devsw;
	if (csw != NULL) {
		cdp = cdev2priv(dev);
		if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0)
			atomic_add_long(&dev->si_threadcount, 1);
		else
			csw = NULL;
	}
	dev_unlock();
	*ref = 1;
	return (csw);
}
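dev_refthread() only bumps si_threadcount under dev_lock() and only while the cdev is not already scheduled for destruction; the destroy side can then wait for the count to drain. A minimal sketch of that gate-then-count idea, assuming C11 atomics and a pthread mutex in place of dev_lock(); struct dev_stub and its functions are hypothetical, not the FreeBSD cdev API.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct dev_stub {
	atomic_long threadcount;	/* in-flight users, cf. si_threadcount */
	bool dying;			/* cf. CDP_SCHED_DTR, protected by lock */
	pthread_mutex_t lock;		/* stands in for dev_lock() */
};

/* Returns true if the caller now holds a use reference. */
static bool
dev_stub_ref(struct dev_stub *d)
{
	bool ok = false;

	pthread_mutex_lock(&d->lock);
	if (!d->dying) {
		atomic_fetch_add(&d->threadcount, 1);
		ok = true;
	}
	pthread_mutex_unlock(&d->lock);
	return (ok);
}

static void
dev_stub_rel(struct dev_stub *d)
{
	atomic_fetch_add(&d->threadcount, -1);
	/* a real implementation would wake a waiting destroyer here */
}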
Example #12
/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_GETPAGES.
 */
int
vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count,
    int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg)
{
	vm_object_t object;
	struct bufobj *bo;
	struct buf *bp;
	off_t foff;
#ifdef INVARIANTS
	off_t blkno0;
#endif
	int bsize, pagesperblock, *freecnt;
	int error, before, after, rbehind, rahead, poff, i;
	int bytecount, secmask;

	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
	    ("%s does not support devices", __func__));

	if (vp->v_iflag & VI_DOOMED)
		return (VM_PAGER_BAD);

	object = vp->v_object;
	foff = IDX_TO_OFF(m[0]->pindex);
	bsize = vp->v_mount->mnt_stat.f_iosize;
	pagesperblock = bsize / PAGE_SIZE;

	KASSERT(foff < object->un_pager.vnp.vnp_size,
	    ("%s: page %p offset beyond vp %p size", __func__, m[0], vp));
	KASSERT(count <= sizeof(bp->b_pages),
	    ("%s: requested %d pages", __func__, count));

	/*
	 * The last page has valid blocks.  Invalid part can only
	 * exist at the end of file, and the page is made fully valid
	 * by zeroing in vm_pager_get_pages().
	 */
	if (m[count - 1]->valid != 0 && --count == 0) {
		if (iodone != NULL)
			iodone(arg, m, 1, 0);
		return (VM_PAGER_OK);
	}

	/*
	 * Synchronous and asynchronous paging operations use different
	 * free pbuf counters.  This is done to prevent asynchronous requests
	 * from consuming all pbufs.
	 * Allocate the pbuf at the very beginning of the function, so that
	 * if we are low on a certain kind of pbuf we don't even proceed to
	 * BMAP, but sleep.
	 */
	freecnt = iodone != NULL ?
	    &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt;
	bp = getpbuf(freecnt);

	/*
	 * Get the underlying device blocks for the file with VOP_BMAP().
	 * If the file system doesn't support VOP_BMAP, use old way of
	 * getting pages via VOP_READ.
	 */
	error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before);
	if (error == EOPNOTSUPP) {
		relpbuf(bp, freecnt);
		VM_OBJECT_WLOCK(object);
		for (i = 0; i < count; i++) {
			PCPU_INC(cnt.v_vnodein);
			PCPU_INC(cnt.v_vnodepgsin);
			error = vnode_pager_input_old(object, m[i]);
			if (error)
				break;
		}
		VM_OBJECT_WUNLOCK(object);
		return (error);
	} else if (error != 0) {
		relpbuf(bp, freecnt);
		return (VM_PAGER_ERROR);
	}

	/*
	 * If the file system supports BMAP, but blocksize is smaller
	 * than a page size, then use special small filesystem code.
	 */
	if (pagesperblock == 0) {
		relpbuf(bp, freecnt);
		for (i = 0; i < count; i++) {
			PCPU_INC(cnt.v_vnodein);
			PCPU_INC(cnt.v_vnodepgsin);
			error = vnode_pager_input_smlfs(object, m[i]);
			if (error)
				break;
		}
		return (error);
	}

	/*
	 * A sparse file can be encountered only for a single page request,
	 * which may not be preceded by a call to vm_pager_haspage().
	 */
	if (bp->b_blkno == -1) {
		KASSERT(count == 1,
		    ("%s: array[%d] request to a sparse file %p", __func__,
		    count, vp));
		relpbuf(bp, freecnt);
		pmap_zero_page(m[0]);
		KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty",
		    __func__, m[0]));
		VM_OBJECT_WLOCK(object);
		m[0]->valid = VM_PAGE_BITS_ALL;
		VM_OBJECT_WUNLOCK(object);
		return (VM_PAGER_OK);
	}

#ifdef INVARIANTS
	blkno0 = bp->b_blkno;
#endif
	bp->b_blkno += (foff % bsize) / DEV_BSIZE;

	/* Recalculate blocks available after/before to pages. */
	poff = (foff % bsize) / PAGE_SIZE;
	before *= pagesperblock;
	before += poff;
	after *= pagesperblock;
	after += pagesperblock - (poff + 1);
	if (m[0]->pindex + after >= object->size)
		after = object->size - 1 - m[0]->pindex;
	KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d",
	    __func__, count, after + 1));
	after -= count - 1;

	/* Trim requested rbehind/rahead to possible values. */   
	rbehind = a_rbehind ? *a_rbehind : 0;
	rahead = a_rahead ? *a_rahead : 0;
	rbehind = min(rbehind, before);
	rbehind = min(rbehind, m[0]->pindex);
	rahead = min(rahead, after);
	rahead = min(rahead, object->size - m[count - 1]->pindex);
	/*
	 * Check that total amount of pages fit into buf.  Trim rbehind and
	 * rahead evenly if not.
	 */
	if (rbehind + rahead + count > nitems(bp->b_pages)) {
		int trim, sum;

		trim = rbehind + rahead + count - nitems(bp->b_pages) + 1;
		sum = rbehind + rahead;
		if (rbehind == before) {
			/* Roundup rbehind trim to block size. */
			rbehind -= roundup(trim * rbehind / sum, pagesperblock);
			if (rbehind < 0)
				rbehind = 0;
		} else
			rbehind -= trim * rbehind / sum;
		rahead -= trim * rahead / sum;
	}
	KASSERT(rbehind + rahead + count <= nitems(bp->b_pages),
	    ("%s: behind %d ahead %d count %d", __func__,
	    rbehind, rahead, count));

	/*
	 * Fill in the bp->b_pages[] array with requested and optional   
	 * read behind or read ahead pages.  Read behind pages are looked
	 * up in a backward direction, down to a first cached page.  Same
	 * for read ahead pages, but there is no need to shift the array
	 * in case of encountering a cached page.
	 */
	i = bp->b_npages = 0;
	if (rbehind) {
		vm_pindex_t startpindex, tpindex;
		vm_page_t p;

		VM_OBJECT_WLOCK(object);
		startpindex = m[0]->pindex - rbehind;
		if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL &&
		    p->pindex >= startpindex)
			startpindex = p->pindex + 1;

		/* tpindex is unsigned; beware of numeric underflow. */
		for (tpindex = m[0]->pindex - 1;
		    tpindex >= startpindex && tpindex < m[0]->pindex;
		    tpindex--, i++) {
			p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
			if (p == NULL) {
				/* Shift the array. */
				for (int j = 0; j < i; j++)
					bp->b_pages[j] = bp->b_pages[j + 
					    tpindex + 1 - startpindex]; 
				break;
			}
			bp->b_pages[tpindex - startpindex] = p;
		}

		bp->b_pgbefore = i;
		bp->b_npages += i;
		bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE;
	} else
		bp->b_pgbefore = 0;

	/* Requested pages. */
	for (int j = 0; j < count; j++, i++)
		bp->b_pages[i] = m[j];
	bp->b_npages += count;

	if (rahead) {
		vm_pindex_t endpindex, tpindex;
		vm_page_t p;

		if (!VM_OBJECT_WOWNED(object))
			VM_OBJECT_WLOCK(object);
		endpindex = m[count - 1]->pindex + rahead + 1;
		if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL &&
		    p->pindex < endpindex)
			endpindex = p->pindex;
		if (endpindex > object->size)
			endpindex = object->size;

		for (tpindex = m[count - 1]->pindex + 1;
		    tpindex < endpindex; i++, tpindex++) {
			p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
			if (p == NULL)
				break;
			bp->b_pages[i] = p;
		}

		bp->b_pgafter = i - bp->b_npages;
		bp->b_npages = i;
	} else
		bp->b_pgafter = 0;

	if (VM_OBJECT_WOWNED(object))
		VM_OBJECT_WUNLOCK(object);

	/* Report back actual behind/ahead read. */
	if (a_rbehind)
		*a_rbehind = bp->b_pgbefore;
	if (a_rahead)
		*a_rahead = bp->b_pgafter;

#ifdef INVARIANTS
	KASSERT(bp->b_npages <= nitems(bp->b_pages),
	    ("%s: buf %p overflowed", __func__, bp));
	for (int j = 1; j < bp->b_npages; j++)
		KASSERT(bp->b_pages[j]->pindex - 1 ==
		    bp->b_pages[j - 1]->pindex,
		    ("%s: pages array not consecutive, bp %p", __func__, bp));
#endif

	/*
	 * Recalculate first offset and bytecount with regards to read behind.
	 * Truncate bytecount to vnode real size and round up physical size
	 * for real devices.
	 */
	foff = IDX_TO_OFF(bp->b_pages[0]->pindex);
	bytecount = bp->b_npages << PAGE_SHIFT;
	if ((foff + bytecount) > object->un_pager.vnp.vnp_size)
		bytecount = object->un_pager.vnp.vnp_size - foff;
	secmask = bo->bo_bsize - 1;
	KASSERT(secmask < PAGE_SIZE && secmask > 0,
	    ("%s: sector size %d too large", __func__, secmask + 1));
	bytecount = (bytecount + secmask) & ~secmask;

	/*
	 * And map the pages to be read into the kva, if the filesystem
	 * requires mapped buffers.
	 */
	if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
	    unmapped_buf_allowed) {
		bp->b_data = unmapped_buf;
		bp->b_offset = 0;
	} else {
		bp->b_data = bp->b_kvabase;
		pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
	}

	/* Build a minimal buffer header. */
	bp->b_iocmd = BIO_READ;
	KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
	KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
	bp->b_rcred = crhold(curthread->td_ucred);
	bp->b_wcred = crhold(curthread->td_ucred);
	pbgetbo(bo, bp);
	bp->b_vp = vp;
	bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount;
	bp->b_iooffset = dbtob(bp->b_blkno);
	KASSERT(IDX_TO_OFF(m[0]->pindex - bp->b_pages[0]->pindex) ==
	    (blkno0 - bp->b_blkno) * DEV_BSIZE +
	    IDX_TO_OFF(m[0]->pindex) % bsize,
	    ("wrong offsets bsize %d m[0] %ju b_pages[0] %ju "
	    "blkno0 %ju b_blkno %ju", bsize,
	    (uintmax_t)m[0]->pindex, (uintmax_t)bp->b_pages[0]->pindex,
	    (uintmax_t)blkno0, (uintmax_t)bp->b_blkno));

	atomic_add_long(&runningbufspace, bp->b_runningbufspace);
	PCPU_INC(cnt.v_vnodein);
	PCPU_ADD(cnt.v_vnodepgsin, bp->b_npages);

	if (iodone != NULL) { /* async */
		bp->b_pgiodone = iodone;
		bp->b_caller1 = arg;
		bp->b_iodone = vnode_pager_generic_getpages_done_async;
		bp->b_flags |= B_ASYNC;
		BUF_KERNPROC(bp);
		bstrategy(bp);
		return (VM_PAGER_OK);
	} else {
		bp->b_iodone = bdone;
		bstrategy(bp);
		bwait(bp, PVM, "vnread");
		error = vnode_pager_generic_getpages_done(bp);
		for (i = 0; i < bp->b_npages; i++)
			bp->b_pages[i] = NULL;
		bp->b_vp = NULL;
		pbrelbo(bp);
		relpbuf(bp, &vnode_pbuf_freecnt);
		return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
	}
}
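One step that is easy to miss above is how rbehind and rahead are clipped so that rbehind + rahead + count never exceeds the b_pages[] capacity: the overshoot (plus one, to absorb integer truncation) is split between the two sides in proportion to their size. Below is a small stand-alone sketch of just that arithmetic; trim_window() is a hypothetical helper, and the block-size round-up applied to the rbehind side in the real code is left out.

#include <stdio.h>

/* Clip *rbehind/*rahead so that *rbehind + *rahead + count <= capacity. */
static void
trim_window(int *rbehind, int *rahead, int count, int capacity)
{

	if (*rbehind + *rahead + count > capacity) {
		int trim = *rbehind + *rahead + count - capacity + 1;
		int sum = *rbehind + *rahead;

		*rbehind -= trim * *rbehind / sum;
		*rahead -= trim * *rahead / sum;
	}
}

int
main(void)
{
	int rbehind = 40, rahead = 120, count = 8;

	trim_window(&rbehind, &rahead, count, 32);
	/* prints "behind 6 ahead 18 count 8": 6 + 18 + 8 == 32 */
	printf("behind %d ahead %d count %d\n", rbehind, rahead, count);
	return (0);
}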
Example #13
void
tmp_memfree(void *cp, size_t size)
{
	kmem_free(cp, size);
	atomic_add_long(&tmp_kmemspace, -size);
}
Example #14
/*
 * small block filesystem vnode pager input
 */
static int
vnode_pager_input_smlfs(vm_object_t object, vm_page_t m)
{
	struct vnode *vp;
	struct bufobj *bo;
	struct buf *bp;
	struct sf_buf *sf;
	daddr_t fileaddr;
	vm_offset_t bsize;
	vm_page_bits_t bits;
	int error, i;

	error = 0;
	vp = object->handle;
	if (vp->v_iflag & VI_DOOMED)
		return VM_PAGER_BAD;

	bsize = vp->v_mount->mnt_stat.f_iosize;

	VOP_BMAP(vp, 0, &bo, 0, NULL, NULL);

	sf = sf_buf_alloc(m, 0);

	for (i = 0; i < PAGE_SIZE / bsize; i++) {
		vm_ooffset_t address;

		bits = vm_page_bits(i * bsize, bsize);
		if (m->valid & bits)
			continue;

		address = IDX_TO_OFF(m->pindex) + i * bsize;
		if (address >= object->un_pager.vnp.vnp_size) {
			fileaddr = -1;
		} else {
			error = vnode_pager_addr(vp, address, &fileaddr, NULL);
			if (error)
				break;
		}
		if (fileaddr != -1) {
			bp = getpbuf(&vnode_pbuf_freecnt);

			/* build a minimal buffer header */
			bp->b_iocmd = BIO_READ;
			bp->b_iodone = bdone;
			KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
			KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
			bp->b_rcred = crhold(curthread->td_ucred);
			bp->b_wcred = crhold(curthread->td_ucred);
			bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize;
			bp->b_blkno = fileaddr;
			pbgetbo(bo, bp);
			bp->b_vp = vp;
			bp->b_bcount = bsize;
			bp->b_bufsize = bsize;
			bp->b_runningbufspace = bp->b_bufsize;
			atomic_add_long(&runningbufspace, bp->b_runningbufspace);

			/* do the input */
			bp->b_iooffset = dbtob(bp->b_blkno);
			bstrategy(bp);

			bwait(bp, PVM, "vnsrd");

			if ((bp->b_ioflags & BIO_ERROR) != 0)
				error = EIO;

			/*
			 * free the buffer header back to the swap buffer pool
			 */
			bp->b_vp = NULL;
			pbrelbo(bp);
			relpbuf(bp, &vnode_pbuf_freecnt);
			if (error)
				break;
		} else
			bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize);
		KASSERT((m->dirty & bits) == 0,
		    ("vnode_pager_input_smlfs: page %p is dirty", m));
		VM_OBJECT_WLOCK(object);
		m->valid |= bits;
		VM_OBJECT_WUNLOCK(object);
	}
	sf_buf_free(sf);
	if (error) {
		return VM_PAGER_ERROR;
	}
	return VM_PAGER_OK;
}
Example #15
/*
 * Attempt to acquire a shared or exclusive token.  Returns TRUE on success,
 * FALSE on failure.
 *
 * If TOK_EXCLUSIVE is set in mode we are attempting to get an exclusive
 * token, otherwise are attempting to get a shared token.
 *
 * If TOK_EXCLREQ is set in mode this is a blocking operation, otherwise
 * it is a non-blocking operation (for both exclusive and shared acquisitions).
 */
static __inline
int
_lwkt_trytokref(lwkt_tokref_t ref, thread_t td, long mode)
{
	lwkt_token_t tok;
	lwkt_tokref_t oref;
	long count;

	tok = ref->tr_tok;
	KASSERT(((mode & TOK_EXCLREQ) == 0 ||	/* non blocking */
		td->td_gd->gd_intr_nesting_level == 0 ||
		panic_cpu_gd == mycpu),
		("Attempt to acquire token %p not already "
		"held in hard code section", tok));

	if (mode & TOK_EXCLUSIVE) {
		/*
		 * Attempt to get an exclusive token
		 */
		for (;;) {
			count = tok->t_count;
			oref = tok->t_ref;	/* can be NULL */
			cpu_ccfence();
			if ((count & ~TOK_EXCLREQ) == 0) {
				/*
				 * It is possible to get the exclusive bit.
				 * We must clear TOK_EXCLREQ on successful
				 * acquisition.
				 */
				if (atomic_cmpset_long(&tok->t_count, count,
						       (count & ~TOK_EXCLREQ) |
						       TOK_EXCLUSIVE)) {
					KKASSERT(tok->t_ref == NULL);
					tok->t_ref = ref;
					return TRUE;
				}
				/* retry */
			} else if ((count & TOK_EXCLUSIVE) &&
				   oref >= &td->td_toks_base &&
				   oref < td->td_toks_stop) {
				/*
				 * Our thread already holds the exclusive
				 * bit, we treat this tokref as a shared
				 * token (sorta) to make the token release
				 * code easier.
				 *
				 * NOTE: oref cannot race above if it
				 *	 happens to be ours, so we're good.
				 *	 But we must still have a stable
				 *	 variable for both parts of the
				 *	 comparison.
				 *
				 * NOTE: Since we already have an exclusive
				 *	 lock and don't need to check EXCLREQ
				 *	 we can just use an atomic_add here
				 */
				atomic_add_long(&tok->t_count, TOK_INCR);
				ref->tr_count &= ~TOK_EXCLUSIVE;
				return TRUE;
			} else if ((mode & TOK_EXCLREQ) &&
				   (count & TOK_EXCLREQ) == 0) {
				/*
				 * Unable to get the exclusive bit but being
				 * asked to set the exclusive-request bit.
				 * Since we are going to retry anyway just
				 * set the bit unconditionally.
				 */
				atomic_set_long(&tok->t_count, TOK_EXCLREQ);
				return FALSE;
			} else {
				/*
				 * Unable to get the exclusive bit and not
				 * being asked to set the exclusive-request
				 * (aka lwkt_trytoken()), or EXCLREQ was
				 * already set.
				 */
				cpu_pause();
				return FALSE;
			}
			/* retry */
		}
	} else {
		/*
		 * Attempt to get a shared token.  Note that TOK_EXCLREQ
		 * for shared tokens simply means the caller intends to
		 * block.  We never actually set the bit in tok->t_count.
		 */
		for (;;) {
			count = tok->t_count;
			oref = tok->t_ref;	/* can be NULL */
			cpu_ccfence();
			if ((count & (TOK_EXCLUSIVE/*|TOK_EXCLREQ*/)) == 0) {
				/*
				 * It may be possible to get the token shared.
				 */
				if ((atomic_fetchadd_long(&tok->t_count, TOK_INCR) & TOK_EXCLUSIVE) == 0) {
					return TRUE;
				}
				atomic_fetchadd_long(&tok->t_count, -TOK_INCR);
				/* retry */
			} else if ((count & TOK_EXCLUSIVE) &&
				   oref >= &td->td_toks_base &&
				   oref < td->td_toks_stop) {
				/*
				 * We own the exclusive bit on the token so
				 * we can in fact also get it shared.
				 */
				atomic_add_long(&tok->t_count, TOK_INCR);
				return TRUE;
			} else {
				/*
				 * We failed to get the token shared
				 */
				return FALSE;
			}
			/* retry */
		}
	}
}
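The shared path above is the classic optimistic reader acquire: bump the count with a fetch-add, and if the exclusive bit turns out to have been set concurrently, back the increment out and fail. A minimal sketch of that add-then-undo scheme with C11 atomics; the EXCL/RD_INCR layout and the rwcount_t type are hypothetical, not the lwkt token encoding.

#include <stdatomic.h>
#include <stdbool.h>

#define EXCL	0x1L		/* exclusive-holder bit */
#define RD_INCR	0x2L		/* one reader */

typedef atomic_long rwcount_t;

/* Try to acquire shared; never blocks. */
static bool
try_shared(rwcount_t *c)
{

	if ((atomic_load(c) & EXCL) == 0) {
		/* Optimistically add a reader ... */
		if ((atomic_fetch_add(c, RD_INCR) & EXCL) == 0)
			return (true);
		/* ... and back it out if a writer snuck in meanwhile. */
		atomic_fetch_add(c, -RD_INCR);
	}
	return (false);
}

/* Try to acquire exclusive; only succeeds when nobody holds the count. */
static bool
try_excl(rwcount_t *c)
{
	long expected = 0;

	return (atomic_compare_exchange_strong(c, &expected, EXCL));
}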
Example #16
/*
 * Disable logging
 */
int
lqfs_disable(vnode_t *vp, struct fiolog *flp)
{
	int		error = 0;
	inode_t		*ip = VTOI(vp);
	qfsvfs_t	*qfsvfsp = ip->i_qfsvfs;
	fs_lqfs_common_t	*fs = VFS_FS_PTR(qfsvfsp);
#ifdef LUFS
	struct lockfs	lf;
	struct ulockfs	*ulp;
#else
	/* QFS doesn't really support LOCKFS. */
#endif /* LUFS */

	flp->error = FIOLOG_ENONE;

	/*
	 * Logging is already disabled; done
	 */
	if (LQFS_GET_LOGBNO(fs) == 0 || LQFS_GET_LOGP(qfsvfsp) == NULL ||
	    !LQFS_CAPABLE(qfsvfsp)) {
		vfs_setmntopt(qfsvfsp->vfs_vfs, MNTOPT_NOLOGGING, NULL, 0);
		error = 0;
		goto out;
	}

#ifdef LUFS
	/*
	 * File system must be write locked to disable logging
	 */
	error = qfs_fiolfss(vp, &lf);
	if (error) {
		goto out;
	}
	if (!LOCKFS_IS_ULOCK(&lf)) {
		flp->error = FIOLOG_EULOCK;
		error = 0;
		goto out;
	}
	lf.lf_lock = LOCKFS_WLOCK;
	lf.lf_flags = 0;
	lf.lf_comment = NULL;
	error = qfs_fiolfs(vp, &lf, 1);
	if (error) {
		flp->error = FIOLOG_EWLOCK;
		error = 0;
		goto out;
	}
#else
	/* QFS doesn't really support LOCKFS. */
#endif /* LUFS */

	if (LQFS_GET_LOGP(qfsvfsp) == NULL || LQFS_GET_LOGBNO(fs) == 0) {
		goto errout;
	}

	/*
	 * WE ARE COMMITTED TO DISABLING LOGGING PAST THIS POINT
	 */

	/*
	 * Disable logging:
	 * Suspend the reclaim thread and force the delete thread to exit.
	 *	When a nologging mount has completed there may still be
	 *	work for reclaim to do so just suspend this thread until
	 *	it's [deadlock-] safe for it to continue.  The delete
	 *	thread won't be needed as qfs_iinactive() calls
	 *	qfs_delete() when logging is disabled.
	 * Freeze and drain reader ops.
	 *	Commit any outstanding reader transactions (lqfs_flush).
	 *	Set the ``unmounted'' bit in the qfstrans struct.
	 *	If debug, remove metadata from matamap.
	 *	Disable matamap processing.
	 *	NULL the trans ops table.
	 *	Free all of the incore structs related to logging.
	 * Allow reader ops.
	 */
#ifdef LUFS
	qfs_thread_suspend(&qfsvfsp->vfs_reclaim);
	qfs_thread_exit(&qfsvfsp->vfs_delete);
#else
	/* QFS doesn't have file reclaim nor i-node delete threads. */
#endif /* LUFS */

	vfs_lock_wait(qfsvfsp->vfs_vfs);
#ifdef LQFS_TODO_LOCKFS
	ulp = &qfsvfsp->vfs_ulockfs;
	mutex_enter(&ulp->ul_lock);
	(void) qfs_quiesce(ulp);
#else
	/* QFS doesn't really support LOCKFS. */
#endif /* LQFS_TODO_LOCKFS */

#ifdef LQFS_TODO
	(void) qfs_flush(qfsvfsp->vfs_vfs);
#else
	(void) lqfs_flush(qfsvfsp);
	if (LQFS_GET_LOGP(qfsvfsp)) {
		logmap_start_roll(LQFS_GET_LOGP(qfsvfsp));
	}
#endif /* LQFS_TODO */

	TRANS_MATA_UMOUNT(qfsvfsp);
	LQFS_SET_DOMATAMAP(qfsvfsp, 0);

	/*
	 * Free all of the incore structs
	 * Acquire the ufs_scan_lock before de-linking the mtm data
	 * structure so that we keep ufs_sync() and ufs_update() away
	 * when they execute the ufs_scan_inodes() run while we're in
	 * progress of enabling/disabling logging.
	 */
	mutex_enter(&qfs_scan_lock);
	(void) lqfs_unsnarf(qfsvfsp);
	mutex_exit(&qfs_scan_lock);

#ifdef LQFS_TODO_LOCKFS
	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
#else
	/* QFS doesn't do this yet. */
#endif /* LQFS_TODO_LOCKFS */
	vfs_setmntopt(qfsvfsp->vfs_vfs, MNTOPT_NOLOGGING, NULL, 0);
	vfs_unlock(qfsvfsp->vfs_vfs);

	LQFS_SET_FS_ROLLED(fs, FS_ALL_ROLLED);
	LQFS_SET_NOLOG_SI(qfsvfsp, 0);

	/*
	 * Free the log space and mark the superblock as FSACTIVE
	 */
	(void) lqfs_free(qfsvfsp);

#ifdef LUFS
	/*
	 * Allow the reclaim thread to continue.
	 */
	qfs_thread_continue(&qfsvfsp->vfs_reclaim);
#else
	/* QFS doesn't have a file reclaim thread. */
#endif /* LUFS */

#ifdef LQFS_TODO_LOCKFS
	/*
	 * Unlock the file system
	 */
	lf.lf_lock = LOCKFS_ULOCK;
	lf.lf_flags = 0;
	error = qfs_fiolfs(vp, &lf, 1);
	if (error) {
		flp->error = FIOLOG_ENOULOCK;
	}
#else
	/* QFS doesn't really support LOCKFS. */
#endif /* LQFS_LOCKFS */

	error = 0;
	goto out;

errout:
#ifdef LQFS_LOCKFS
	lf.lf_lock = LOCKFS_ULOCK;
	lf.lf_flags = 0;
	(void) qfs_fiolfs(vp, &lf, 1);
#else
	/* QFS doesn't really support LOCKFS. */
#endif /* LQFS_LOCKFS */

out:
	mutex_enter(&ip->mp->ms.m_waitwr_mutex);
	ip->mp->mt.fi_status |= FS_LOGSTATE_KNOWN;
	mutex_exit(&ip->mp->ms.m_waitwr_mutex);
	return (error);
}
Example #17
/*
 * Resizes the aobj associated with the regular file pointed to by vp to
 * the size newsize.  'vp' must point to a vnode that represents a regular
 * file.  'newsize' must be positive.
 *
 * Pass trivial as 1 when the buffer content will be overwritten; otherwise
 * pass 0 so that the new space is zero filled.
 *
 * Returns zero on success or an appropriate error code on failure.
 *
 * Caller must hold the node exclusively locked.
 */
int
tmpfs_reg_resize(struct vnode *vp, off_t newsize, int trivial)
{
	int error;
	vm_pindex_t newpages, oldpages;
	struct tmpfs_mount *tmp;
	struct tmpfs_node *node;
	off_t oldsize;

#ifdef INVARIANTS
	KKASSERT(vp->v_type == VREG);
	KKASSERT(newsize >= 0);
#endif

	node = VP_TO_TMPFS_NODE(vp);
	tmp = VFS_TO_TMPFS(vp->v_mount);

	/*
	 * Convert the old and new sizes to the number of pages needed to
	 * store them.  It may happen that we do not need to do anything
	 * because the last allocated page can accommodate the change on
	 * its own.
	 */
	oldsize = node->tn_size;
	oldpages = round_page64(oldsize) / PAGE_SIZE;
	KKASSERT(oldpages == node->tn_reg.tn_aobj_pages);
	newpages = round_page64(newsize) / PAGE_SIZE;

	if (newpages > oldpages &&
	   tmp->tm_pages_used + newpages - oldpages > tmp->tm_pages_max) {
		error = ENOSPC;
		goto out;
	}
	node->tn_reg.tn_aobj_pages = newpages;
	node->tn_size = newsize;

	if (newpages != oldpages)
		atomic_add_long(&tmp->tm_pages_used, (newpages - oldpages));

	/*
	 * When adjusting the vnode filesize and its VM object we must
	 * also adjust our backing VM object (aobj).  The blocksize
	 * used must match the block size we use for the buffer cache.
	 *
	 * The backing VM object may contain VM pages as well as swap
	 * assignments if we previously renamed main object pages into
	 * it during deactivation.
	 */
	if (newsize < oldsize) {
		vm_pindex_t osize;
		vm_pindex_t nsize;
		vm_object_t aobj;

		error = nvtruncbuf(vp, newsize, TMPFS_BLKSIZE, -1, 0);
		aobj = node->tn_reg.tn_aobj;
		if (aobj) {
			osize = aobj->size;
			nsize = vp->v_object->size;
			if (nsize < osize) {
				aobj->size = osize;
				swap_pager_freespace(aobj, nsize,
						     osize - nsize);
				vm_object_page_remove(aobj, nsize, osize,
						      FALSE);
			}
		}
	} else {
		vm_object_t aobj;

		error = nvextendbuf(vp, oldsize, newsize,
				    TMPFS_BLKSIZE, TMPFS_BLKSIZE,
				    -1, -1, trivial);
		aobj = node->tn_reg.tn_aobj;
		if (aobj)
			aobj->size = vp->v_object->size;
	}

out:
	return error;
}
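The accounting above boils down to: convert the old and new sizes to page counts, refuse growth that would exceed tm_pages_max, and apply only the signed page delta to the shared tm_pages_used counter. A minimal sketch of that delta accounting with C11 atomics and an assumed 4 KiB page; account_resize(), pages_used and pages_max are hypothetical names.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_PAGE_SIZE	4096UL

static atomic_long pages_used;		/* cf. tmp->tm_pages_used */
static long pages_max = 262144;		/* cf. tmp->tm_pages_max (1 GiB here) */

static long
bytes_to_pages(uint64_t nbytes)
{
	return (long)((nbytes + SKETCH_PAGE_SIZE - 1) / SKETCH_PAGE_SIZE);
}

/* Returns false (think ENOSPC) if growth would exceed the mount-wide limit. */
static bool
account_resize(uint64_t oldsize, uint64_t newsize)
{
	long oldpages = bytes_to_pages(oldsize);
	long newpages = bytes_to_pages(newsize);

	if (newpages > oldpages &&
	    atomic_load(&pages_used) + newpages - oldpages > pages_max)
		return (false);
	if (newpages != oldpages)
		atomic_fetch_add(&pages_used, newpages - oldpages);
	return (true);
}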
Example #18
/*
 * This does the real work of segkp allocation.
 * Returns the base addr to the client. len must be page-aligned. A null value is
 * returned if there are no more vm resources (e.g. pages, swap). The len
 * and base recorded in the private data structure include the redzone
 * and the redzone length (if applicable). If the user requests a redzone
 * either the first or last page is left unmapped depending on whether
 * stacks grow toward low or high memory.
 *
 * The client may also specify a no-wait flag. If that is set then the
 * request will choose a non-blocking path when requesting resources.
 * The default is make the client wait.
 */
static caddr_t
segkp_get_internal(
	struct seg *seg,
	size_t len,
	uint_t flags,
	struct segkp_data **tkpd,
	struct anon_map *amp)
{
	struct segkp_segdata	*kpsd = (struct segkp_segdata *)seg->s_data;
	struct segkp_data	*kpd;
	caddr_t vbase = NULL;	/* always first virtual, may not be mapped */
	pgcnt_t np = 0;		/* number of pages in the resource */
	pgcnt_t segkpindex;
	long i;
	caddr_t va;
	pgcnt_t pages = 0;
	ulong_t anon_idx = 0;
	int kmflag = (flags & KPD_NOWAIT) ? KM_NOSLEEP : KM_SLEEP;
	caddr_t s_base = (segkp_fromheap) ? kvseg.s_base : seg->s_base;

	if (len & PAGEOFFSET) {
		panic("segkp_get: len is not page-aligned");
		/*NOTREACHED*/
	}

	ASSERT(((flags & KPD_HASAMP) == 0) == (amp == NULL));

	/* Only allow KPD_NO_ANON if we are going to lock it down */
	if ((flags & (KPD_LOCKED|KPD_NO_ANON)) == KPD_NO_ANON)
		return (NULL);

	if ((kpd = kmem_zalloc(sizeof (struct segkp_data), kmflag)) == NULL)
		return (NULL);
	/*
	 * Fix up the len to reflect the REDZONE if applicable
	 */
	if (flags & KPD_HASREDZONE)
		len += PAGESIZE;
	np = btop(len);

	vbase = vmem_alloc(SEGKP_VMEM(seg), len, kmflag | VM_BESTFIT);
	if (vbase == NULL) {
		kmem_free(kpd, sizeof (struct segkp_data));
		return (NULL);
	}

	/* If locking, reserve physical memory */
	if (flags & KPD_LOCKED) {
		pages = btop(SEGKP_MAPLEN(len, flags));
		if (page_resv(pages, kmflag) == 0) {
			vmem_free(SEGKP_VMEM(seg), vbase, len);
			kmem_free(kpd, sizeof (struct segkp_data));
			return (NULL);
		}
		if ((flags & KPD_NO_ANON) == 0)
			atomic_add_long(&anon_segkp_pages_locked, pages);
	}

	/*
	 * Reserve sufficient swap space for this vm resource.  We'll
	 * actually allocate it in the loop below, but reserving it
	 * here allows us to back out more gracefully than if we
	 * had an allocation failure in the body of the loop.
	 *
	 * Note that we don't need swap space for the red zone page.
	 */
	if (amp != NULL) {
		/*
		 * The swap reservation has been done, if required, and the
		 * anon_hdr is separate.
		 */
		anon_idx = 0;
		kpd->kp_anon_idx = anon_idx;
		kpd->kp_anon = amp->ahp;

		TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
		    kpd, vbase, len, flags, 1);

	} else if ((flags & KPD_NO_ANON) == 0) {
		if (anon_resv_zone(SEGKP_MAPLEN(len, flags), NULL) == 0) {
			if (flags & KPD_LOCKED) {
				atomic_add_long(&anon_segkp_pages_locked,
				    -pages);
				page_unresv(pages);
			}
			vmem_free(SEGKP_VMEM(seg), vbase, len);
			kmem_free(kpd, sizeof (struct segkp_data));
			return (NULL);
		}
		atomic_add_long(&anon_segkp_pages_resv,
		    btop(SEGKP_MAPLEN(len, flags)));
		anon_idx = ((uintptr_t)(vbase - s_base)) >> PAGESHIFT;
		kpd->kp_anon_idx = anon_idx;
		kpd->kp_anon = kpsd->kpsd_anon;

		TRACE_5(TR_FAC_VM, TR_ANON_SEGKP, "anon segkp:%p %p %lu %u %u",
		    kpd, vbase, len, flags, 1);
	} else {
Example #19
/*
 * The passed-in chain must be locked and the returned inode will also be
 * locked.  This routine typically locates or allocates the inode, assigns
 * ip->chain (adding a ref to chain if necessary), and returns the inode.
 *
 * The hammer2_inode structure regulates the interface between the high level
 * kernel VNOPS API and the filesystem backend (the chains).
 *
 * WARNING!  This routine sucks up the chain's lock (makes it part of the
 *	     inode lock from the point of view of the inode lock API),
 *	     so callers need to be careful.
 *
 * WARNING!  The mount code is allowed to pass dip == NULL for iroot and
 *	     is allowed to pass pmp == NULL and dip == NULL for sroot.
 */
hammer2_inode_t *
hammer2_inode_get(hammer2_pfsmount_t *pmp, hammer2_inode_t *dip,
		  hammer2_chain_t *chain)
{
	hammer2_inode_t *nip;

	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INODE);

	/*
	 * Interlocked lookup/ref of the inode.  This code is only needed
	 * when looking up inodes with nlinks != 0 (TODO: optimize out
	 * otherwise and test for duplicates).
	 */
again:
	for (;;) {
		nip = hammer2_inode_lookup(pmp, chain->data->ipdata.inum);
		if (nip == NULL)
			break;
		ccms_thread_lock(&nip->topo_cst, CCMS_STATE_EXCLUSIVE);
		if ((nip->flags & HAMMER2_INODE_ONRBTREE) == 0) { /* race */
			ccms_thread_unlock(&nip->topo_cst);
			hammer2_inode_drop(nip);
			continue;
		}
		if (nip->chain != chain)
			hammer2_inode_repoint(nip, NULL, chain);

		/*
		 * Consolidated nip/nip->chain is locked (chain locked
		 * by caller).
		 */
		return nip;
	}

	/*
	 * We couldn't find the inode number, create a new inode.
	 */
	if (pmp) {
		nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO);
		atomic_add_long(&pmp->inmem_inodes, 1);
		hammer2_chain_memory_inc(pmp);
		hammer2_chain_memory_wakeup(pmp);
	} else {
		nip = kmalloc(sizeof(*nip), M_HAMMER2, M_WAITOK | M_ZERO);
		nip->flags = HAMMER2_INODE_SROOT;
	}
	nip->inum = chain->data->ipdata.inum;
	nip->size = chain->data->ipdata.size;
	nip->mtime = chain->data->ipdata.mtime;
	hammer2_inode_repoint(nip, NULL, chain);
	nip->pip = dip;				/* can be NULL */
	if (dip)
		hammer2_inode_ref(dip);	/* ref dip for nip->pip */

	nip->pmp = pmp;

	/*
	 * ref and lock on nip gives it state compatible to after a
	 * hammer2_inode_lock_ex() call.
	 */
	nip->refs = 1;
	ccms_cst_init(&nip->topo_cst, &nip->chain);
	ccms_thread_lock(&nip->topo_cst, CCMS_STATE_EXCLUSIVE);
	/* combination of thread lock and chain lock == inode lock */

	/*
	 * Attempt to add the inode.  If it fails we raced another inode
	 * get.  Undo all the work and try again.
	 */
	if (pmp) {
		spin_lock(&pmp->inum_spin);
		if (RB_INSERT(hammer2_inode_tree, &pmp->inum_tree, nip)) {
			spin_unlock(&pmp->inum_spin);
			ccms_thread_unlock(&nip->topo_cst);
			hammer2_inode_drop(nip);
			goto again;
		}
		atomic_set_int(&nip->flags, HAMMER2_INODE_ONRBTREE);
		spin_unlock(&pmp->inum_spin);
	}

	return (nip);
}
Example #20
static vnode_t *
make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
    struct vnodeops *vops,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
    int *newnode, cred_t *cr)
{
	rnode4_t *rp;
	rnode4_t *trp;
	vnode_t *vp;
	mntinfo4_t *mi;

	ASSERT(RW_READ_HELD(&rhtp->r_lock));

	mi = VFTOMI4(vfsp);

start:
	if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV4(rp);
		*newnode = 0;
		return (vp);
	}
	rw_exit(&rhtp->r_lock);

	mutex_enter(&rp4freelist_lock);

	if (rp4freelist != NULL && rnode4_new >= nrnode) {
		rp = rp4freelist;
		rp4_rmfree(rp);
		mutex_exit(&rp4freelist_lock);

		vp = RTOV4(rp);

		if (rp->r_flags & R4HASHED) {
			rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
			mutex_enter(&vp->v_lock);
			if (vp->v_count > 1) {
				vp->v_count--;
				mutex_exit(&vp->v_lock);
				rw_exit(&rp->r_hashq->r_lock);
				rw_enter(&rhtp->r_lock, RW_READER);
				goto start;
			}
			mutex_exit(&vp->v_lock);
			rp4_rmhash_locked(rp);
			rw_exit(&rp->r_hashq->r_lock);
		}

		r4inactive(rp, cr);

		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_enter(&rhtp->r_lock, RW_READER);
			goto start;
		}
		mutex_exit(&vp->v_lock);
		vn_invalid(vp);

		/*
		 * destroy old locks before bzero'ing and
		 * recreating the locks below.
		 */
		uninit_rnode4(rp);

		/*
		 * Make sure that if rnode is recycled then
		 * VFS count is decremented properly before
		 * reuse.
		 */
		VFS_RELE(vp->v_vfsp);
		vn_reinit(vp);
	} else {
		vnode_t *new_vp;

		mutex_exit(&rp4freelist_lock);

		rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
		new_vp = vn_alloc(KM_SLEEP);

		atomic_add_long((ulong_t *)&rnode4_new, 1);
#ifdef DEBUG
		clstat4_debug.nrnode.value.ui64++;
#endif
		vp = new_vp;
	}

	bzero(rp, sizeof (*rp));
	rp->r_vnode = vp;
	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
	nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
	rp->created_v4 = 0;
	list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
	    offsetof(nfs4_open_stream_t, os_node));
	rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
	rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
	cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
	rp->r_flags = R4READDIRWATTR;
	rp->r_fh = fh;
	rp->r_hashq = rhtp;
	sfh4_hold(rp->r_fh);
	rp->r_server = mi->mi_curr_serv;
	rp->r_deleg_type = OPEN_DELEGATE_NONE;
	rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
	nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);

	rddir4_cache_create(rp);
	rp->r_putapage = putapage;
	vn_setops(vp, vops);
	vp->v_data = (caddr_t)rp;
	vp->v_vfsp = vfsp;
	VFS_HOLD(vfsp);
	vp->v_type = VNON;
	if (isrootfh(fh, rp))
		vp->v_flag = VROOT;
	vn_exists(vp);

	/*
	 * There is a race condition if someone else
	 * alloc's the rnode while no locks are held, so we
	 * check again and recover if found.
	 */
	rw_enter(&rhtp->r_lock, RW_WRITER);
	if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
		vp = RTOV4(trp);
		*newnode = 0;
		rw_exit(&rhtp->r_lock);
		rp4_addfree(rp, cr);
		rw_enter(&rhtp->r_lock, RW_READER);
		return (vp);
	}
	rp4_addhash(rp);
	*newnode = 1;
	return (vp);
}
Example #21
/*
 * Destroys the node pointed to by node from the file system 'tmp'.
 * If the node does not belong to the given mount point, the results are
 * unpredictable.
 *
 * If the node references a directory, no entries are allowed because
 * their removal could need a recursive algorithm, something forbidden in
 * kernel space.  Furthermore, there is no need to provide such
 * functionality (recursive removal) because the only primitives offered
 * to the user are the removal of empty directories and the deletion of
 * individual files.
 *
 * Note that nodes are not really deleted; in fact, when a node has been
 * allocated, it cannot be deleted during the whole life of the file
 * system.  Instead, they are moved to the available list and remain there
 * until reused.
 *
 * A caller must have TMPFS_NODE_LOCK(node) and this function unlocks it.
 */
void
tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node)
{
	vm_pindex_t pages = 0;

#ifdef INVARIANTS
	TMPFS_ASSERT_ELOCKED(node);
	KKASSERT(node->tn_vnode == NULL);
	KKASSERT((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0);
#endif

	TMPFS_LOCK(tmp);
	LIST_REMOVE(node, tn_entries);
	tmp->tm_nodes_inuse--;
	TMPFS_UNLOCK(tmp);
	TMPFS_NODE_UNLOCK(node);  /* Caller has this lock */

	switch (node->tn_type) {
	case VNON:
		/* Do not do anything.  VNON is provided to let the
		 * allocation routine clean itself easily by avoiding
		 * duplicating code in it. */
		/* FALLTHROUGH */
	case VBLK:
		/* FALLTHROUGH */
	case VCHR:
		/* FALLTHROUGH */
		break;
	case VDIR:
		/*
		 * The parent link can be NULL if this is the root
		 * node or if it is a directory node that was rmdir'd.
		 *
		 * XXX what if node is a directory which still contains
		 * directory entries (e.g. due to a forced umount) ?
		 */
		node->tn_size = 0;
		KKASSERT(node->tn_dir.tn_parent == NULL);

		/*
		 * If the root node is being destroyed don't leave a
		 * dangling pointer in tmpfs_mount.
		 */
		if (node == tmp->tm_root)
			tmp->tm_root = NULL;
		break;
	case VFIFO:
		/* FALLTHROUGH */
	case VSOCK:
		break;

	case VLNK:
		kfree(node->tn_link, tmp->tm_name_zone);
		node->tn_link = NULL;
		node->tn_size = 0;
		break;

	case VREG:
		if (node->tn_reg.tn_aobj != NULL)
			vm_object_deallocate(node->tn_reg.tn_aobj);
		node->tn_reg.tn_aobj = NULL;
		pages = node->tn_reg.tn_aobj_pages;
		break;

	default:
		panic("tmpfs_free_node: type %p %d", node, (int)node->tn_type);
	}

	/*
	 * Clean up fields for the next allocation.  The objcache only ctors
	 * new allocations.
	 */
	tmpfs_node_ctor(node, NULL, 0);
	objcache_put(tmp->tm_node_pool, node);
	/* node is now invalid */

	if (pages)
		atomic_add_long(&tmp->tm_pages_used, -(long)pages);
}
Example #22
void
balloon_drv_subtracted(int64_t delta)
{
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta);
}
Example #23
/*
 * XXX this API needs a rewrite.  It needs to be split into a
 * hammer2_inode_alloc() and hammer2_inode_build() to allow us to get
 * rid of the inode/chain lock reversal fudge.
 *
 * Returns the inode associated with the passed-in cluster, allocating a new
 * hammer2_inode structure if necessary, then synchronizing it to the passed
 * xop cluster.  When synchronizing, if idx >= 0, only cluster index (idx)
 * is synchronized.  Otherwise the whole cluster is synchronized.  inum will
 * be extracted from the passed-in xop and the inum argument will be ignored.
 *
 * If xop is passed as NULL then a new hammer2_inode is allocated with the
 * specified inum, and returned.   For normal inodes, the inode will be
 * indexed in memory and if it already exists the existing ip will be
 * returned instead of allocating a new one.  The superroot and PFS inodes
 * are not indexed in memory.
 *
 * The passed-in cluster must be locked and will remain locked on return.
 * The returned inode will be locked and the caller may dispose of both
 * via hammer2_inode_unlock() + hammer2_inode_drop().  However, if the caller
 * needs to resolve a hardlink it must ref/unlock/relock/drop the inode.
 *
 * The hammer2_inode structure regulates the interface between the high level
 * kernel VNOPS API and the filesystem backend (the chains).
 *
 * On return the inode is locked with the supplied cluster.
 */
hammer2_inode_t *
hammer2_inode_get(hammer2_pfs_t *pmp, hammer2_xop_head_t *xop,
		  hammer2_tid_t inum, int idx)
{
	hammer2_inode_t *nip;
	const hammer2_inode_data_t *iptmp;
	const hammer2_inode_data_t *nipdata;

	KKASSERT(xop == NULL ||
		 hammer2_cluster_type(&xop->cluster) ==
		 HAMMER2_BREF_TYPE_INODE);
	KKASSERT(pmp);

	/*
	 * Interlocked lookup/ref of the inode.  This code is only needed
	 * when looking up inodes with nlinks != 0 (TODO: optimize out
	 * otherwise and test for duplicates).
	 *
	 * Cluster can be NULL during the initial pfs allocation.
	 */
	if (xop) {
		iptmp = &hammer2_xop_gdata(xop)->ipdata;
		inum = iptmp->meta.inum;
		hammer2_xop_pdata(xop);
	}
again:
	nip = hammer2_inode_lookup(pmp, inum);
	if (nip) {
		/*
		 * We may have to unhold the cluster to avoid a deadlock
		 * against vnlru (and possibly other XOPs).
		 */
		if (xop) {
			if (hammer2_mtx_ex_try(&nip->lock) != 0) {
				hammer2_cluster_unhold(&xop->cluster);
				hammer2_mtx_ex(&nip->lock);
				hammer2_cluster_rehold(&xop->cluster);
			}
		} else {
			hammer2_mtx_ex(&nip->lock);
		}

		/*
		 * Handle SMP race (not applicable to the super-root spmp
		 * which can't index inodes due to duplicative inode numbers).
		 */
		if (pmp->spmp_hmp == NULL &&
		    (nip->flags & HAMMER2_INODE_ONRBTREE) == 0) {
			hammer2_mtx_unlock(&nip->lock);
			hammer2_inode_drop(nip);
			goto again;
		}
		if (xop) {
			if (idx >= 0)
				hammer2_inode_repoint_one(nip, &xop->cluster,
							  idx);
			else
				hammer2_inode_repoint(nip, NULL, &xop->cluster);
		}
		return nip;
	}

	/*
	 * We couldn't find the inode number, create a new inode and try to
	 * insert it, handle insertion races.
	 */
	nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO);
	spin_init(&nip->cluster_spin, "h2clspin");
	atomic_add_long(&pmp->inmem_inodes, 1);
	if (pmp->spmp_hmp)
		nip->flags = HAMMER2_INODE_SROOT;

	/*
	 * Initialize nip's cluster.  A cluster is provided for normal
	 * inodes but typically not for the super-root or PFS inodes.
	 */
	nip->cluster.refs = 1;
	nip->cluster.pmp = pmp;
	nip->cluster.flags |= HAMMER2_CLUSTER_INODE;
	if (xop) {
		nipdata = &hammer2_xop_gdata(xop)->ipdata;
		nip->meta = nipdata->meta;
		hammer2_xop_pdata(xop);
		atomic_set_int(&nip->flags, HAMMER2_INODE_METAGOOD);
		hammer2_inode_repoint(nip, NULL, &xop->cluster);
	} else {
		nip->meta.inum = inum;		/* PFS inum is always 1 XXX */
		/* mtime will be updated when a cluster is available */
		atomic_set_int(&nip->flags, HAMMER2_INODE_METAGOOD);	/*XXX*/
	}

	nip->pmp = pmp;

	/*
	 * ref and lock on nip gives it state compatible to after a
	 * hammer2_inode_lock() call.
	 */
	nip->refs = 1;
	hammer2_mtx_init(&nip->lock, "h2inode");
	hammer2_mtx_init(&nip->truncate_lock, "h2trunc");
	hammer2_mtx_ex(&nip->lock);
	TAILQ_INIT(&nip->depend_static.sideq);
	/* combination of thread lock and chain lock == inode lock */

	/*
	 * Attempt to add the inode.  If it fails we raced another inode
	 * get.  Undo all the work and try again.
	 */
	if (pmp->spmp_hmp == NULL) {
		hammer2_spin_ex(&pmp->inum_spin);
		if (RB_INSERT(hammer2_inode_tree, &pmp->inum_tree, nip)) {
			hammer2_spin_unex(&pmp->inum_spin);
			hammer2_mtx_unlock(&nip->lock);
			hammer2_inode_drop(nip);
			goto again;
		}
		atomic_set_int(&nip->flags, HAMMER2_INODE_ONRBTREE);
		++pmp->inum_count;
		hammer2_spin_unex(&pmp->inum_spin);
	}
	return (nip);
}
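The shape of the code above is: allocate optimistically, try to insert under the index lock, and if another thread won the race undo everything and retry the lookup. A minimal sketch of that insert-or-retry pattern, assuming a pthread mutex and a trivial linked list instead of the RB-tree and spinlock; struct node, index_find() and node_get() are hypothetical names.

#include <pthread.h>
#include <stdlib.h>

struct node {
	unsigned long key;
	int refs;
	struct node *next;
};

static struct node *index_head;
static pthread_mutex_t index_lock = PTHREAD_MUTEX_INITIALIZER;

/* Caller must hold index_lock. */
static struct node *
index_find(unsigned long key)
{
	struct node *n;

	for (n = index_head; n != NULL; n = n->next)
		if (n->key == key)
			return (n);
	return (NULL);
}

/* Return an existing node for 'key', or insert a freshly built one. */
static struct node *
node_get(unsigned long key)
{
	struct node *n, *fresh;

again:
	pthread_mutex_lock(&index_lock);
	n = index_find(key);
	if (n != NULL) {
		n->refs++;			/* found: just gain a ref */
		pthread_mutex_unlock(&index_lock);
		return (n);
	}
	pthread_mutex_unlock(&index_lock);

	/* Build the new node without holding the index lock. */
	fresh = calloc(1, sizeof(*fresh));
	if (fresh == NULL)
		return (NULL);
	fresh->key = key;
	fresh->refs = 1;

	pthread_mutex_lock(&index_lock);
	if (index_find(key) != NULL) {
		/* Raced another insert: undo the work and retry. */
		pthread_mutex_unlock(&index_lock);
		free(fresh);
		goto again;
	}
	fresh->next = index_head;
	index_head = fresh;
	pthread_mutex_unlock(&index_lock);
	return (fresh);
}

As in hammer2_inode_get(), the expensive construction happens without the index lock held, which is why the loser of the race must be able to throw its work away and try again.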
Example #24
/*
 * Drop an inode reference, freeing the inode when the last reference goes
 * away.
 */
void
hammer2_inode_drop(hammer2_inode_t *ip)
{
	hammer2_pfsmount_t *pmp;
	hammer2_inode_t *pip;
	u_int refs;

	while (ip) {
		refs = ip->refs;
		cpu_ccfence();
		if (refs == 1) {
			/*
			 * Transition to zero, must interlock with
			 * the inode inumber lookup tree (if applicable).
			 *
			 * NOTE: The super-root inode has no pmp.
			 */
			pmp = ip->pmp;
			if (pmp)
				spin_lock(&pmp->inum_spin);

			if (atomic_cmpset_int(&ip->refs, 1, 0)) {
				KKASSERT(ip->topo_cst.count == 0);
				if (ip->flags & HAMMER2_INODE_ONRBTREE) {
					atomic_clear_int(&ip->flags,
						     HAMMER2_INODE_ONRBTREE);
					RB_REMOVE(hammer2_inode_tree,
						  &pmp->inum_tree, ip);
				}
				if (pmp)
					spin_unlock(&pmp->inum_spin);

				pip = ip->pip;
				ip->pip = NULL;
				ip->pmp = NULL;

				/*
				 * Cleaning out ip->chain isn't entirely
				 * trivial.
				 */
				hammer2_inode_repoint(ip, NULL, NULL);

				/*
				 * We have to drop pip (if non-NULL) to
				 * dispose of our implied reference from
				 * ip->pip.  We can simply loop on it.
				 */
				if (pmp) {
					KKASSERT((ip->flags &
						  HAMMER2_INODE_SROOT) == 0);
					kfree(ip, pmp->minode);
					atomic_add_long(&pmp->inmem_inodes, -1);
				} else {
					KKASSERT(ip->flags &
						 HAMMER2_INODE_SROOT);
					kfree(ip, M_HAMMER2);
				}
				ip = pip;
				/* continue with pip (can be NULL) */
			} else {
				if (pmp)
					spin_unlock(&ip->pmp->inum_spin);
			}
		} else {
			/*
			 * Non zero transition
			 */
			if (atomic_cmpset_int(&ip->refs, refs, refs - 1))
				break;
		}
	}
}
Example #25
/*
 * balloon_replace_pages()
 *	Try to replace nextents blocks of 2^order pages.  addr_bits specifies
 *	how many bits of address the pages must be within (i.e. 16 would mean
 *	that the pages cannot have an address > 64k).  The constraints are on
 *	what the hypervisor gives us -- we are free to give any pages in
 *	exchange.  The array pp is the pages we are giving away.  The caller
 *	provides storage space for mfns, which hold the new physical pages.
 */
long
balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits,
    uint_t order, mfn_t *mfns)
{
	xen_memory_reservation_t memres;
	long fallback_cnt;
	long cnt;
	uint_t i, j, page_cnt, extlen;
	long e;
	int locked;


	/*
	 * we shouldn't be allocating constrained pages on a guest. It doesn't
	 * make any sense. They won't be constrained after a migration.
	 */
	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));

	extlen = 1 << order;
	page_cnt = nextents * extlen;
	/* Give back the current pages to the hypervisor */
	for (i = 0; i < page_cnt; i++) {
		cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum);
		if (cnt != 1) {
			cmn_err(CE_PANIC, "balloon: unable to give a page back "
			    "to the hypervisor.\n");
		}
	}

	/*
	 * try to allocate the new pages using addr_bits and order. If we can't
	 * get all of the pages, try to get the remaining pages with no
	 * constraints and, if that was successful, return the number of
	 * constrained pages we did allocate.
	 */
	bzero(&memres, sizeof (memres));
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(memres.extent_start, mfns);
	memres.domid = DOMID_SELF;
	memres.nr_extents = nextents;
	memres.mem_flags = XENMEMF_address_bits(addr_bits);
	memres.extent_order = order;
	cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
	/* assign the new MFNs to the current PFNs */
	locked = balloon_lock_contig_pfnlist(cnt * extlen);
	for (i = 0; i < cnt; i++) {
		for (j = 0; j < extlen; j++) {
			reassign_pfn(pp[i * extlen + j]->p_pagenum,
			    mfns[i] + j);
		}
	}
	if (locked)
		unlock_contig_pfnlist();
	if (cnt != nextents) {
		if (cnt < 0) {
			cnt = 0;
		}

		/*
		 * We couldn't get enough memory to satisfy our requirements.
		 * The above loop will assign the parts of the request that
		 * were successful (this part may be 0).  We need to fill
		 * in the rest.  The bzero below clears out extent_order and
		 * address_bits, so we'll take anything from the hypervisor
		 * to replace the pages we gave away.
		 */
		fallback_cnt = page_cnt - cnt * extlen;
		bzero(&memres, sizeof (memres));
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(memres.extent_start, mfns);
		memres.domid = DOMID_SELF;
		memres.nr_extents = fallback_cnt;
		e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
		if (e != fallback_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to recover from "
			    "failed increase_reservation.\n");
		}
		locked = balloon_lock_contig_pfnlist(fallback_cnt);
		for (i = 0; i < fallback_cnt; i++) {
			uint_t offset = page_cnt - fallback_cnt;

			/*
			 * We already used pp[0...(cnt * extlen)] before,
			 * so start at the next entry in the pp array.
			 */
			reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]);
		}
		if (locked)
			unlock_contig_pfnlist();
	}

	/*
	 * balloon_free_pages increments our counter.  Decrement it here.
	 */
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt);

	/*
	 * return the number of extents we were able to replace. If we got
	 * this far, we know all the pp's are valid.
	 */
	return (cnt);
}
Example #26
int
x86_emulate_cpuid(struct vm *vm, int vcpu_id,
                  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
    const struct xsave_limits *limits;
    uint64_t cr4;
    int error, enable_invpcid;
    unsigned int 	func, regs[4];
    enum x2apic_state x2apic_state;

    /*
     * Requests for invalid CPUID levels should map to the highest
     * available level instead.
     */
    if (cpu_exthigh != 0 && *eax >= 0x80000000) {
        if (*eax > cpu_exthigh)
            *eax = cpu_exthigh;
    } else if (*eax >= 0x40000000) {
        if (*eax > CPUID_VM_HIGH)
            *eax = CPUID_VM_HIGH;
    } else if (*eax > cpu_high) {
        *eax = cpu_high;
    }

    func = *eax;

    /*
     * In general the approach used for CPU topology is to
     * advertise a flat topology where all CPUs are packages with
     * no multi-core or SMT.
     */
    switch (func) {
    /*
     * Pass these through to the guest
     */
    case CPUID_0000_0000:
    case CPUID_0000_0002:
    case CPUID_0000_0003:
    case CPUID_8000_0000:
    case CPUID_8000_0002:
    case CPUID_8000_0003:
    case CPUID_8000_0004:
    case CPUID_8000_0006:
    case CPUID_8000_0008:
        cpuid_count(*eax, *ecx, regs);
        break;

    case CPUID_8000_0001:
        /*
         * Hide rdtscp/ia32_tsc_aux until we know how
         * to deal with them.
         */
        cpuid_count(*eax, *ecx, regs);
        regs[3] &= ~AMDID_RDTSCP;
        break;

    case CPUID_8000_0007:
        cpuid_count(*eax, *ecx, regs);
        /*
         * If the host TSCs are not synchronized across
         * physical cpus then we cannot advertise an
         * invariant tsc to a vcpu.
         *
         * XXX This still falls short because the vcpu
         * can observe the TSC moving backwards as it
         * migrates across physical cpus. But at least
         * it should discourage the guest from using the
         * TSC to keep track of time.
         */
        if (!smp_tsc)
            regs[3] &= ~AMDPM_TSC_INVARIANT;
        break;

    case CPUID_0000_0001:
        do_cpuid(1, regs);

        error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
        if (error) {
            panic("x86_emulate_cpuid: error %d "
                  "fetching x2apic state", error);
        }

        /*
         * Override the APIC ID only in ebx
         */
        regs[1] &= ~(CPUID_LOCAL_APIC_ID);
        regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);

        /*
         * Don't expose VMX, SpeedStep or TME capability.
         * Advertise x2APIC capability and Hypervisor guest.
         */
        regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);

        regs[2] |= CPUID2_HV;

        if (x2apic_state != X2APIC_DISABLED)
            regs[2] |= CPUID2_X2APIC;
        else
            regs[2] &= ~CPUID2_X2APIC;

        /*
         * Only advertise CPUID2_XSAVE in the guest if
         * the host is using XSAVE.
         */
        if (!(regs[2] & CPUID2_OSXSAVE))
            regs[2] &= ~CPUID2_XSAVE;

        /*
         * If CPUID2_XSAVE is being advertised and the
         * guest has set CR4_XSAVE, set
         * CPUID2_OSXSAVE.
         */
        regs[2] &= ~CPUID2_OSXSAVE;
        if (regs[2] & CPUID2_XSAVE) {
            error = vm_get_register(vm, vcpu_id,
                                    VM_REG_GUEST_CR4, &cr4);
            if (error)
                panic("x86_emulate_cpuid: error %d "
                      "fetching %%cr4", error);
            if (cr4 & CR4_XSAVE)
                regs[2] |= CPUID2_OSXSAVE;
        }

        /*
         * Hide monitor/mwait until we know how to deal with
         * these instructions.
         */
        regs[2] &= ~CPUID2_MON;

        /*
         * Hide the performance and debug features.
         */
        regs[2] &= ~CPUID2_PDCM;

        /*
         * No TSC deadline support in the APIC yet
         */
        regs[2] &= ~CPUID2_TSCDLT;

        /*
         * Hide thermal monitoring
         */
        regs[3] &= ~(CPUID_ACPI | CPUID_TM);

        /*
         * Machine check handling is done in the host.
         * Hide MTRR capability.
         */
        regs[3] &= ~(CPUID_MCA | CPUID_MCE | CPUID_MTRR);

        /*
         * Hide the debug store capability.
         */
        regs[3] &= ~CPUID_DS;

        /*
         * Disable multi-core.
         */
        regs[1] &= ~CPUID_HTT_CORES;
        regs[3] &= ~CPUID_HTT;
        break;

    case CPUID_0000_0004:
        do_cpuid(4, regs);

        /*
         * Do not expose topology.
         */
        regs[0] &= 0xffff8000;
        regs[0] |= 0x04008000;
        break;

    case CPUID_0000_0007:
        regs[0] = 0;
        regs[1] = 0;
        regs[2] = 0;
        regs[3] = 0;

        /* leaf 0 */
        if (*ecx == 0) {
            error = vm_get_capability(vm, vcpu_id,
                                      VM_CAP_ENABLE_INVPCID, &enable_invpcid);
            if (error == 0 && enable_invpcid)
                regs[1] |= CPUID_STDEXT_INVPCID;
        }
        break;

    case CPUID_0000_0006:
    case CPUID_0000_000A:
        /*
         * Handle the access, but report 0 for
         * all options
         */
        regs[0] = 0;
        regs[1] = 0;
        regs[2] = 0;
        regs[3] = 0;
        break;

    case CPUID_0000_000B:
        /*
         * Processor topology enumeration
         */
        regs[0] = 0;
        regs[1] = 0;
        regs[2] = *ecx & 0xff;
        regs[3] = vcpu_id;
        break;

    case CPUID_0000_000D:
        limits = vmm_get_xsave_limits();
        if (!limits->xsave_enabled) {
            regs[0] = 0;
            regs[1] = 0;
            regs[2] = 0;
            regs[3] = 0;
            break;
        }

        cpuid_count(*eax, *ecx, regs);
        switch (*ecx) {
        case 0:
            /*
             * Only permit the guest to use bits
             * that are active in the host in
             * %xcr0.  Also, claim that the
             * maximum save area size is
             * equivalent to the host's current
             * save area size.  Since this runs
             * "inside" of vmrun(), it runs with
             * the guest's xcr0, so the current
             * save area size is correct as-is.
             */
            regs[0] &= limits->xcr0_allowed;
            regs[2] = limits->xsave_max_size;
            regs[3] &= (limits->xcr0_allowed >> 32);
            break;
        case 1:
            /* Only permit XSAVEOPT. */
            regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
            regs[1] = 0;
            regs[2] = 0;
            regs[3] = 0;
            break;
        default:
            /*
             * If the leaf is for a permitted feature,
             * pass through as-is, otherwise return
             * all zeroes.
             */
            if (!(limits->xcr0_allowed & (1ul << *ecx))) {
                regs[0] = 0;
                regs[1] = 0;
                regs[2] = 0;
                regs[3] = 0;
            }
            break;
        }
        break;

    case 0x40000000:
        regs[0] = CPUID_VM_HIGH;
        bcopy(bhyve_id, &regs[1], 4);
        bcopy(bhyve_id + 4, &regs[2], 4);
        bcopy(bhyve_id + 8, &regs[3], 4);
        break;

    default:
        /*
         * The leaf value has already been clamped so
         * simply pass this through, keeping count of
         * how many unhandled leaf values have been seen.
         */
        atomic_add_long(&bhyve_xcpuids, 1);
        cpuid_count(*eax, *ecx, regs);
        break;
    }

    *eax = regs[0];
    *ebx = regs[1];
    *ecx = regs[2];
    *edx = regs[3];

    return (1);
}
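
The clamping logic at the top of this example treats the three CPUID ranges (standard, hypervisor 0x40000000+, extended 0x80000000+) independently and saturates each at its own maximum leaf. Below is a minimal standalone sketch of that rule, assuming the same cpu_high/cpu_exthigh globals and the CPUID_VM_HIGH constant are visible; the helper name cpuid_clamp_leaf is made up for illustration.

/*
 * Clamp a requested CPUID leaf to the highest available leaf in its
 * range: extended (0x80000000+), hypervisor (0x40000000+) or standard.
 * Mirrors the checks at the top of x86_emulate_cpuid() above.
 */
static uint32_t
cpuid_clamp_leaf(uint32_t leaf)
{
	if (cpu_exthigh != 0 && leaf >= 0x80000000) {
		if (leaf > cpu_exthigh)
			leaf = cpu_exthigh;
	} else if (leaf >= 0x40000000) {
		if (leaf > CPUID_VM_HIGH)
			leaf = CPUID_VM_HIGH;
	} else if (leaf > cpu_high) {
		leaf = cpu_high;
	}
	return (leaf);
}

With cpu_high = 0xd, for example, a request for leaf 0x16 would be clamped to 0xd before the switch dispatches on it.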
Beispiel #27
0
int
px_fdvma_reserve(dev_info_t *dip, dev_info_t *rdip, px_t *px_p,
	ddi_dma_req_t *dmareq, ddi_dma_handle_t *handlep)
{
	fdvma_t *fdvma_p;
	px_dvma_addr_t dvma_pg;
	px_mmu_t *mmu_p = px_p->px_mmu_p;
	size_t npages;
	ddi_dma_impl_t *mp;
	ddi_dma_lim_t *lim_p = dmareq->dmar_limits;
	ulong_t hi = lim_p->dlim_addr_hi;
	ulong_t lo = lim_p->dlim_addr_lo;
	size_t counter_max = (lim_p->dlim_cntr_max + 1) & MMU_PAGE_MASK;

	if (px_disable_fdvma)
		return (DDI_FAILURE);

	DBG(DBG_DMA_CTL, dip, "DDI_DMA_RESERVE: rdip=%s%d\n",
	    ddi_driver_name(rdip), ddi_get_instance(rdip));

	/*
	 * Check the limit structure.
	 */
	if ((lo >= hi) || (hi < mmu_p->mmu_dvma_base))
		return (DDI_DMA_BADLIMITS);

	/*
	 * Allocate DVMA space from reserve.
	 */
	npages = dmareq->dmar_object.dmao_size;
	if ((long)atomic_add_long_nv(&mmu_p->mmu_dvma_reserve, -npages) < 0) {
		atomic_add_long(&mmu_p->mmu_dvma_reserve, npages);
		return (DDI_DMA_NORESOURCES);
	}

	/*
	 * Allocate the dma handle.
	 */
	mp = kmem_zalloc(sizeof (px_dma_hdl_t), KM_SLEEP);

	/*
	 * Get entries from dvma space map.
	 * (vmem_t *vmp,
	 *	size_t size, size_t align, size_t phase,
	 *	size_t nocross, void *minaddr, void *maxaddr, int vmflag)
	 */
	dvma_pg = MMU_BTOP((ulong_t)vmem_xalloc(mmu_p->mmu_dvma_map,
	    MMU_PTOB(npages), MMU_PAGE_SIZE, 0,
	    counter_max, (void *)lo, (void *)(hi + 1),
	    dmareq->dmar_fp == DDI_DMA_SLEEP ? VM_SLEEP : VM_NOSLEEP));
	if (dvma_pg == 0) {
		atomic_add_long(&mmu_p->mmu_dvma_reserve, npages);
		kmem_free(mp, sizeof (px_dma_hdl_t));
		return (DDI_DMA_NOMAPPING);
	}

	/*
	 * Create the fast dvma request structure.
	 */
	fdvma_p = kmem_alloc(sizeof (fdvma_t), KM_SLEEP);
	fdvma_p->pagecnt = kmem_alloc(npages * sizeof (uint_t), KM_SLEEP);
	fdvma_p->ops = &fdvma_ops;
	fdvma_p->softsp = (caddr_t)px_p;
	fdvma_p->sync_flag = NULL;

	/*
	 * Initialize the handle.
	 */
	mp->dmai_rdip = rdip;
	mp->dmai_rflags = DMP_BYPASSNEXUS | DDI_DMA_READ | DMP_NOSYNC;
	mp->dmai_burstsizes = dmareq->dmar_limits->dlim_burstsizes;
	mp->dmai_mapping = MMU_PTOB(dvma_pg);
	mp->dmai_ndvmapages = npages;
	mp->dmai_size = npages * MMU_PAGE_SIZE;
	mp->dmai_nwin = 0;
	mp->dmai_fdvma = (caddr_t)fdvma_p;

	/*
	 * The bdf protection value is set to immediate child
	 * at first. It gets modified by switch/bridge drivers
	 * as the code traverses down the fabric topology.
	 *
	 * XXX No IOMMU protection for broken devices.
	 */
	ASSERT((intptr_t)ddi_get_parent_data(rdip) >> 1 == 0);
	mp->dmai_bdf = ((intptr_t)ddi_get_parent_data(rdip) == 1) ?
	    PCIE_INVALID_BDF : pcie_get_bdf_for_dma_xfer(dip, rdip);

	DBG(DBG_DMA_CTL, dip,
	    "DDI_DMA_RESERVE: mp=%p dvma=%x npages=%x private=%p\n",
	    mp, mp->dmai_mapping, npages, fdvma_p);
	*handlep = (ddi_dma_handle_t)mp;
	return (DDI_SUCCESS);
}
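
The DVMA reservation above uses a take-then-roll-back idiom: atomic_add_long_nv() subtracts the request from the shared counter and returns the new value, and the pages are added back if the counter went negative or a later allocation step fails (as the vmem_xalloc() failure path does before returning DDI_DMA_NOMAPPING). A minimal sketch of the idiom in isolation; the counter dvma_reserve and the helper try_reserve_pages() are made-up names.

#include <sys/atomic.h>

static ulong_t dvma_reserve = 1024;	/* pages held in reserve (example value) */

/*
 * Try to take npages from the shared reserve.  Returns 0 on success;
 * on failure the speculative subtraction is rolled back and -1 returned.
 */
static int
try_reserve_pages(size_t npages)
{
	if ((long)atomic_add_long_nv(&dvma_reserve, -(long)npages) < 0) {
		/* Roll the speculative subtraction back. */
		atomic_add_long(&dvma_reserve, (long)npages);
		return (-1);
	}
	return (0);
}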
Beispiel #28
0
/*
 * balloon_free_pages()
 *    free page_cnt pages, using any combination of mfns, pfns, and kva as long
 *    as they refer to the same mapping.  If an array of mfns is passed in, we
 *    assume they were already cleared.  Otherwise, we need to zero the pages
 *    before giving them back to the hypervisor. kva space is not free'd up in
 *    case the caller wants to re-use it.
 */
long
balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
{
	xen_memory_reservation_t memdec;
	mfn_t mfn;
	pfn_t pfn;
	uint_t i;
	long e;


#if DEBUG
	/* make sure kva is page aligned and maps to first pfn */
	if (kva != NULL) {
		ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
		if (pfns != NULL) {
			ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
		}
	}
#endif

	/* if we have a kva, we can clean all pages with just one bzero */
	if ((kva != NULL) && balloon_zero_memory) {
		bzero(kva, (page_cnt * PAGESIZE));
	}

	/* if we were given a kva and/or a pfn */
	if ((kva != NULL) || (pfns != NULL)) {

		/*
		 * All the current callers only pass 1 page when using kva or
		 * pfns, and use mfns when passing multiple pages.  If that
		 * assumption is changed, the following code will need some
		 * work.  The following ASSERT() guarantees we're respecting
		 * the io locking quota.
		 */
		ASSERT(page_cnt < bln_contig_list_quota);

		/* go through all the pages */
		for (i = 0; i < page_cnt; i++) {

			/* get the next pfn */
			if (pfns == NULL) {
				pfn = hat_getpfnum(kas.a_hat,
				    (kva + (PAGESIZE * i)));
			} else {
				pfn = pfns[i];
			}

			/*
			 * if we didn't already zero this page, do it now. we
			 * need to do this *before* we give back the MFN
			 */
			if ((kva == NULL) && (balloon_zero_memory)) {
				pfnzero(pfn, 0, PAGESIZE);
			}

			/*
			 * unmap the pfn. We don't free up the kva vmem space
			 * so the caller can re-use it. The page must be
			 * unmapped before it is given back to the hypervisor.
			 */
			if (kva != NULL) {
				hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
				    PAGESIZE, HAT_UNLOAD_UNMAP);
			}

			/* grab the mfn before the pfn is marked as invalid */
			mfn = pfn_to_mfn(pfn);

			/* mark the pfn as invalid */
			reassign_pfn(pfn, MFN_INVALID);

			/*
			 * if we weren't given an array of MFNs, we need to
			 * free them up one at a time. Otherwise, we'll wait
			 * until later and do it in one hypercall
			 */
			if (mfns == NULL) {
				bzero(&memdec, sizeof (memdec));
				/*LINTED: constant in conditional context*/
				set_xen_guest_handle(memdec.extent_start, &mfn);
				memdec.domid = DOMID_SELF;
				memdec.nr_extents = 1;
				e = HYPERVISOR_memory_op(
				    XENMEM_decrease_reservation, &memdec);
				if (e != 1) {
					cmn_err(CE_PANIC, "balloon: unable to "
					    "give a page back to the "
					    "hypervisor.\n");
				}
			}
		}
	}

	/*
	 * if we were passed in MFNs, we haven't free'd them up yet. We can
	 * do it with one call.
	 */
	if (mfns != NULL) {
		bzero(&memdec, sizeof (memdec));
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(memdec.extent_start, mfns);
		memdec.domid = DOMID_SELF;
		memdec.nr_extents = page_cnt;
		e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
		if (e != page_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to give pages back "
			    "to the hypervisor.\n");
		}
	}

	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
	return (page_cnt);
}
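
Both give-back paths in this example reduce to the same step: fill in a xen_memory_reservation_t and issue one XENMEM_decrease_reservation hypercall, then check that the hypervisor accepted every extent. Here is a minimal sketch of that step factored into a helper, assuming the same Xen headers the example already uses; give_mfns_back() is a made-up name.

/*
 * Hand cnt machine frames back to the hypervisor in a single
 * XENMEM_decrease_reservation hypercall.  Returns 0 on success, -1 if
 * the hypervisor accepted fewer extents than requested.
 */
static int
give_mfns_back(mfn_t *mfns, uint_t cnt)
{
	xen_memory_reservation_t memdec;
	long e;

	bzero(&memdec, sizeof (memdec));
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(memdec.extent_start, mfns);
	memdec.domid = DOMID_SELF;
	memdec.nr_extents = cnt;
	e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
	return (e == cnt ? 0 : -1);
}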
Beispiel #29
0
/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_GETPAGES.
 */
int
vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int bytecount,
    int reqpage, vop_getpages_iodone_t iodone, void *arg)
{
	vm_object_t object;
	struct bufobj *bo;
	struct buf *bp;
	daddr_t firstaddr, reqblock;
	off_t foff, pib;
	int pbefore, pafter, i, size, bsize, first, last, *freecnt;
	int count, error, before, after, secmask;

	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
	    ("vnode_pager_generic_getpages does not support devices"));
	if (vp->v_iflag & VI_DOOMED)
		return (VM_PAGER_BAD);

	object = vp->v_object;
	count = bytecount / PAGE_SIZE;
	bsize = vp->v_mount->mnt_stat.f_iosize;

	/*
	 * Synchronous and asynchronous paging operations use different
	 * free pbuf counters.  This is done to avoid asynchronous requests
	 * to consume all pbufs.
	 * Allocate the pbuf at the very beginning of the function, so that
	 * if we are low on certain kind of pbufs don't even proceed to BMAP,
	 * but sleep.
	 */
	freecnt = iodone != NULL ?
	    &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt;
	bp = getpbuf(freecnt);

	/*
	 * Get the underlying device blocks for the file with VOP_BMAP().
	 * If the file system doesn't support VOP_BMAP, use old way of
	 * getting pages via VOP_READ.
	 */
	error = VOP_BMAP(vp, IDX_TO_OFF(m[reqpage]->pindex) / bsize, &bo,
	    &reqblock, &after, &before);
	if (error == EOPNOTSUPP) {
		relpbuf(bp, freecnt);
		VM_OBJECT_WLOCK(object);
		for (i = 0; i < count; i++)
			if (i != reqpage) {
				vm_page_lock(m[i]);
				vm_page_free(m[i]);
				vm_page_unlock(m[i]);
			}
		PCPU_INC(cnt.v_vnodein);
		PCPU_INC(cnt.v_vnodepgsin);
		error = vnode_pager_input_old(object, m[reqpage]);
		VM_OBJECT_WUNLOCK(object);
		return (error);
	} else if (error != 0) {
		relpbuf(bp, freecnt);
		vm_pager_free_nonreq(object, m, reqpage, count, FALSE);
		return (VM_PAGER_ERROR);

		/*
		 * If the blocksize is smaller than a page size, then use
		 * special small filesystem code.
		 */
	} else if ((PAGE_SIZE / bsize) > 1) {
		relpbuf(bp, freecnt);
		vm_pager_free_nonreq(object, m, reqpage, count, FALSE);
		PCPU_INC(cnt.v_vnodein);
		PCPU_INC(cnt.v_vnodepgsin);
		return (vnode_pager_input_smlfs(object, m[reqpage]));
	}

	/*
	 * Since the caller has busied the requested page, that page's valid
	 * field will not be changed by other threads.
	 */
	vm_page_assert_xbusied(m[reqpage]);

	/*
	 * If we have a completely valid page available to us, we can
	 * clean up and return.  Otherwise we have to re-read the
	 * media.
	 */
	if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
		relpbuf(bp, freecnt);
		vm_pager_free_nonreq(object, m, reqpage, count, FALSE);
		return (VM_PAGER_OK);
	} else if (reqblock == -1) {
		relpbuf(bp, freecnt);
		pmap_zero_page(m[reqpage]);
		KASSERT(m[reqpage]->dirty == 0,
		    ("vnode_pager_generic_getpages: page %p is dirty", m));
		VM_OBJECT_WLOCK(object);
		m[reqpage]->valid = VM_PAGE_BITS_ALL;
		vm_pager_free_nonreq(object, m, reqpage, count, TRUE);
		VM_OBJECT_WUNLOCK(object);
		return (VM_PAGER_OK);
	} else if (m[reqpage]->valid != 0) {
		VM_OBJECT_WLOCK(object);
		m[reqpage]->valid = 0;
		VM_OBJECT_WUNLOCK(object);
	}

	pib = IDX_TO_OFF(m[reqpage]->pindex) % bsize;
	pbefore = ((daddr_t)before * bsize + pib) / PAGE_SIZE;
	pafter = ((daddr_t)(after + 1) * bsize - pib) / PAGE_SIZE - 1;
	first = reqpage < pbefore ? 0 : reqpage - pbefore;
	last = reqpage + pafter >= count ? count - 1 : reqpage + pafter;
	if (first > 0 || last + 1 < count) {
		VM_OBJECT_WLOCK(object);
		for (i = 0; i < first; i++) {
			vm_page_lock(m[i]);
			vm_page_free(m[i]);
			vm_page_unlock(m[i]);
		}
		for (i = last + 1; i < count; i++) {
			vm_page_lock(m[i]);
			vm_page_free(m[i]);
			vm_page_unlock(m[i]);
		}
		VM_OBJECT_WUNLOCK(object);
	}

	/*
	 * here on direct device I/O
	 */
	firstaddr = reqblock;
	firstaddr += pib / DEV_BSIZE;
	firstaddr -= IDX_TO_OFF(reqpage - first) / DEV_BSIZE;

	/*
	 * The first and last page have been calculated now, move
	 * input pages to be zero based, and adjust the count.
	 */
	m += first;
	reqpage -= first;
	count = last - first + 1;

	/*
	 * calculate the file virtual address for the transfer
	 */
	foff = IDX_TO_OFF(m[0]->pindex);

	/*
	 * calculate the size of the transfer
	 */
	size = count * PAGE_SIZE;
	KASSERT(count > 0, ("zero count"));
	if ((foff + size) > object->un_pager.vnp.vnp_size)
		size = object->un_pager.vnp.vnp_size - foff;
	KASSERT(size > 0, ("zero size"));

	/*
	 * round up physical size for real devices.
	 */
	secmask = bo->bo_bsize - 1;
	KASSERT(secmask < PAGE_SIZE && secmask > 0,
	    ("vnode_pager_generic_getpages: sector size %d too large",
	    secmask + 1));
	size = (size + secmask) & ~secmask;

	/*
	 * and map the pages to be read into the kva, if the filesystem
	 * requires mapped buffers.
	 */
	if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
	    unmapped_buf_allowed) {
		bp->b_data = unmapped_buf;
		bp->b_offset = 0;
	} else {
		bp->b_data = bp->b_kvabase;
		pmap_qenter((vm_offset_t)bp->b_data, m, count);
	}

	/* build a minimal buffer header */
	bp->b_iocmd = BIO_READ;
	KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
	KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
	bp->b_rcred = crhold(curthread->td_ucred);
	bp->b_wcred = crhold(curthread->td_ucred);
	bp->b_blkno = firstaddr;
	pbgetbo(bo, bp);
	bp->b_vp = vp;
	bp->b_bcount = size;
	bp->b_bufsize = size;
	bp->b_runningbufspace = bp->b_bufsize;
	for (i = 0; i < count; i++)
		bp->b_pages[i] = m[i];
	bp->b_npages = count;
	bp->b_pager.pg_reqpage = reqpage;
	atomic_add_long(&runningbufspace, bp->b_runningbufspace);

	PCPU_INC(cnt.v_vnodein);
	PCPU_ADD(cnt.v_vnodepgsin, count);

	/* do the input */
	bp->b_iooffset = dbtob(bp->b_blkno);

	if (iodone != NULL) { /* async */
		bp->b_pager.pg_iodone = iodone;
		bp->b_caller1 = arg;
		bp->b_iodone = vnode_pager_generic_getpages_done_async;
		bp->b_flags |= B_ASYNC;
		BUF_KERNPROC(bp);
		bstrategy(bp);
		/* Good bye! */
	} else {
		bp->b_iodone = bdone;
		bstrategy(bp);
		bwait(bp, PVM, "vnread");
		error = vnode_pager_generic_getpages_done(bp);
		for (i = 0; i < bp->b_npages; i++)
			bp->b_pages[i] = NULL;
		bp->b_vp = NULL;
		pbrelbo(bp);
		relpbuf(bp, &vnode_pbuf_freecnt);
	}

	return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
}
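
The transfer-size rounding near the end of this example is the usual power-of-two trick: with secmask = bo_bsize - 1, the expression (size + secmask) & ~secmask rounds size up to the next multiple of the sector size. A minimal standalone sketch of that rounding; roundup_to_sector() is a made-up name.

/*
 * Round len up to a multiple of the device sector size.  sector_size
 * must be a power of two, as device block sizes are.
 */
static inline size_t
roundup_to_sector(size_t len, size_t sector_size)
{
	size_t secmask = sector_size - 1;

	return ((len + secmask) & ~secmask);
}

For a 512-byte sector size, a 700-byte transfer rounds up to 1024 bytes, and a length that is already a multiple of the sector size is returned unchanged.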