Example #1
/*
 * Update cache contents upon write completion.
 */
void
vdev_cache_write(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, ve_search;
	uint64_t io_start = zio->io_offset;
	uint64_t io_end = io_start + zio->io_size;
	uint64_t min_offset = P2ALIGN(io_start, VCBS);
	uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
	avl_index_t where;

	ASSERT(zio->io_type == ZIO_TYPE_WRITE);

	mutex_enter(&vc->vc_lock);

	ve_search.ve_offset = min_offset;
	ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);

	if (ve == NULL)
		ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);

	while (ve != NULL && ve->ve_offset < max_offset) {
		uint64_t start = MAX(ve->ve_offset, io_start);
		uint64_t end = MIN(ve->ve_offset + VCBS, io_end);

		if (ve->ve_fill_io != NULL) {
			ve->ve_missed_update = 1;
		} else {
			bcopy((char *)zio->io_data + start - io_start,
			    ve->ve_data + start - ve->ve_offset, end - start);
		}
		ve = AVL_NEXT(&vc->vc_offset_tree, ve);
	}
	mutex_exit(&vc->vc_lock);
}
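
The window arithmetic above comes from the power-of-two helpers in sys/sysmacros.h: the write span [io_start, io_end) is widened to whole cache blocks before walking the offset tree. A minimal userland sketch of the same calculation, using an assumed 64 KB stand-in for VCBS and made-up offsets; the macro definitions follow the usual sysmacros.h forms.

#include <stdio.h>
#include <stdint.h>

#define	P2ALIGN(x, align)	((x) & -(align))	/* round down */
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))	/* round up */

int
main(void)
{
	uint64_t vcbs = 64 * 1024;		/* assumed cache block size */
	uint64_t io_start = 200 * 1024;		/* example write offset */
	uint64_t io_end = io_start + 24 * 1024;	/* example write end */

	/* expand the span outward to whole cache blocks */
	printf("min_offset = %llu\n",		/* 196608 */
	    (unsigned long long)P2ALIGN(io_start, vcbs));
	printf("max_offset = %llu\n",		/* 262144 */
	    (unsigned long long)P2ROUNDUP(io_end, vcbs));
	return (0);
}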
Example #2
static caddr_t
pci_cfgacc_map(paddr_t phys_addr)
{
#ifdef __xpv
	phys_addr = pfn_to_pa(xen_assign_pfn(mmu_btop(phys_addr))) |
	    (phys_addr & MMU_PAGEOFFSET);
#endif
	if (khat_running) {
		pfn_t pfn = mmu_btop(phys_addr);
		/*
		 * pci_cfgacc_virt_base may hold an address left over from
		 * early boot, which points to low memory.  Reallocate the
		 * virtual address in kernel space since it's already late
		 * in boot now.
		 * Note: no need to unmap first, clear_boot_mappings() will
		 * do that for us.
		 */
		if (pci_cfgacc_virt_base < (caddr_t)kernelbase)
			pci_cfgacc_virt_base = vmem_alloc(heap_arena,
			    MMU_PAGESIZE, VM_SLEEP);

		hat_devload(kas.a_hat, pci_cfgacc_virt_base,
		    MMU_PAGESIZE, pfn, PROT_READ | PROT_WRITE |
		    HAT_STRICTORDER, HAT_LOAD_LOCK);
	} else {
		paddr_t	pa_base = P2ALIGN(phys_addr, MMU_PAGESIZE);

		if (pci_cfgacc_virt_base == NULL)
			pci_cfgacc_virt_base =
			    (caddr_t)alloc_vaddr(MMU_PAGESIZE, MMU_PAGESIZE);

		kbm_map((uintptr_t)pci_cfgacc_virt_base, pa_base, 0, 0);
	}

	return (pci_cfgacc_virt_base + (phys_addr & MMU_PAGEOFFSET));
}
Example #3
/*
 * Allocate an entry in the cache.  At this point we don't have the data,
 * we're just creating a placeholder so that multiple threads don't all
 * go off and read the same blocks.
 */
static vdev_cache_entry_t *
vdev_cache_allocate(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
	vdev_cache_entry_t *ve;

	ASSERT(MUTEX_HELD(&vc->vc_lock));

	if (zfs_vdev_cache_size == 0)
		return (NULL);

	/*
	 * If adding a new entry would exceed the cache size,
	 * evict the oldest entry (LRU).
	 */
	if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
	    zfs_vdev_cache_size) {
		ve = avl_first(&vc->vc_lastused_tree);
		if (ve->ve_fill_io != NULL)
			return (NULL);
		ASSERT(ve->ve_hits != 0);
		vdev_cache_evict(vc, ve);
	}

	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
	ve->ve_offset = offset;
	ve->ve_lastused = lbolt;
	ve->ve_data = zio_buf_alloc(VCBS);

	avl_add(&vc->vc_offset_tree, ve);
	avl_add(&vc->vc_lastused_tree, ve);

	return (ve);
}
Example #4
/*
 * Read data from the cache.  Returns 0 on cache hit, errno on a miss.
 */
int
vdev_cache_read(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, *ve_search;
	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
	ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);)
Example #5
/*ARGSUSED*/
int
plat_get_mem_unum(int synd_code, uint64_t flt_addr, int flt_bus_id,
    int flt_in_memory, ushort_t flt_status, char *buf, int buflen, int *lenp)
{
	if (flt_in_memory && (p2get_mem_unum != NULL))
		return (p2get_mem_unum(synd_code, P2ALIGN(flt_addr, 8),
		    buf, buflen, lenp));
	else
		return (ENOTSUP);
}
Example #6
/*
 * Check whether any portion of the [start, end] segment is within the
 * [start_addr, end_addr] range.
 *
 * Return values:
 *   0 - the segment is entirely outside the range
 *   1 - the segment overlaps the range
 */
static int
address_in_range(uintptr_t start, uintptr_t end, size_t psz)
{
    int rc = 1;

    /*
     *  Nothing to do if there is no address range specified with -A
     */
    if (start_addr != INVALID_ADDRESS || end_addr != INVALID_ADDRESS) {
        /* The segment end is below the range start */
        if ((start_addr != INVALID_ADDRESS) &&
                (end < P2ALIGN(start_addr, psz)))
            rc = 0;

        /* The segment start is above the range end */
        if ((end_addr != INVALID_ADDRESS) &&
                (start > P2ALIGN(end_addr + psz, psz)))
            rc = 0;
    }
    return (rc);
}
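
The test above first snaps the -A bounds to page boundaries: P2ALIGN(start_addr, psz) drops start_addr to the start of its page, and P2ALIGN(end_addr + psz, psz) moves end_addr to the boundary just past its page, so a segment that touches any page of the range counts as overlapping. A small userland sketch with assumed 4 KB pages and made-up bounds (macro as in sys/sysmacros.h).

#include <stdio.h>
#include <stdint.h>

#define	P2ALIGN(x, align)	((x) & -(align))

int
main(void)
{
	uint64_t psz = 4096;			/* assumed page size */
	uint64_t start_addr = 0x12345;		/* example -A lower bound */
	uint64_t end_addr = 0x15678;		/* example -A upper bound */

	printf("range floor   = 0x%llx\n",	/* 0x12000 */
	    (unsigned long long)P2ALIGN(start_addr, psz));
	printf("range ceiling = 0x%llx\n",	/* 0x16000 */
	    (unsigned long long)P2ALIGN(end_addr + psz, psz));
	return (0);
}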
Example #7
File: zvol.c Project: Oliverlyn/zfs
static int
zvol_discard(struct bio *bio)
{
	zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
	uint64_t start = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	uint64_t end = start + size;
	int error;
	rl_t *rl;
	dmu_tx_t *tx;

	ASSERT(zv && zv->zv_open_count > 0);

	if (end > zv->zv_volsize)
		return (SET_ERROR(EIO));

	/*
	 * Align the request to volume block boundaries when REQ_SECURE is
	 * available, but not requested. If we don't, then this will force
	 * dnode_free_range() to zero out the unaligned parts, which is slow
	 * (read-modify-write) and useless since we are not freeing any space
	 * by doing so. Kernels that do not support REQ_SECURE (2.6.32 through
	 * 2.6.35) will not receive this optimization.
	 */
#ifdef REQ_SECURE
	if (!(bio->bi_rw & REQ_SECURE)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN(end, zv->zv_volblocksize);
		size = end - start;
	}
#endif

	if (start >= end)
		return (0);

	rl = zfs_range_lock(&zv->zv_znode, start, size, RL_WRITER);
	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size, B_TRUE);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}

	zfs_range_unlock(rl);

	return (error);
}
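
The alignment above trims the discard inward to whole volume blocks, so dnode_free_range() never has to read-modify-write a partial block at either edge. A small sketch of that trimming with an assumed 8 KB volblocksize and made-up request offsets (macros as in sys/sysmacros.h).

#include <stdio.h>
#include <stdint.h>

#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))
#define	P2ALIGN(x, align)	((x) & -(align))

int
main(void)
{
	uint64_t volblocksize = 8192;		/* assumed zv_volblocksize */
	uint64_t start = 4096;			/* example request start */
	uint64_t end = 36864;			/* example request end */

	start = P2ROUNDUP(start, volblocksize);	/* 8192 */
	end = P2ALIGN(end, volblocksize);	/* 32768 */
	printf("discard [%llu, %llu), %llu bytes freed\n",
	    (unsigned long long)start, (unsigned long long)end,
	    (unsigned long long)(end - start));
	return (0);
}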
Example #8
static void
zvol_discard(void *arg)
{
	struct request *req = (struct request *)arg;
	struct request_queue *q = req->q;
	zvol_state_t *zv = q->queuedata;
	uint64_t start = blk_rq_pos(req) << 9;
	uint64_t end = start + blk_rq_bytes(req);
	int error;
	rl_t *rl;

	/*
	 * Annotate this call path with a flag that indicates that it is
	 * unsafe to use KM_SLEEP during memory allocations due to the
	 * potential for a deadlock.  KM_PUSHPAGE should be used instead.
	 */
	ASSERT(!(current->flags & PF_NOFS));
	current->flags |= PF_NOFS;

	if (end > zv->zv_volsize) {
		blk_end_request(req, -EIO, blk_rq_bytes(req));
		goto out;
	}

	/*
	 * Align the request to volume block boundaries. If we don't,
	 * then this will force dnode_free_range() to zero out the
	 * unaligned parts, which is slow (read-modify-write) and
	 * useless since we are not freeing any space by doing so.
	 */
	start = P2ROUNDUP(start, zv->zv_volblocksize);
	end = P2ALIGN(end, zv->zv_volblocksize);

	if (start >= end) {
		blk_end_request(req, 0, blk_rq_bytes(req));
		goto out;
	}

	rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER);

	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end - start);

	/*
	 * TODO: maybe we should add the operation to the log.
	 */

	zfs_range_unlock(rl);

	blk_end_request(req, -error, blk_rq_bytes(req));
out:
	current->flags &= ~PF_NOFS;
}
Example #9
File: zvol.c Project: alek-p/zfs
static int
zvol_discard(struct bio *bio)
{
	zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
	uint64_t start = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	uint64_t end = start + size;
	int error;
	rl_t *rl;
	dmu_tx_t *tx;

	ASSERT(zv && zv->zv_open_count > 0);

	if (end > zv->zv_volsize)
		return (SET_ERROR(EIO));

	/*
	 * Align the request to volume block boundaries when a secure erase is
	 * not required.  This will prevent dnode_free_range() from zeroing out
	 * the unaligned parts which is slow (read-modify-write) and useless
	 * since we are not freeing any space by doing so.
	 */
	if (!bio_is_secure_erase(bio)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN(end, zv->zv_volblocksize);
		size = end - start;
	}

	if (start >= end)
		return (0);

	rl = zfs_range_lock(&zv->zv_range_lock, start, size, RL_WRITER);
	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size, B_TRUE);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}

	zfs_range_unlock(rl);

	return (error);
}
Example #10
/*
 * Copy in a memory list from boot to kernel, with a filter function
 * to remove pages. The filter function can increase the address and/or
 * decrease the size to filter out pages.  It will also align addresses and
 * sizes to PAGESIZE.
 */
void
copy_memlist_filter(
	struct memlist *src,
	struct memlist **dstp,
	void (*filter)(uint64_t *, uint64_t *))
{
	struct memlist *dst, *prev;
	uint64_t addr;
	uint64_t size;
	uint64_t eaddr;

	dst = *dstp;
	prev = dst;

	/*
	 * Move through the memlist applying a filter against
	 * each range of memory. Note that we may apply the
	 * filter multiple times against each memlist entry.
	 */
	for (; src; src = src->ml_next) {
		addr = P2ROUNDUP(src->ml_address, PAGESIZE);
		eaddr = P2ALIGN(src->ml_address + src->ml_size, PAGESIZE);
		while (addr < eaddr) {
			size = eaddr - addr;
			if (filter != NULL)
				filter(&addr, &size);
			if (size == 0)
				break;
			dst->ml_address = addr;
			dst->ml_size = size;
			dst->ml_next = 0;
			if (prev == dst) {
				dst->ml_prev = 0;
				dst++;
			} else {
				dst->ml_prev = prev;
				prev->ml_next = dst;
				dst++;
				prev++;
			}
			addr += size;
		}
	}

	*dstp = dst;
}
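
The comment above describes the filter contract: a filter may only raise the address and/or shrink the size, and the loop re-applies it to whatever remains of the entry. A hypothetical filter of that shape, with an assumed reserved window, purely for illustration.

#include <stdio.h>
#include <stdint.h>

#define	RSV_START	0x00100000ULL	/* assumed reserved window start */
#define	RSV_END		0x00200000ULL	/* assumed reserved window end */

/* Trim [*addr, *addr + *size) so it does not enter the reserved window. */
static void
skip_reserved(uint64_t *addr, uint64_t *size)
{
	uint64_t start = *addr;
	uint64_t end = start + *size;

	if (end <= RSV_START || start >= RSV_END) {
		return;				/* no overlap, leave it */
	} else if (start < RSV_START) {
		*size = RSV_START - start;	/* keep the part below */
	} else if (end > RSV_END) {
		*addr = RSV_END;		/* skip past the window */
		*size = end - RSV_END;
	} else {
		*size = 0;			/* entirely reserved */
	}
}

int
main(void)
{
	uint64_t addr = 0x000c0000;	/* example range spanning the window */
	uint64_t size = 0x00100000;

	skip_reserved(&addr, &size);
	printf("kept addr=0x%llx size=0x%llx\n",	/* 0xc0000, 0x40000 */
	    (unsigned long long)addr, (unsigned long long)size);
	return (0);
}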
Example #11
File: zvol.c Project: avg-I/zfs
static void
zvol_discard(void *arg)
{
    struct request *req = (struct request *)arg;
    struct request_queue *q = req->q;
    zvol_state_t *zv = q->queuedata;
    fstrans_cookie_t cookie = spl_fstrans_mark();
    uint64_t start = blk_rq_pos(req) << 9;
    uint64_t end = start + blk_rq_bytes(req);
    int error;
    rl_t *rl;

    if (end > zv->zv_volsize) {
        error = EIO;
        goto out;
    }

    /*
     * Align the request to volume block boundaries. If we don't,
     * then this will force dnode_free_range() to zero out the
     * unaligned parts, which is slow (read-modify-write) and
     * useless since we are not freeing any space by doing so.
     */
    start = P2ROUNDUP(start, zv->zv_volblocksize);
    end = P2ALIGN(end, zv->zv_volblocksize);

    if (start >= end) {
        error = 0;
        goto out;
    }

    rl = zfs_range_lock(&zv->zv_znode, start, end - start, RL_WRITER);

    error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end-start);

    /*
     * TODO: maybe we should add the operation to the log.
     */

    zfs_range_unlock(rl);
out:
    blk_end_request(req, -error, blk_rq_bytes(req));
    spl_fstrans_unmark(cookie);
}
Example #12
File: zvol.c Project: koplover/zfs
static int
zvol_discard(struct bio *bio)
{
	zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
	uint64_t start = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	uint64_t end = start + size;
	int error;
	rl_t *rl;

	if (end > zv->zv_volsize)
		return (SET_ERROR(EIO));

	/*
	 * Align the request to volume block boundaries when REQ_SECURE is
	 * available, but not requested. If we don't, then this will force
	 * dnode_free_range() to zero out the unaligned parts, which is slow
	 * (read-modify-write) and useless since we are not freeing any space
	 * by doing so. Kernels that do not support REQ_SECURE (2.6.32 through
	 * 2.6.35) will not receive this optimization.
	 */
#ifdef REQ_SECURE
	if (!(bio->bi_rw & REQ_SECURE)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN(end, zv->zv_volblocksize);
		size = end - start;
	}
#endif

	if (start >= end)
		return (0);

	rl = zfs_range_lock(&zv->zv_znode, start, size, RL_WRITER);

	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size);

	/*
	 * TODO: maybe we should add the operation to the log.
	 */

	zfs_range_unlock(rl);

	return (error);
}
Example #13
void
fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	const fletcher_4_ops_t *ops;
	uint64_t p2size = P2ALIGN(size, 64);

	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

	if (size == 0) {
		ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
	} else if (p2size == 0) {
		ops = &fletcher_4_scalar_ops;
		fletcher_4_byteswap_impl(ops, buf, size, zcp);
	} else {
		ops = fletcher_4_impl_get();
		fletcher_4_byteswap_impl(ops, buf, p2size, zcp);

		if (p2size < size)
			fletcher_4_incremental_byteswap((char *)buf + p2size,
			    size - p2size, zcp);
	}
}
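
The split above hands the 64-byte-aligned prefix of the buffer to the selected (possibly vectorized) implementation and the tail, if any, to the scalar incremental code. A small sketch of the split arithmetic with an assumed buffer length (macro as in sys/sysmacros.h).

#include <stdio.h>
#include <stdint.h>

#define	P2ALIGN(x, align)	((x) & -(align))

int
main(void)
{
	uint64_t size = 1000;			/* example buffer length */
	uint64_t p2size = P2ALIGN(size, 64);	/* largest 64-byte multiple */

	printf("bulk bytes  = %llu\n",		/* 960 */
	    (unsigned long long)p2size);
	printf("scalar tail = %llu\n",		/* 40 */
	    (unsigned long long)(size - p2size));
	return (0);
}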
Example #14
/*ARGSUSED*/
void
fletcher_4_byteswap(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	const uint64_t p2size = P2ALIGN(size, 64);

	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));

	if (size == 0 || p2size == 0) {
		ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);

		if (size > 0)
			fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
			    buf, size);
	} else {
		fletcher_4_byteswap_impl(buf, p2size, zcp);

		if (p2size < size)
			fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
			    (char *)buf + p2size, size - p2size);
	}
}
Example #15
/*
 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 * and can induce severe lock contention when writing to several files
 * whose dnodes are in the same block.
 */
static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
    int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
	dmu_buf_t **dbp;
	uint64_t blkid, nblks, i;
	uint32_t dbuf_flags;
	int err;
	zio_t *zio;

	ASSERT(length <= DMU_MAX_ACCESS);

	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
	if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
		dbuf_flags |= DB_RF_NOPREFETCH;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
	} else {
Example #16
static int
kgrep_range_basic(uintptr_t base, uintptr_t lim, void *kg_arg)
{
	kgrep_data_t *kg = kg_arg;
	size_t pagesize = kg->kg_pagesize;
	uintptr_t pattern = kg->kg_pattern;
	uintptr_t *page = kg->kg_page;
	uintptr_t *page_end = &page[pagesize / sizeof (uintptr_t)];
	uintptr_t *pos;

	uintptr_t addr, offset;
	int seen = 0;

	/*
	 * page-align everything, to simplify the loop
	 */
	base = P2ALIGN(base, pagesize);
	lim = P2ROUNDUP(lim, pagesize);

	for (addr = base; addr < lim; addr += pagesize) {
		if (mdb_vread(page, pagesize, addr) == -1)
			continue;
		seen = 1;

		for (pos = page; pos < page_end; pos++) {
			if (*pos != pattern)
				continue;

			offset = (caddr_t)pos - (caddr_t)page;
			kgrep_cb(addr + offset, NULL, kg->kg_cbtype);
		}
	}
	if (seen)
		kg->kg_seen = 1;

	return (WALK_NEXT);
}
Example #17
/*
 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 * and can induce severe lock contention when writing to several files
 * whose dnodes are in the same block.
 */
static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
    uint64_t length, int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
	dsl_pool_t *dp = NULL;
	dmu_buf_t **dbp;
	uint64_t blkid, nblks, i;
	uint32_t flags;
	int err;
	zio_t *zio;
	hrtime_t start;

	ASSERT(length <= DMU_MAX_ACCESS);

	flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
	if (length > zfetch_array_rd_sz)
		flags |= DB_RF_NOPREFETCH;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
	} else {
Example #18
/*
 * Allocate an entry in the cache.  At this point we don't have the data,
 * we're just creating a placeholder so that multiple threads don't all
 * go off and read the same blocks.
 */
static vdev_cache_entry_t *
vdev_cache_allocate(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
	vdev_cache_entry_t *ve;

	ASSERT(MUTEX_HELD(&vc->vc_lock));

	if (zfs_vdev_cache_size == 0)
		return (NULL);

	/*
	 * If adding a new entry would exceed the cache size,
	 * evict the oldest entry (LRU).
	 */
	if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
	    zfs_vdev_cache_size) {
		ve = avl_first(&vc->vc_lastused_tree);
		if (ve->ve_fill_io != NULL)
			return (NULL);
		ASSERT3U(ve->ve_hits, !=, 0);
		vdev_cache_evict(vc, ve);
	}
Example #19
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
			/*
			 * Note: if "restarted", we may find a L0 that
			 * is not suitably aligned.
			 */
			os->os_obj_next_chunk =
			    P2ALIGN(object, dnodes_per_chunk) +
			    dnodes_per_chunk;
			(void) atomic_swap_64(cpuobj, object);
			mutex_exit(&os->os_obj_lock);
		}

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		(void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
		    dn_slots, FTAG, &dn);
		if (dn != NULL) {
			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
Example #20
				    bits >= 0; bits -= epbs)
					txh->txh_fudge += 1ULL << max_ibs;
				goto out;
			}
			off += delta;
			if (len >= delta)
				len -= delta;
			delta = dn->dn_datablksz;
		}
	}

	/*
	 * 'end' is the last thing we will access, not one past.
	 * This way we won't overflow when accessing the last byte.
	 */
	start = P2ALIGN(off, 1ULL << max_bs);
	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
	txh->txh_space_towrite += end - start + 1;

	start >>= min_bs;
	end >>= min_bs;

	epbs = min_ibs - SPA_BLKPTRSHIFT;

	/*
	 * The object contains at most 2^(64 - min_bs) blocks,
	 * and each indirect level maps 2^epbs.
	 */
	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
		start >>= epbs;
		end >>= epbs;
Example #21
/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 * Returns:
 *	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t	mof;
	buf_t		*bp;
	rollbuf_t	*rbp;
	mt_map_t	*logmap = ul->un_logmap;
	daddr_t		mblkno;
	int		i;
	int		error;
	int		nbuf;

	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}

	/*
	 * build some master blocks + deltas to roll forward
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);

		/*
		 * Check for the case of a new delta to a set up buffer
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				TNF_PROBE_0(trans_roll_new_delta, "lufs",
				    /* CSTYLED */);
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}

		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On first buffer wait for the logmap user
				 * to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));
Example #22
/*
 * Construct a stack for init containing the arguments to it, then
 * pass control to exec_common.
 */
int
exec_init(const char *initpath, const char *args)
{
	caddr32_t ucp;
	caddr32_t *uap;
	caddr32_t *argv;
	caddr32_t exec_fnamep;
	char *scratchargs;
	int i, sarg;
	size_t argvlen, alen;
	boolean_t in_arg;
	int argc = 0;
	int error = 0, count = 0;
	proc_t *p = ttoproc(curthread);
	klwp_t *lwp = ttolwp(curthread);
	int brand_action;

	if (args == NULL)
		args = "";

	alen = strlen(initpath) + 1 + strlen(args) + 1;
	scratchargs = kmem_alloc(alen, KM_SLEEP);
	(void) snprintf(scratchargs, alen, "%s %s", initpath, args);

	/*
	 * We do a quick two state parse of the string to sort out how big
	 * argc should be.
	 */
	in_arg = B_FALSE;
	for (i = 0; i < strlen(scratchargs); i++) {
		if (scratchargs[i] == ' ' || scratchargs[i] == '\0') {
			if (in_arg) {
				in_arg = B_FALSE;
				argc++;
			}
		} else {
			in_arg = B_TRUE;
		}
	}
	argvlen = sizeof (caddr32_t) * (argc + 1);
	argv = kmem_zalloc(argvlen, KM_SLEEP);

	/*
	 * We pull off a bit of a hack here.  We work our way through the
	 * args string, putting nulls at the ends of space delimited tokens
	 * (boot args don't support quoting at this time).  Then we just
	 * copy the whole mess to userland in one go.  In other words, we
	 * transform this: "init -s -r\0" into this on the stack:
	 *
	 *	-0x00 \0
	 *	-0x01 r
	 *	-0x02 -  <--------.
	 *	-0x03 \0	  |
	 *	-0x04 s		  |
	 *	-0x05 -  <------. |
	 *	-0x06 \0	| |
	 *	-0x07 t		| |
	 *	-0x08 i 	| |
	 *	-0x09 n		| |
	 *	-0x0a i  <---.  | |
	 *	-0x10 NULL   |  | |	(argv[3])
	 *	-0x14   -----|--|-'	(argv[2])
	 *	-0x18  ------|--'	(argv[1])
	 *	-0x1c -------'		(argv[0])
	 *
	 * Since we know the value of ucp at the beginning of this process,
	 * we can trivially compute the argv[] array which we also need to
	 * place in userland: argv[i] = ucp - sarg(i), where ucp is the
	 * stack ptr, and sarg is the string index of the start of the
	 * argument.
	 */
	ucp = (caddr32_t)(uintptr_t)p->p_usrstack;

	argc = 0;
	in_arg = B_FALSE;
	sarg = 0;

	for (i = 0; i < alen; i++) {
		if (scratchargs[i] == ' ' || scratchargs[i] == '\0') {
			if (in_arg == B_TRUE) {
				in_arg = B_FALSE;
				scratchargs[i] = '\0';
				argv[argc++] = ucp - (alen - sarg);
			}
		} else if (in_arg == B_FALSE) {
			in_arg = B_TRUE;
			sarg = i;
		}
	}
	ucp -= alen;
	error |= copyout(scratchargs, (caddr_t)(uintptr_t)ucp, alen);

	uap = (caddr32_t *)P2ALIGN((uintptr_t)ucp, sizeof (caddr32_t));
	uap--;	/* advance to be below the word we're in */
	uap -= (argc + 1);	/* advance argc words down, plus one for NULL */
	error |= copyout(argv, uap, argvlen);

	if (error != 0) {
		zcmn_err(p->p_zone->zone_id, CE_WARN,
		    "Could not construct stack for init.\n");
		kmem_free(argv, argvlen);
		kmem_free(scratchargs, alen);
		return (EFAULT);
	}

	exec_fnamep = argv[0];
	kmem_free(argv, argvlen);
	kmem_free(scratchargs, alen);

	/*
	 * Point at the arguments.
	 */
	lwp->lwp_ap = lwp->lwp_arg;
	lwp->lwp_arg[0] = (uintptr_t)exec_fnamep;
	lwp->lwp_arg[1] = (uintptr_t)uap;
	lwp->lwp_arg[2] = NULL;
	curthread->t_post_sys = 1;
	curthread->t_sysnum = SYS_execve;

	/*
	 * If we are executing init from zsched, we may have inherited its
	 * parent process's signal mask.  Clear it now so that we behave in
	 * the same way as when started from the global zone.
	 */
	sigemptyset(&curthread->t_hold);

	brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE;
again:
	error = exec_common((const char *)(uintptr_t)exec_fnamep,
	    (const char **)(uintptr_t)uap, NULL, brand_action);

	/*
	 * Normally we would just set lwp_argsaved and t_post_sys and
	 * let post_syscall reset lwp_ap for us.  Unfortunately,
	 * exec_init isn't always called from a system call.  Instead
	 * of making a mess of trap_cleanup, we just reset the args
	 * pointer here.
	 */
	reset_syscall_args();

	switch (error) {
	case 0:
		return (0);

	case ENOENT:
		zcmn_err(p->p_zone->zone_id, CE_WARN,
		    "exec(%s) failed (file not found).\n", initpath);
		return (ENOENT);

	case EAGAIN:
	case EINTR:
		++count;
		if (count < 5) {
			zcmn_err(p->p_zone->zone_id, CE_WARN,
			    "exec(%s) failed with errno %d.  Retrying...\n",
			    initpath, error);
			goto again;
		}
	}

	zcmn_err(p->p_zone->zone_id, CE_WARN,
	    "exec(%s) failed with errno %d.", initpath, error);
	return (error);
}
Example #23
/*
 * This is the prefetch entry point.  It calls all of the other dmu_zfetch
 * routines to create, delete, find, or operate upon prefetch streams.
 */
void
dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
{
	zstream_t	zst;
	zstream_t	*newstream;
	int		fetched;
	int		inserted;
	unsigned int	blkshft;
	uint64_t	blksz;

	if (zfs_prefetch_disable)
		return;

	/* files that aren't ln2 blocksz are only one block -- nothing to do */
	if (!zf->zf_dnode->dn_datablkshift)
		return;

	/* convert offset and size, into blockid and nblocks */
	blkshft = zf->zf_dnode->dn_datablkshift;
	blksz = (1 << blkshft);

	bzero(&zst, sizeof (zstream_t));
	zst.zst_offset = offset >> blkshft;
	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
	    P2ALIGN(offset, blksz)) >> blkshft;

	fetched = dmu_zfetch_find(zf, &zst, prefetched);
	if (!fetched) {
		fetched = dmu_zfetch_colinear(zf, &zst);
	}

	if (!fetched) {
		newstream = dmu_zfetch_stream_reclaim(zf);

		/*
		 * we still couldn't find a stream, drop the lock, and allocate
		 * one if possible.  Otherwise, give up and go home.
		 */
		if (newstream == NULL) {
			uint64_t	maxblocks;
			uint32_t	max_streams;
			uint32_t	cur_streams;

			cur_streams = zf->zf_stream_cnt;
			maxblocks = zf->zf_dnode->dn_maxblkid;

			max_streams = MIN(zfetch_max_streams,
			    (maxblocks / zfetch_block_cap));
			if (max_streams == 0) {
				max_streams++;
			}

			if (cur_streams >= max_streams) {
				return;
			}

			newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
		}

		newstream->zst_offset = zst.zst_offset;
		newstream->zst_len = zst.zst_len;
		newstream->zst_stride = zst.zst_len;
		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
		newstream->zst_cap = zst.zst_len;
		newstream->zst_direction = ZFETCH_FORWARD;
		newstream->zst_last = lbolt;

		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);

		rw_enter(&zf->zf_rwlock, RW_WRITER);
		inserted = dmu_zfetch_stream_insert(zf, newstream);
		rw_exit(&zf->zf_rwlock);

		if (!inserted) {
			mutex_destroy(&newstream->zst_lock);
			kmem_free(newstream, sizeof (zstream_t));
		}
	}
}
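
The zst_len computation above converts a byte range into a count of whole blocks: round the end up and the start down to the block size, then shift by the block shift. A small sketch with an assumed 128 KB block size and made-up offsets (macros as in sys/sysmacros.h).

#include <stdio.h>
#include <stdint.h>

#define	P2ALIGN(x, align)	((x) & -(align))
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	unsigned int blkshft = 17;		/* assumed 128 KB blocks */
	uint64_t blksz = 1ULL << blkshft;
	uint64_t offset = 100 * 1024;		/* example byte offset */
	uint64_t size = 300 * 1024;		/* example byte length */
	uint64_t nblks;

	nblks = (P2ROUNDUP(offset + size, blksz) -
	    P2ALIGN(offset, blksz)) >> blkshft;
	printf("blocks touched = %llu\n",	/* 4 */
	    (unsigned long long)nblks);
	return (0);
}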
Example #24
static inline slice_t *
slice_small_get_slice_from_row(void *buf, small_allocatable_row_t **row)
{
	(*row) = (small_allocatable_row_t *)buf;
	return (slice_t*)P2ALIGN((uint64_t)buf, (uint64_t)PAGE_SIZE);
}
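
A single P2ALIGN on the buffer address recovers the base of the page holding it, which is where this allocator keeps its slice_t header. A userland sketch of the same masking with an assumed 4 KB page size and a made-up address (macro as in sys/sysmacros.h).

#include <stdio.h>
#include <stdint.h>

#define	P2ALIGN(x, align)	((x) & -(align))

int
main(void)
{
	uint64_t pagesize = 4096;		/* assumed PAGE_SIZE */
	uint64_t buf = 0x7f32a4c013f8ULL;	/* example allocation address */

	printf("page base = 0x%llx\n",		/* 0x7f32a4c01000 */
	    (unsigned long long)P2ALIGN(buf, pagesize));
	return (0);
}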
Example #25
/*
 * Read data from the cache.  Returns 0 on cache hit, errno on a miss.
 */
int
vdev_cache_read(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, ve_search;
	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
	zio_t *fio;

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
		return (EINVAL);

	if (zio->io_size > zfs_vdev_cache_max)
		return (EOVERFLOW);

	/*
	 * If the I/O straddles two or more cache blocks, don't cache it.
	 */
	if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
		return (EXDEV);

	ASSERT(cache_phase + zio->io_size <= VCBS);

	mutex_enter(&vc->vc_lock);

	ve_search.ve_offset = cache_offset;
	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);

	if (ve != NULL) {
		if (ve->ve_missed_update) {
			mutex_exit(&vc->vc_lock);
			return (ESTALE);
		}

		if ((fio = ve->ve_fill_io) != NULL) {
			zio_vdev_io_bypass(zio);
			zio_add_child(zio, fio);
			mutex_exit(&vc->vc_lock);
			VDCSTAT_BUMP(vdc_stat_delegations);
			return (0);
		}

		vdev_cache_hit(vc, ve, zio);
		zio_vdev_io_bypass(zio);

		mutex_exit(&vc->vc_lock);
		VDCSTAT_BUMP(vdc_stat_hits);
		return (0);
	}

	ve = vdev_cache_allocate(zio);

	if (ve == NULL) {
		mutex_exit(&vc->vc_lock);
		return (ENOMEM);
	}

	fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
	    ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);

	ve->ve_fill_io = fio;
	zio_vdev_io_bypass(zio);
	zio_add_child(zio, fio);

	mutex_exit(&vc->vc_lock);
	zio_nowait(fio);
	VDCSTAT_BUMP(vdc_stat_misses);

	return (0);
}
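
The straddle test above (P2BOUNDARY) rejects reads that touch more than one cache block, since such an I/O could not be satisfied from a single cached block. A small userland sketch of that test with an assumed 64 KB stand-in for VCBS; the P2BOUNDARY definition follows the usual sys/sysmacros.h form.

#include <stdio.h>
#include <stdint.h>

#define	P2BOUNDARY(off, len, align) \
	(((off) ^ ((off) + (len) - 1)) > (align) - 1)

int
main(void)
{
	uint64_t vcbs = 64 * 1024;	/* assumed cache block size */

	/* entirely inside one block: cacheable */
	printf("%d\n", (int)P2BOUNDARY(70000ULL, 1024ULL, vcbs));	/* 0 */
	/* spans two blocks: not cached */
	printf("%d\n", (int)P2BOUNDARY(60000ULL, 10000ULL, vcbs));	/* 1 */
	return (0);
}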
Example #26
/*
 * This routine assumes that the stack grows downward.
 * Returns 0 on success, errno on failure.
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
	struct proc *p = curproc;
	size_t newsize;
	size_t oldsize;
	int    error;
	size_t pgsz;
	uint_t szc;
	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

	ASSERT(sp < p->p_usrstack);
	sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

	/*
	 * grow to growszc alignment but use current p->p_stkpageszc for
	 * the segvn_crargs szc passed to segvn_create. For memcntl to
	 * increase the szc, this allows the new extension segment to be
	 * concatenated successfully with the existing stack segment.
	 */
	if ((szc = growszc) != 0) {
		pgsz = page_get_pagesize(szc);
		ASSERT(pgsz > PAGESIZE);
		newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
		if (newsize > (size_t)p->p_stk_ctl) {
			szc = 0;
			pgsz = PAGESIZE;
			newsize = p->p_usrstack - sp;
		}
	} else {
		pgsz = PAGESIZE;
		newsize = p->p_usrstack - sp;
	}

	if (newsize > (size_t)p->p_stk_ctl) {
		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	oldsize = p->p_stksize;
	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

	if (newsize <= oldsize) {	/* prevent the stack from shrinking */
		return (0);
	}

	if (!(p->p_stkprot & PROT_EXEC)) {
		crargs.prot &= ~PROT_EXEC;
	}
	/*
	 * extend stack with the proposed new growszc, which is different
	 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
	 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
	 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
	 * if not aligned to szc's pgsz.
	 */
	if (szc > 0) {
		caddr_t oldsp = p->p_usrstack - oldsize;
		caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
		    pgsz);

		if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
			crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
			    AS_MAP_NO_LPOOB;
		} else if (oldsp == austk) {
			crargs.szc = szc;
		} else {
			crargs.szc = AS_MAP_STACK;
		}
	} else {
		crargs.szc = AS_MAP_NO_LPOOB;
	}
	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

	if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
	    segvn_create, &crargs)) != 0) {
		if (error == EAGAIN) {
			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
			    "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
		}
		return (error);
	}
	p->p_stksize = newsize;
	return (0);
}
Example #27
/*
 * Read data from the cache.  Returns 0 on cache hit, errno on a miss.
 */
int
vdev_cache_read(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, ve_search;
	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
	zio_t *fio;

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
		return (EINVAL);

	if (zio->io_size > zfs_vdev_cache_max)
		return (EOVERFLOW);

	/*
	 * If the I/O straddles two or more cache blocks, don't cache it.
	 */
	if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
		return (EXDEV);

	ASSERT(cache_phase + zio->io_size <= VCBS);

	mutex_enter(&vc->vc_lock);

	ve_search.ve_offset = cache_offset;
	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);

	if (ve != NULL) {
		if (ve->ve_missed_update) {
			mutex_exit(&vc->vc_lock);
			return (ESTALE);
		}

		if ((fio = ve->ve_fill_io) != NULL) {
			zio->io_delegate_next = fio->io_delegate_list;
			fio->io_delegate_list = zio;
			zio_vdev_io_bypass(zio);
			mutex_exit(&vc->vc_lock);
			return (0);
		}

		vdev_cache_hit(vc, ve, zio);
		zio_vdev_io_bypass(zio);

		mutex_exit(&vc->vc_lock);
		zio_next_stage(zio);
		return (0);
	}

	if (!(zio->io_flags & ZIO_FLAG_METADATA)) {
		mutex_exit(&vc->vc_lock);
		return (EINVAL);
	}

	ve = vdev_cache_allocate(zio);

	if (ve == NULL) {
		mutex_exit(&vc->vc_lock);
		return (ENOMEM);
	}

	fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
	    ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
	    vdev_cache_fill, ve);

	ve->ve_fill_io = fio;
	fio->io_delegate_list = zio;
	zio_vdev_io_bypass(zio);

	mutex_exit(&vc->vc_lock);
	zio_nowait(fio);

	return (0);
}
Example #28
/*
 * Returns an intersection of the [start, end] interval and the range specified
 * by -A flag [start_addr, end_addr]. Unspecified parts of the address range
 * have value INVALID_ADDRESS.
 *
 * The start_addr address is rounded down to the beginning of page and end_addr
 * is rounded up to the end of page.
 *
 * Returns the size of the resulting interval or zero if the interval is empty
 * or invalid.
 */
static size_t
adjust_addr_range(uintptr_t start, uintptr_t end, size_t psz,
                  uintptr_t *new_start, uintptr_t *new_end)
{
    uintptr_t from;		/* start_addr rounded down */
    uintptr_t to;		/* end_addr rounded up */

    /*
     * Round down the lower address of the range to the beginning of page.
     */
    if (start_addr == INVALID_ADDRESS) {
        /*
         * No start_addr specified by -A, the lower part of the interval
         * does not change.
         */
        *new_start = start;
    } else {
        from = P2ALIGN(start_addr, psz);
        /*
         * If end address is outside the range, return an empty
         * interval
         */
        if (end <  from) {
            *new_start = *new_end = 0;
            return (0);
        }
        /*
         * The adjusted start address is the maximum of requested start
         * and the aligned start_addr of the -A range.
         */
        *new_start = start < from ? from : start;
    }

    /*
     * Round up the higher address of the range to the end of page.
     */
    if (end_addr == INVALID_ADDRESS) {
        /*
         * No end_addr specified by -A, the upper part of the interval
         * does not change.
         */
        *new_end = end;
    } else {
        /*
         * If only one address is specified and it is the beginning of a
         * segment, get information about the whole segment. This
         * function is called once per segment and the 'end' argument is
         * always the end of a segment, so just use the 'end' value.
         */
        to = (end_addr == start_addr && start == start_addr) ?
             end :
             P2ALIGN(end_addr + psz, psz);
        /*
         * If start address is outside the range, return an empty
         * interval
         */
        if (start > to) {
            *new_start = *new_end = 0;
            return (0);
        }
        /*
         * The adjusted end address is the minimum of requested end
         * and the aligned end_addr of the -A range.
         */
        *new_end = end > to ? to : end;
    }

    /*
     * Make sure that the resulting interval is legal.
     */
    if (*new_end < *new_start)
        *new_start = *new_end = 0;

    /* Return the size of the interval */
    return (*new_end - *new_start);
}
Example #29
/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call grow_internal().
 * Returns 0 on success.
 */
static int
grow_lpg(caddr_t sp)
{
	struct proc *p = curproc;
	size_t pgsz;
	size_t len, newsize;
	caddr_t addr, saddr;
	caddr_t growend;
	int oszc, szc;
	int err;

	newsize = p->p_usrstack - sp;

	oszc = p->p_stkpageszc;
	pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
	szc = page_szc(pgsz);

	/*
	 * Covers two cases:
	 * 1. page_szc() returns -1 for invalid page size, so we want to
	 * ignore it in that case.
	 * 2. By design we never decrease page size, as it is more stable.
	 * This shouldn't happen as the stack never shrinks.
	 */
	if (szc <= oszc) {
		err = grow_internal(sp, oszc);
		/* failed, fall back to base page size */
		if (err != 0 && oszc != 0) {
			err = grow_internal(sp, 0);
		}
		return (err);
	}

	/*
	 * We've grown sufficiently to switch to a new page size.
	 * So we are going to remap the whole segment with the new page size.
	 */
	err = grow_internal(sp, szc);
	/* The grow with szc failed, so fall back to base page size. */
	if (err != 0) {
		if (szc != 0) {
			err = grow_internal(sp, 0);
		}
		return (err);
	}

	/*
	 * Round up stack pointer to a large page boundary and remap
	 * any pgsz pages in the segment already faulted in beyond that
	 * point.
	 */
	saddr = p->p_usrstack - p->p_stksize;
	addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
	growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
	len = growend - addr;
	/* Check that len is not negative. Update page size code for stack. */
	if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
		p->p_stkpageszc = szc;
	}

	ASSERT(err == 0);
	return (err);		/* should always be 0 */
}
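
The remap window above is the largest part of the stack that is fully aligned to the new page size: the stack base is rounded up and the stack top rounded down to pgsz, and only that interior region is promoted. A small sketch with an assumed 2 MB large page and made-up addresses (macros as in sys/sysmacros.h).

#include <stdio.h>
#include <stdint.h>

#define	P2ALIGN(x, align)	((x) & -(align))
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	uint64_t pgsz = 2ULL << 20;		/* assumed large page size */
	uint64_t usrstack = 0x7ffffff00000ULL;	/* example stack top */
	uint64_t stksize = 5ULL << 20;		/* example stack size */
	uint64_t saddr = usrstack - stksize;
	uint64_t addr = P2ROUNDUP(saddr, pgsz);
	uint64_t growend = P2ALIGN(usrstack, pgsz);

	/* [0x7fffffa00000, 0x7fffffe00000): two large pages promotable */
	printf("promote [0x%llx, 0x%llx), %llu large pages\n",
	    (unsigned long long)addr, (unsigned long long)growend,
	    (unsigned long long)((growend - addr) / pgsz));
	return (0);
}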