Example #1
File: dmu_object.c Project: harshada/zfs
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	objset_impl_t *osi = os->os;
	uint64_t object;
	uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
	    (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int restarted = B_FALSE;

	mutex_enter(&osi->os_obj_lock);
	for (;;) {
		object = osi->os_obj_next;
		/*
		 * Each time we polish off an L2 bp worth of dnodes
		 * (2^13 objects), move to another L2 bp that's still
		 * reasonably sparse (at most 1/4 full).  Look from the
		 * beginning once, but after that keep looking from here.
		 * If we can't find one, just keep going from here.
		 */
		if (P2PHASE(object, L2_dnode_count) == 0) {
			uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
			int error = dnode_next_offset(osi->os_meta_dnode,
			    DNODE_FIND_HOLE,
			    &offset, 2, DNODES_PER_BLOCK >> 2, 0);
			restarted = B_TRUE;
			if (error == 0)
				object = offset >> DNODE_SHIFT;
		}
		osi->os_obj_next = ++object;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		(void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE,
		    FTAG, &dn);
		if (dn)
			break;

		if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
			osi->os_obj_next = object - 1;
	}
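The allocator above only rescans for a sparse region when P2PHASE(object, L2_dnode_count) == 0, i.e. when the object number lands exactly on an L2-chunk boundary. A minimal standalone sketch of that boundary test, assuming the usual sys/sysmacros.h definition of P2PHASE (the alignment must be a power of two) and an illustrative chunk size of 2^13:

#include <stdio.h>
#include <stdint.h>

/* Assumed definition, matching the usual sys/sysmacros.h form. */
#define	P2PHASE(x, align)	((x) & ((align) - 1))

int
main(void)
{
	uint64_t chunk = 8192;	/* illustrative stand-in for L2_dnode_count (2^13) */

	/* The phase is zero exactly when object is a multiple of the chunk. */
	for (uint64_t object = 8190; object <= 8194; object++)
		printf("object %llu -> phase %llu%s\n",
		    (unsigned long long)object,
		    (unsigned long long)P2PHASE(object, chunk),
		    P2PHASE(object, chunk) == 0 ? "  (start of new chunk)" : "");
	return (0);
}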
Example #2
static void
vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
{
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);

	ASSERT(MUTEX_HELD(&vc->vc_lock));
	ASSERT(ve->ve_fill_io == NULL);

	if (ve->ve_lastused != lbolt) {
		avl_remove(&vc->vc_lastused_tree, ve);
		ve->ve_lastused = lbolt;
		avl_add(&vc->vc_lastused_tree, ve);
	}

	ve->ve_hits++;
	bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
}
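vdev_cache_hit() copies out of ve->ve_data at cache_phase, the byte offset of the request within its VCBS-sized cache block. A sketch of how P2ALIGN and P2PHASE split an offset into a block base plus an in-block phase; the macro definitions and the 64K VCBS value are assumptions for illustration:

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

/* Assumed definitions, in the usual sys/sysmacros.h style. */
#define	P2ALIGN(x, align)	((x) & -(align))
#define	P2PHASE(x, align)	((x) & ((align) - 1))

#define	VCBS	(1ULL << 16)	/* hypothetical 64K cache block size */

int
main(void)
{
	uint64_t io_offset = 0x123456;
	uint64_t base = P2ALIGN(io_offset, VCBS);	/* cache block start */
	uint64_t phase = P2PHASE(io_offset, VCBS);	/* offset within block */

	assert(base + phase == io_offset);
	printf("offset 0x%llx -> block 0x%llx + phase 0x%llx\n",
	    (unsigned long long)io_offset, (unsigned long long)base,
	    (unsigned long long)phase);
	return (0);
}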
Example #3
int
bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
{
	uint64_t blk, off;
	blkptr_t *bparray;
	int err;

	ASSERT(!BP_IS_HOLE(bp));
	mutex_enter(&bpl->bpl_lock);
	err = bplist_hold(bpl);
	if (err)
		return (err);

	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);

	err = bplist_cache(bpl, blk);
	if (err) {
		mutex_exit(&bpl->bpl_lock);
		return (err);
	}

	dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
	bparray = bpl->bpl_cached_dbuf->db_data;
	bparray[off] = *bp;

	/* We never need the fill count. */
	bparray[off].blk_fill = 0;

	/* The bplist will compress better if we can leave off the checksum */
	if (!BP_GET_DEDUP(&bparray[off]))
		bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));

	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
	bpl->bpl_phys->bpl_entries++;
	bpl->bpl_phys->bpl_bytes +=
	    bp_get_dsize_sync(dmu_objset_spa(bpl->bpl_mos), bp);
	if (bpl->bpl_havecomp) {
		bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
		bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
	}
	mutex_exit(&bpl->bpl_lock);

	return (0);
}
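bplist_enqueue() turns the running entry count into an on-disk position: the high bits (>> bpl_bpshift) pick the block and P2PHASE(entries, 1ULL << bpl_bpshift) picks the slot inside it. A small sketch of that decomposition, with an assumed P2PHASE definition and a hypothetical 1024 entries per block:

#include <stdio.h>
#include <stdint.h>

#define	P2PHASE(x, align)	((x) & ((align) - 1))	/* assumed definition */

int
main(void)
{
	uint64_t bpshift = 10;	/* hypothetical: 1024 entries per block */
	uint64_t entries[] = { 0, 1023, 1024, 5000 };

	for (int i = 0; i < 4; i++) {
		uint64_t blk = entries[i] >> bpshift;
		uint64_t off = P2PHASE(entries[i], 1ULL << bpshift);

		printf("entry %5llu -> block %llu, slot %llu\n",
		    (unsigned long long)entries[i],
		    (unsigned long long)blk, (unsigned long long)off);
	}
	return (0);
}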
Example #4
File: misc.c Project: sriramnrn/zfs-port
static void
umem_log_enter(const char *error_str, int serious)
{
#ifdef __native_client__
	if (s_zfslog_fd < 0) {
		s_zfslog_fd = open(ZVM_ZFS_LOG, O_WRONLY);
	}
	write(s_zfslog_fd, error_str, strlen(error_str));
#else
	int looped;
	char c;

	looped = 0;
#ifdef ECELERITY
	mem_printf(serious ? DCRITICAL : DINFO, "umem: %s", error_str);
#endif

	(void) mutex_lock(&umem_error_lock);

	while ((c = *error_str++) != '\0') {
		WRITE_AND_INC(umem_error_end, c);
		if (umem_error_end == umem_error_begin)
			looped = 1;
	}

	umem_error_buffer[umem_error_end] = 0;

	if (looped) {
		uint_t idx;
		umem_error_begin = P2PHASE(umem_error_end + 1, ERR_SIZE);

		idx = umem_error_begin;
		WRITE_AND_INC(idx, '.');
		WRITE_AND_INC(idx, '.');
		WRITE_AND_INC(idx, '.');
	}

	(void) mutex_unlock(&umem_error_lock);
#endif //__native_client__
}
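When the log buffer wraps, umem_log_enter() recomputes umem_error_begin with P2PHASE(umem_error_end + 1, ERR_SIZE), which only works because ERR_SIZE is a power of two. A standalone sketch of the same ring-buffer wrap; the macro definition and the 16-byte buffer size are assumptions:

#include <stdio.h>

#define	P2PHASE(x, align)	((x) & ((align) - 1))	/* assumed definition */

#define	ERR_SIZE	16	/* hypothetical; must be a power of two */

static char ring[ERR_SIZE];
static unsigned head;

/* Append one character, overwriting the oldest data once the buffer wraps. */
static void
ring_putc(char c)
{
	ring[head] = c;
	head = P2PHASE(head + 1, ERR_SIZE);	/* same wrap as umem_error_begin */
}

int
main(void)
{
	const char *msg = "umem: allocation failure";

	for (const char *p = msg; *p != '\0'; p++)
		ring_putc(*p);
	printf("head ends at %u of %u\n", head, ERR_SIZE);
	return (0);
}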
Example #5
/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	uint64_t start, end, i;
	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
	int err = 0;

	if (len == 0)
		return;

	min_bs = SPA_MINBLOCKSHIFT;
	max_bs = SPA_MAXBLOCKSHIFT;
	min_ibs = DN_MIN_INDBLKSHIFT;
	max_ibs = DN_MAX_INDBLKSHIFT;

	if (dn) {
		uint64_t history[DN_MAX_LEVELS];
		int nlvls = dn->dn_nlevels;
		int delta;

		/*
		 * For i/o error checking, read the first and last level-0
		 * blocks (if they are not aligned), and all the level-1 blocks.
		 */
		if (dn->dn_maxblkid == 0) {
			delta = dn->dn_datablksz;
			start = (off < dn->dn_datablksz) ? 0 : 1;
			end = (off+len <= dn->dn_datablksz) ? 0 : 1;
			if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
				if (err)
					goto out;
				delta -= off;
			}
		} else {
			zio_t *zio = zio_root(dn->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);

			/* first level-0 block */
			start = off >> dn->dn_datablkshift;
			if (P2PHASE(off, dn->dn_datablksz) ||
			    len < dn->dn_datablksz) {
				err = dmu_tx_check_ioerr(zio, dn, 0, start);
				if (err)
					goto out;
			}

			/* last level-0 block */
			end = (off+len-1) >> dn->dn_datablkshift;
			if (end != start && end <= dn->dn_maxblkid &&
			    P2PHASE(off+len, dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(zio, dn, 0, end);
				if (err)
					goto out;
			}

			/* level-1 blocks */
			if (nlvls > 1) {
				int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				for (i = (start>>shft)+1; i < end>>shft; i++) {
					err = dmu_tx_check_ioerr(zio, dn, 1, i);
					if (err)
						goto out;
				}
			}

			err = zio_wait(zio);
			if (err)
				goto out;
			delta = P2NPHASE(off, dn->dn_datablksz);
		}

		min_ibs = max_ibs = dn->dn_indblkshift;
		if (dn->dn_maxblkid > 0) {
			/*
			 * The blocksize can't change,
			 * so we can make a more precise estimate.
			 */
			ASSERT(dn->dn_datablkshift != 0);
			min_bs = max_bs = dn->dn_datablkshift;
		}

		/*
		 * If this write is not off the end of the file
		 * we need to account for overwrites/unref.
		 */
		if (start <= dn->dn_maxblkid) {
			for (int l = 0; l < DN_MAX_LEVELS; l++)
				history[l] = -1ULL;
		}
		while (start <= dn->dn_maxblkid) {
			dmu_buf_impl_t *db;

			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
			rw_exit(&dn->dn_struct_rwlock);

			if (err) {
				txh->txh_tx->tx_err = err;
				return;
			}

			dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
			    history);
			dbuf_rele(db, FTAG);
			if (++start > end) {
				/*
				 * Account for new indirects appearing
				 * before this IO gets assigned into a txg.
				 */
				bits = 64 - min_bs;
				epbs = min_ibs - SPA_BLKPTRSHIFT;
				for (bits -= epbs * (nlvls - 1);
				    bits >= 0; bits -= epbs)
					txh->txh_fudge += 1ULL << max_ibs;
				goto out;
			}
			off += delta;
			if (len >= delta)
				len -= delta;
			delta = dn->dn_datablksz;
		}
	}
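The partial-block tests above rely on two complementary macros: P2PHASE(off, blksz) is how far off sits past the previous block boundary (non-zero means the first block is only partially written), while P2NPHASE(off, blksz) is the distance to the next boundary, which is what the code keeps in delta. A sketch under the usual assumed definitions:

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

/* Assumed definitions, in the usual sys/sysmacros.h style. */
#define	P2PHASE(x, align)	((x) & ((align) - 1))
#define	P2NPHASE(x, align)	(-(x) & ((align) - 1))

int
main(void)
{
	uint64_t blksz = 4096;	/* hypothetical dn_datablksz */
	uint64_t offs[] = { 0, 100, 4096, 5000 };

	for (int i = 0; i < 4; i++) {
		uint64_t phase = P2PHASE(offs[i], blksz);
		uint64_t nphase = P2NPHASE(offs[i], blksz);

		/* Except on a boundary, the two always add up to blksz. */
		assert(phase == 0 ? nphase == 0 : phase + nphase == blksz);
		printf("off %4llu: phase %4llu, to next boundary %4llu\n",
		    (unsigned long long)offs[i], (unsigned long long)phase,
		    (unsigned long long)nphase);
	}
	return (0);
}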
Example #6
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
    znode_t *zp, offset_t off, ssize_t resid, int ioflag)
{
	uint32_t blocksize = zp->z_blksz;
	itx_wr_state_t write_state;
	uintptr_t fsync_cnt;

	if (zil_replaying(zilog, tx) || zp->z_unlinked)
		return;

	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
		write_state = WR_INDIRECT;
	else if (!spa_has_slogs(zilog->zl_spa) &&
	    resid >= zfs_immediate_write_sz)
		write_state = WR_INDIRECT;
	else if (ioflag & (FSYNC | FDSYNC))
		write_state = WR_COPIED;
	else
		write_state = WR_NEED_COPY;

	if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
		(void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
	}

	while (resid) {
		itx_t *itx;
		lr_write_t *lr;
		itx_wr_state_t wr_state = write_state;
		ssize_t len = resid;

		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
			wr_state = WR_NEED_COPY;
		else if (wr_state == WR_INDIRECT)
			len = MIN(blocksize - P2PHASE(off, blocksize), resid);

		itx = zil_itx_create(txtype, sizeof (*lr) +
		    (wr_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
		    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
			zil_itx_destroy(itx);
			itx = zil_itx_create(txtype, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			wr_state = WR_NEED_COPY;
		}

		itx->itx_wr_state = wr_state;
		lr->lr_foid = zp->z_id;
		lr->lr_offset = off;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zp->z_zfsvfs;

		if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) &&
		    (fsync_cnt == 0))
			itx->itx_sync = B_FALSE;

		zil_itx_assign(zilog, itx, tx);

		off += len;
		resid -= len;
	}
}
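For WR_INDIRECT records the length is clamped to MIN(blocksize - P2PHASE(off, blocksize), resid), so no single log record crosses a block boundary. A sketch of the same chunking loop with hypothetical numbers and an assumed P2PHASE definition:

#include <stdio.h>
#include <stdint.h>

#define	P2PHASE(x, align)	((x) & ((align) - 1))	/* assumed definition */
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t blocksize = 4096;	/* hypothetical zp->z_blksz */
	uint64_t off = 1000, resid = 10000;

	/* Emit one record per block touched, never crossing a boundary. */
	while (resid != 0) {
		uint64_t len = MIN(blocksize - P2PHASE(off, blocksize), resid);

		printf("record: off %5llu len %4llu\n",
		    (unsigned long long)off, (unsigned long long)len);
		off += len;
		resid -= len;
	}
	return (0);
}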
Example #7
/*
 * Load the space map from disk into the specified range tree. Segments of
 * maptype are added to the range tree, other segment types are removed.
 *
 * Note: space_map_load() will drop sm_lock across dmu_read() calls.
 * The caller must be OK with this.
 */
int
space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
{
	uint64_t *entry, *entry_map, *entry_map_end;
	uint64_t bufsize, size, offset, end, space;
	int error = 0;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	end = space_map_length(sm);
	space = space_map_allocated(sm);

	VERIFY0(range_tree_space(rt));

	if (maptype == SM_FREE) {
		range_tree_add(rt, sm->sm_start, sm->sm_size);
		space = sm->sm_size - space;
	}

	bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
	entry_map = zio_buf_alloc(bufsize);

	mutex_exit(sm->sm_lock);
	if (end > bufsize) {
		dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
		    end - bufsize, ZIO_PRIORITY_SYNC_READ);
	}
	mutex_enter(sm->sm_lock);

	for (offset = 0; offset < end; offset += bufsize) {
		size = MIN(end - offset, bufsize);
		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
		VERIFY(size != 0);
		ASSERT3U(sm->sm_blksz, !=, 0);

		dprintf("object=%llu  offset=%llx  size=%llx\n",
		    space_map_object(sm), offset, size);

		mutex_exit(sm->sm_lock);
		error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
		    entry_map, DMU_READ_PREFETCH);
		mutex_enter(sm->sm_lock);
		if (error != 0)
			break;

		entry_map_end = entry_map + (size / sizeof (uint64_t));
		for (entry = entry_map; entry < entry_map_end; entry++) {
			uint64_t e = *entry;
			uint64_t offset, size;

			if (SM_DEBUG_DECODE(e))		/* Skip debug entries */
				continue;

			offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
			    sm->sm_start;
			size = SM_RUN_DECODE(e) << sm->sm_shift;

			VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
			VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
			VERIFY3U(offset, >=, sm->sm_start);
			VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
			if (SM_TYPE_DECODE(e) == maptype) {
				VERIFY3U(range_tree_space(rt) + size, <=,
				    sm->sm_size);
				range_tree_add(rt, offset, size);
			} else {
				range_tree_remove(rt, offset, size);
			}
		}
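The VERIFY/VERIFY0 calls above are all P2PHASE-against-zero alignment checks: each read must be a whole number of 8-byte entries, and every decoded segment must be a multiple of the map's 1 << sm_shift granularity and fall inside [sm_start, sm_start + sm_size). A sketch of such a validation helper; the macro definition and the map parameters are illustrative, and the real space map entry encoding is not reproduced here:

#include <stdio.h>
#include <stdint.h>

#define	P2PHASE(x, align)	((x) & ((align) - 1))	/* assumed definition */

/*
 * Mirror of the VERIFY0(P2PHASE(...)) checks above: accept a segment only if
 * its offset and size are multiples of 1 << sm_shift and it lies inside the
 * map's address range.
 */
static int
segment_ok(uint64_t offset, uint64_t size, uint64_t sm_start,
    uint64_t sm_size, int sm_shift)
{
	uint64_t unit = 1ULL << sm_shift;

	return (P2PHASE(offset, unit) == 0 && P2PHASE(size, unit) == 0 &&
	    offset >= sm_start && offset + size <= sm_start + sm_size);
}

int
main(void)
{
	/* Hypothetical map: starts at 1M, covers 1M, 512-byte granularity. */
	uint64_t sm_start = 1 << 20, sm_size = 1 << 20;
	int sm_shift = 9;

	printf("aligned segment: %d\n",
	    segment_ok(sm_start + 4096, 8192, sm_start, sm_size, sm_shift));
	printf("misaligned segment: %d\n",
	    segment_ok(sm_start + 100, 8192, sm_start, sm_size, sm_shift));
	return (0);
}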
Example #8
/*
 * Read data from the cache.  Returns 0 on cache hit, errno on a miss.
 */
int
vdev_cache_read(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, ve_search;
	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
	zio_t *fio;

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
		return (EINVAL);

	if (zio->io_size > zfs_vdev_cache_max)
		return (EOVERFLOW);

	/*
	 * If the I/O straddles two or more cache blocks, don't cache it.
	 */
	if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
		return (EXDEV);

	ASSERT(cache_phase + zio->io_size <= VCBS);

	mutex_enter(&vc->vc_lock);

	ve_search.ve_offset = cache_offset;
	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);

	if (ve != NULL) {
		if (ve->ve_missed_update) {
			mutex_exit(&vc->vc_lock);
			return (ESTALE);
		}

		if ((fio = ve->ve_fill_io) != NULL) {
			zio_vdev_io_bypass(zio);
			zio_add_child(zio, fio);
			mutex_exit(&vc->vc_lock);
			VDCSTAT_BUMP(vdc_stat_delegations);
			return (0);
		}

		vdev_cache_hit(vc, ve, zio);
		zio_vdev_io_bypass(zio);

		mutex_exit(&vc->vc_lock);
		VDCSTAT_BUMP(vdc_stat_hits);
		return (0);
	}

	ve = vdev_cache_allocate(zio);

	if (ve == NULL) {
		mutex_exit(&vc->vc_lock);
		return (ENOMEM);
	}

	fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
	    ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);

	ve->ve_fill_io = fio;
	zio_vdev_io_bypass(zio);
	zio_add_child(zio, fio);

	mutex_exit(&vc->vc_lock);
	zio_nowait(fio);
	VDCSTAT_BUMP(vdc_stat_misses);

	return (0);
}
Example #9
/*
 * Read data from the cache.  Returns 0 on cache hit, errno on a miss.
 */
int
vdev_cache_read(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, ve_search;
	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
	zio_t *fio;

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
		return (EINVAL);

	if (zio->io_size > zfs_vdev_cache_max)
		return (EOVERFLOW);

	/*
	 * If the I/O straddles two or more cache blocks, don't cache it.
	 */
	if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
		return (EXDEV);

	ASSERT(cache_phase + zio->io_size <= VCBS);

	mutex_enter(&vc->vc_lock);

	ve_search.ve_offset = cache_offset;
	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);

	if (ve != NULL) {
		if (ve->ve_missed_update) {
			mutex_exit(&vc->vc_lock);
			return (ESTALE);
		}

		if ((fio = ve->ve_fill_io) != NULL) {
			zio->io_delegate_next = fio->io_delegate_list;
			fio->io_delegate_list = zio;
			zio_vdev_io_bypass(zio);
			mutex_exit(&vc->vc_lock);
			return (0);
		}

		vdev_cache_hit(vc, ve, zio);
		zio_vdev_io_bypass(zio);

		mutex_exit(&vc->vc_lock);
		zio_next_stage(zio);
		return (0);
	}

	if (!(zio->io_flags & ZIO_FLAG_METADATA)) {
		mutex_exit(&vc->vc_lock);
		return (EINVAL);
	}

	ve = vdev_cache_allocate(zio);

	if (ve == NULL) {
		mutex_exit(&vc->vc_lock);
		return (ENOMEM);
	}

	fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
	    ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
	    vdev_cache_fill, ve);

	ve->ve_fill_io = fio;
	fio->io_delegate_list = zio;
	zio_vdev_io_bypass(zio);

	mutex_exit(&vc->vc_lock);
	zio_nowait(fio);

	return (0);
}
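Both versions of vdev_cache_read() refuse to cache an I/O that straddles a cache-block boundary: the newer one tests P2BOUNDARY(off, size, VCBS), the older one applies P2CROSS to the first and last byte. For a power-of-two VCBS the two tests are equivalent; a sketch assuming the usual macro definitions and a hypothetical 64K VCBS:

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

/* Assumed definitions, in the usual sys/sysmacros.h style. */
#define	P2CROSS(x, y, align)	(((x) ^ (y)) > (align) - 1)
#define	P2BOUNDARY(off, len, align) \
	(((off) ^ ((off) + (len) - 1)) > (align) - 1)

#define	VCBS	(1ULL << 16)	/* hypothetical 64K cache block size */

int
main(void)
{
	struct { uint64_t off, size; } io[] = {
		{ 0x10000, 0x2000 },	/* fits inside one 64K block */
		{ 0x1f000, 0x2000 },	/* straddles the 0x20000 boundary */
	};

	for (int i = 0; i < 2; i++) {
		int b = P2BOUNDARY(io[i].off, io[i].size, VCBS);
		int c = P2CROSS(io[i].off, io[i].off + io[i].size - 1, VCBS);

		assert(b == c);		/* same answer either way */
		printf("off 0x%llx size 0x%llx straddles: %d\n",
		    (unsigned long long)io[i].off,
		    (unsigned long long)io[i].size, b);
	}
	return (0);
}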
Example #10
File: zvol.c Project: andreiw/polaris
int
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len,
    char *addr)
{
	dmu_object_info_t doi;
	ssize_t nbytes;
	itx_t *itx;
	lr_write_t *lr;
	objset_t *os;
	dmu_buf_t *db;
	uint64_t txg;
	uint64_t boff;
	int error;
	uint32_t blocksize;

	/* handle common case */
	if (len <= zvol_immediate_write_sz) {
		itx = zvol_immediate_itx(off, len, addr);
		(void) zil_itx_assign(zv->zv_zilog, itx, tx);
		return (0);
	}

	txg = dmu_tx_get_txg(tx);
	os = zv->zv_objset;

	/*
	 * We need to dmu_sync() each block in the range.
	 * For this we need the blocksize.
	 */
	error = dmu_object_info(os, ZVOL_OBJ, &doi);
	if (error)
		return (error);
	blocksize = doi.doi_data_block_size;

	/*
	 * We need to immediate write or dmu_sync() each block in the range.
	 */
	while (len) {
		nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
		if (nbytes <= zvol_immediate_write_sz) {
			itx = zvol_immediate_itx(off, nbytes, addr);
		} else {
			boff = P2ALIGN_TYPED(off, blocksize, uint64_t);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			lr->lr_foid = ZVOL_OBJ;
			lr->lr_offset = off;
			lr->lr_length = nbytes;
			lr->lr_blkoff = off - boff;
			BP_ZERO(&lr->lr_blkptr);

			/* XXX - we should do these IOs in parallel */
			VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, boff,
			    FTAG, &db));
			ASSERT(boff == db->db_offset);
			error = dmu_sync(NULL, db, &lr->lr_blkptr,
			    txg, NULL, NULL);
			dmu_buf_rele(db, FTAG);
			if (error) {
				kmem_free(itx, offsetof(itx_t, itx_lr));
				return (error);
			}
			itx->itx_wr_state = WR_COPIED;
		}
		(void) zil_itx_assign(zv->zv_zilog, itx, tx);
		len -= nbytes;
		off += nbytes;
	}
	return (0);
}
Example #11
uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;
	uint64_t *cpuobj = NULL;
	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;

	kpreempt_disable();
	cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
	    os->os_obj_next_percpu_len];
	kpreempt_enable();

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	/*
	 * The "chunk" of dnodes that is assigned to a CPU-specific
	 * allocator needs to be at least one block's worth, to avoid
	 * lock contention on the dbuf.  It can be at most one L1 block's
	 * worth, so that the "rescan after polishing off a L1's worth"
	 * logic below will be sure to kick in.
	 */
	if (dnodes_per_chunk < DNODES_PER_BLOCK)
		dnodes_per_chunk = DNODES_PER_BLOCK;
	if (dnodes_per_chunk > L1_dnode_count)
		dnodes_per_chunk = L1_dnode_count;

	object = *cpuobj;
	for (;;) {
		/*
		 * If we finished a chunk of dnodes, get a new one from
		 * the global allocator.
		 */
		if (P2PHASE(object, dnodes_per_chunk) == 0) {
			mutex_enter(&os->os_obj_lock);
			ASSERT0(P2PHASE(os->os_obj_next_chunk,
			    dnodes_per_chunk));
			object = os->os_obj_next_chunk;

			/*
			 * Each time we polish off a L1 bp worth of dnodes
			 * (2^12 objects), move to another L1 bp that's
			 * still reasonably sparse (at most 1/4 full). Look
			 * from the beginning at most once per txg. If we
			 * still can't allocate from that L1 block, search
			 * for an empty L0 block, which will quickly skip
			 * to the end of the metadnode if no nearby L0
			 * blocks are empty. This fallback avoids a
			 * pathology where full dnode blocks containing
			 * large dnodes appear sparse because they have a
			 * low blk_fill, leading to many failed allocation
			 * attempts. In the long term a better mechanism to
			 * search for sparse metadnode regions, such as
			 * spacemaps, could be implemented.
			 *
			 * os_scan_dnodes is set during txg sync if enough
			 * objects have been freed since the previous
			 * rescan to justify backfilling again.
			 *
			 * Note that dmu_traverse depends on the behavior
			 * that we use multiple blocks of the dnode object
			 * before going back to reuse objects.  Any change
			 * to this algorithm should preserve that property
			 * or find another solution to the issues described
			 * in traverse_visitbp.
			 */
			if (P2PHASE(object, L1_dnode_count) == 0) {
				uint64_t offset;
				uint64_t blkfill;
				int minlvl;
				int error;
				if (os->os_rescan_dnodes) {
					offset = 0;
					os->os_rescan_dnodes = B_FALSE;
				} else {
					offset = object << DNODE_SHIFT;
				}
				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
				minlvl = restarted ? 1 : 2;
				restarted = B_TRUE;
				error = dnode_next_offset(DMU_META_DNODE(os),
				    DNODE_FIND_HOLE, &offset, minlvl,
				    blkfill, 0);
				if (error == 0) {
					object = offset >> DNODE_SHIFT;
				}
			}
Example #12
int
zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
    uint64_t *destsizep, uint64_t *destbufsizep)
{
	uint64_t *word, *word_end;
	uint64_t ciosize, gapsize, destbufsize;
	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
	char *dest;
	uint_t allzero;

	ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
	ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);

	/*
	 * If the data is all zeroes, we don't even need to allocate
	 * a block for it.  We indicate this by setting *destsizep = 0.
	 */
	allzero = 1;
	word = src;
	word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
	while (word < word_end) {
		if (*word++ != 0) {
			allzero = 0;
			break;
		}
	}
	if (allzero) {
		*destp = NULL;
		*destsizep = 0;
		*destbufsizep = 0;
		return (1);
	}

	if (cpfunc == ZIO_COMPRESS_EMPTY)
		return (0);

	/* Compress at least 12.5% */
	destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
	if (destbufsize == 0)
		return (0);
	dest = zio_buf_alloc(destbufsize);

	ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
	    (size_t)destbufsize, ci->ci_level);
	if (ciosize > destbufsize) {
		zio_buf_free(dest, destbufsize);
		return (0);
	}

	/* Cool.  We compressed at least as much as we were hoping to. */

	/* For security, make sure we don't write random heap crap to disk */
	gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
	if (gapsize != 0) {
		bzero(dest + ciosize, gapsize);
		ciosize += gapsize;
	}

	ASSERT3U(ciosize, <=, destbufsize);
	ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
	*destp = dest;
	*destsizep = ciosize;
	*destbufsizep = destbufsize;

	return (1);
}
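The size arithmetic above rounds in both directions: P2ALIGN trims the 87.5% target down to a SPA_MINBLOCKSIZE multiple ("compress at least 12.5%"), and P2ROUNDUP pads the compressed length up to the next multiple, with the gap zeroed before it reaches disk. A worked sketch with concrete numbers and assumed macro definitions (512 bytes as the minimum block size):

#include <stdio.h>
#include <stdint.h>

/* Assumed definitions, in the usual sys/sysmacros.h style. */
#define	P2ALIGN(x, align)	((x) & -(align))
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))
#define	P2PHASE(x, align)	((x) & ((align) - 1))

#define	SPA_MINBLOCKSIZE	512	/* assumed 512-byte minimum block */

int
main(void)
{
	uint64_t srcsize = 5000;

	/* Largest sector-aligned buffer that still saves >= 12.5%. */
	uint64_t destbufsize = P2ALIGN(srcsize - (srcsize >> 3),
	    SPA_MINBLOCKSIZE);
	/* Pretend the compressor produced 3000 bytes. */
	uint64_t ciosize = 3000;
	uint64_t gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;

	printf("destbufsize %llu, ciosize %llu + gap %llu = %llu\n",
	    (unsigned long long)destbufsize, (unsigned long long)ciosize,
	    (unsigned long long)gapsize,
	    (unsigned long long)(ciosize + gapsize));
	printf("padded size is sector aligned: %d\n",
	    P2PHASE(ciosize + gapsize, SPA_MINBLOCKSIZE) == 0);
	return (0);
}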
Example #13
/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	uint64_t start, end, i;
	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
	int err = 0;

	if (len == 0)
		return;

	min_bs = SPA_MINBLOCKSHIFT;
	max_bs = SPA_MAXBLOCKSHIFT;
	min_ibs = DN_MIN_INDBLKSHIFT;
	max_ibs = DN_MAX_INDBLKSHIFT;


	/*
	 * For i/o error checking, read the first and last level-0
	 * blocks (if they are not aligned), and all the level-1 blocks.
	 */

	if (dn) {
		if (dn->dn_maxblkid == 0) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err)
				goto out;
		} else {
			zio_t *zio = zio_root(dn->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);

			/* first level-0 block */
			start = off >> dn->dn_datablkshift;
			if (P2PHASE(off, dn->dn_datablksz) ||
			    len < dn->dn_datablksz) {
				err = dmu_tx_check_ioerr(zio, dn, 0, start);
				if (err)
					goto out;
			}

			/* last level-0 block */
			end = (off+len-1) >> dn->dn_datablkshift;
			if (end != start &&
			    P2PHASE(off+len, dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(zio, dn, 0, end);
				if (err)
					goto out;
			}

			/* level-1 blocks */
			if (dn->dn_nlevels > 1) {
				start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				for (i = start+1; i < end; i++) {
					err = dmu_tx_check_ioerr(zio, dn, 1, i);
					if (err)
						goto out;
				}
			}

			err = zio_wait(zio);
			if (err)
				goto out;
		}
	}
Example #14
File: dmu_object.c Project: 64116278/zfs
uint64_t
dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	uint64_t object;
	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
	dnode_t *dn = NULL;
	int dn_slots = dnodesize >> DNODE_SHIFT;
	boolean_t restarted = B_FALSE;

	if (dn_slots == 0) {
		dn_slots = DNODE_MIN_SLOTS;
	} else {
		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
	}

	mutex_enter(&os->os_obj_lock);
	for (;;) {
		object = os->os_obj_next;
		/*
		 * Each time we polish off a L1 bp worth of dnodes (2^12
		 * objects), move to another L1 bp that's still
		 * reasonably sparse (at most 1/4 full). Look from the
		 * beginning at most once per txg. If we still can't
		 * allocate from that L1 block, search for an empty L0
		 * block, which will quickly skip to the end of the
		 * metadnode if no nearby L0 blocks are empty. This
		 * fallback avoids a pathology where full dnode blocks
		 * containing large dnodes appear sparse because they
		 * have a low blk_fill, leading to many failed
		 * allocation attempts. In the long term a better
		 * mechanism to search for sparse metadnode regions,
		 * such as spacemaps, could be implemented.
		 *
		 * os_scan_dnodes is set during txg sync if enough objects
		 * have been freed since the previous rescan to justify
		 * backfilling again.
		 *
		 * Note that dmu_traverse depends on the behavior that we use
		 * multiple blocks of the dnode object before going back to
		 * reuse objects.  Any change to this algorithm should preserve
		 * that property or find another solution to the issues
		 * described in traverse_visitbp.
		 */
		if (P2PHASE(object, L1_dnode_count) == 0) {
			uint64_t offset;
			uint64_t blkfill;
			int minlvl;
			int error;
			if (os->os_rescan_dnodes) {
				offset = 0;
				os->os_rescan_dnodes = B_FALSE;
			} else {
				offset = object << DNODE_SHIFT;
			}
			blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
			minlvl = restarted ? 1 : 2;
			restarted = B_TRUE;
			error = dnode_next_offset(DMU_META_DNODE(os),
			    DNODE_FIND_HOLE, &offset, minlvl, blkfill, 0);
			if (error == 0)
				object = offset >> DNODE_SHIFT;
		}
		os->os_obj_next = object + dn_slots;

		/*
		 * XXX We should check for an i/o error here and return
		 * up to our caller.  Actually we should pre-read it in
		 * dmu_tx_assign(), but there is currently no mechanism
		 * to do so.
		 */
		(void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
		    FTAG, &dn);
		if (dn)
			break;

		if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
			os->os_obj_next = object;
		else
			/*
			 * Skip to next known valid starting point for a dnode.
			 */
			os->os_obj_next = P2ROUNDUP(object + 1,
			    DNODES_PER_BLOCK);
	}
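In both dnsize-aware allocators above (examples #11 and #14), P2PHASE(object, L1_dnode_count) == 0 marks an L1 boundary, and the failure path skips ahead with P2ROUNDUP(object + 1, DNODES_PER_BLOCK) to the next known-valid dnode block start. A sketch of that skip, assuming the usual macro definitions; the DNODES_PER_BLOCK value of 32 is illustrative (16K metadnode block / 512-byte dnode):

#include <stdio.h>
#include <stdint.h>

/* Assumed definitions, in the usual sys/sysmacros.h style. */
#define	P2PHASE(x, align)	((x) & ((align) - 1))
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

#define	DNODES_PER_BLOCK	32	/* illustrative */

int
main(void)
{
	uint64_t objects[] = { 31, 32, 33, 100 };

	for (int i = 0; i < 4; i++) {
		/* Next known-valid dnode start: the following block boundary. */
		uint64_t next = P2ROUNDUP(objects[i] + 1, DNODES_PER_BLOCK);

		printf("object %3llu -> skip to %3llu (phase %llu)\n",
		    (unsigned long long)objects[i], (unsigned long long)next,
		    (unsigned long long)P2PHASE(next, DNODES_PER_BLOCK));
	}
	return (0);
}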
Example #15
/*
 * This routine assumes that the stack grows downward.
 * Returns 0 on success, errno on failure.
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
	struct proc *p = curproc;
	size_t newsize;
	size_t oldsize;
	int    error;
	size_t pgsz;
	uint_t szc;
	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

	ASSERT(sp < p->p_usrstack);
	sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

	/*
	 * grow to growszc alignment but use current p->p_stkpageszc for
	 * the segvn_crargs szc passed to segvn_create. For memcntl to
	 * increase the szc, this allows the new extension segment to be
	 * concatenated successfully with the existing stack segment.
	 */
	if ((szc = growszc) != 0) {
		pgsz = page_get_pagesize(szc);
		ASSERT(pgsz > PAGESIZE);
		newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
		if (newsize > (size_t)p->p_stk_ctl) {
			szc = 0;
			pgsz = PAGESIZE;
			newsize = p->p_usrstack - sp;
		}
	} else {
		pgsz = PAGESIZE;
		newsize = p->p_usrstack - sp;
	}

	if (newsize > (size_t)p->p_stk_ctl) {
		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	oldsize = p->p_stksize;
	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

	if (newsize <= oldsize) {	/* prevent the stack from shrinking */
		return (0);
	}

	if (!(p->p_stkprot & PROT_EXEC)) {
		crargs.prot &= ~PROT_EXEC;
	}
	/*
	 * extend stack with the proposed new growszc, which is different
	 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
	 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
	 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
	 * if not aligned to szc's pgsz.
	 */
	if (szc > 0) {
		caddr_t oldsp = p->p_usrstack - oldsize;
		caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
		    pgsz);

		if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
			crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
			    AS_MAP_NO_LPOOB;
		} else if (oldsp == austk) {
			crargs.szc = szc;
		} else {
			crargs.szc = AS_MAP_STACK;
		}
	} else {
		crargs.szc = AS_MAP_NO_LPOOB;
	}
	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

	if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
	    segvn_create, &crargs)) != 0) {
		if (error == EAGAIN) {
			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
			    "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
		}
		return (error);
	}
	p->p_stksize = newsize;
	return (0);
}
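grow_internal() rounds the fault address down to a page (or large-page) boundary with P2ALIGN and asserts alignment with P2PHASE and IS_P2ALIGNED. A sketch of that rounding, with assumed macro definitions and a hypothetical 4K page size:

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

/* Assumed definitions, in the usual sys/sysmacros.h style. */
#define	P2ALIGN(x, align)	((x) & -(align))
#define	P2PHASE(x, align)	((x) & ((align) - 1))
#define	IS_P2ALIGNED(v, a)	((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)

#define	PAGESIZE	4096	/* hypothetical base page size */

int
main(void)
{
	uintptr_t sp = 0x7fffffffd123;	/* hypothetical stack pointer */
	uintptr_t aligned = P2ALIGN(sp, (uintptr_t)PAGESIZE);

	/* Rounding down to a page boundary leaves no phase. */
	assert(P2PHASE(aligned, (uintptr_t)PAGESIZE) == 0);
	assert(IS_P2ALIGNED(aligned, PAGESIZE));

	printf("sp %#llx rounds down to %#llx\n",
	    (unsigned long long)sp, (unsigned long long)aligned);
	return (0);
}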
Example #16
File: dmu_tx.c Project: don-brady/zfs
/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	int err = 0;

	if (len == 0)
		return;

	(void) refcount_add_many(&txh->txh_space_towrite, len, FTAG);

	if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
		err = SET_ERROR(EFBIG);

	if (dn == NULL)
		return;

	/*
	 * For i/o error checking, read the blocks that will be needed
	 * to perform the write: the first and last level-0 blocks (if
	 * they are not aligned, i.e. if they are partial-block writes),
	 * and all the level-1 blocks.
	 */
	if (dn->dn_maxblkid == 0) {
		if (off < dn->dn_datablksz &&
		    (off > 0 || len < dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}
	} else {
		zio_t *zio = zio_root(dn->dn_objset->os_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);

		/* first level-0 block */
		uint64_t start = off >> dn->dn_datablkshift;
		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
			err = dmu_tx_check_ioerr(zio, dn, 0, start);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* last level-0 block */
		uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
		if (end != start && end <= dn->dn_maxblkid &&
		    P2PHASE(off + len, dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(zio, dn, 0, end);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* level-1 blocks */
		if (dn->dn_nlevels > 1) {
			int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			for (uint64_t i = (start >> shft) + 1;
			    i < end >> shft; i++) {
				err = dmu_tx_check_ioerr(zio, dn, 1, i);
				if (err != 0) {
					txh->txh_tx->tx_err = err;
				}
			}
		}

		err = zio_wait(zio);
		if (err != 0) {
			txh->txh_tx->tx_err = err;
		}
	}