Example #1
/*
 * __wt_btree_stat_init --
 *	Initialize the Btree statistics.
 */
int
__wt_btree_stat_init(WT_SESSION_IMPL *session, uint32_t flags)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_DSRC_STATS *stats;
	WT_PAGE *page;

	btree = S2BT(session);
	bm = btree->bm;
	stats = &btree->dhandle->stats;

	WT_RET(bm->stat(bm, session, stats));

	WT_STAT_SET(stats, btree_fixed_len, btree->bitcnt);
	WT_STAT_SET(stats, btree_maximum_depth, btree->maximum_depth);
	WT_STAT_SET(stats, btree_maxintlitem, btree->maxintlitem);
	WT_STAT_SET(stats, btree_maxintlpage, btree->maxintlpage);
	WT_STAT_SET(stats, btree_maxleafitem, btree->maxleafitem);
	WT_STAT_SET(stats, btree_maxleafpage, btree->maxleafpage);

	page = NULL;
	if (LF_ISSET(WT_STATISTICS_FAST))
		return (0);

	while ((ret = __wt_tree_walk(session, &page, 0)) == 0 && page != NULL)
		WT_RET(__stat_page(session, page, stats));
	return (ret == WT_NOTFOUND ? 0 : ret);
}
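For context, a minimal sketch of how an application reaches this code through the public wiredtiger.h API: the "statistics=(fast)" cursor configuration corresponds to the WT_STATISTICS_FAST path above. It assumes a connection opened with statistics enabled and a table named "table:example" (both placeholders).

#include <stdio.h>
#include <wiredtiger.h>

/*
 * print_btree_stats --
 *	Sketch: dump a table's data-source statistics through the public
 * statistics cursor.
 */
static int
print_btree_stats(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	const char *desc, *pvalue;
	int64_t value;
	int ret;

	if ((ret = session->open_cursor(session, "statistics:table:example",
	    NULL, "statistics=(fast)", &cursor)) != 0)
		return (ret);
	while ((ret = cursor->next(cursor)) == 0 &&
	    (ret = cursor->get_value(cursor, &desc, &pvalue, &value)) == 0)
		printf("%s = %s\n", desc, pvalue);
	(void)cursor->close(cursor);
	return (ret == WT_NOTFOUND ? 0 : ret);
}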
Example #2
/*
 * __wt_rec_track_wrapup_err --
 *	Resolve the page's list of tracked objects after an error occurs.
 */
int
__wt_rec_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_PAGE_MODIFY *mod;
	WT_PAGE_TRACK *track;
	uint32_t i;

	bm = session->btree->bm;

	/*
	 * After a failed reconciliation of a page, discard entries added in
	 * the current reconciliation: their information is incorrect.
	 * Additionally, clear the in-use flag in preparation for the next
	 * reconciliation.
	 */
	mod = page->modify;
	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i)
		if (F_ISSET(track, WT_TRK_JUST_ADDED)) {
			/*
			 * The in-use flag is used to avoid discarding backing
			 * blocks: if an object is both just-added and in-use,
			 * we allocated the blocks on this run, and we want to
			 * discard them on error.
			 */
			if (F_ISSET(track, WT_TRK_INUSE))
				WT_TRET(bm->free(bm, session,
				    track->addr.addr, track->addr.size));

			__wt_free(session, track->addr.addr);
			memset(track, 0, sizeof(*track));
		} else
			F_CLR(track, WT_TRK_INUSE);
	return (ret);
}
Example #3
/*
 * __meta_track_apply --
 *	Apply the changes in a metadata tracking record.
 */
static int
__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_RET;

	switch (trk->op) {
	case WT_ST_EMPTY:	/* Unused slot */
		break;
	case WT_ST_CHECKPOINT:	/* Checkpoint, see above */
		btree = trk->dhandle->handle;
		bm = btree->bm;
		WT_WITH_DHANDLE(session, trk->dhandle,
		    ret = bm->checkpoint_resolve(bm, session));
		break;
	case WT_ST_DROP_COMMIT:
		if ((ret =
		    __wt_block_manager_drop(session, trk->a, false)) != 0)
			__wt_err(session, ret,
			    "metadata remove dropped file %s", trk->a);
		break;
	case WT_ST_LOCK:
		WT_WITH_DHANDLE(session, trk->dhandle,
		    ret = __wt_session_release_btree(session));
		break;
	case WT_ST_FILEOP:
	case WT_ST_REMOVE:
	case WT_ST_SET:
		break;
	}

	__meta_track_clear(session, trk);
	return (ret);
}
Example #4
/*
 * __wt_btree_stat_init --
 *	Initialize the Btree statistics.
 */
int
__wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DSRC_STATS **stats;

	btree = S2BT(session);
	bm = btree->bm;
	stats = btree->dhandle->stats;

	WT_RET(bm->stat(bm, session, stats[0]));

	WT_STAT_SET(session, stats, btree_fixed_len, btree->bitcnt);
	WT_STAT_SET(session, stats, btree_maximum_depth, btree->maximum_depth);
	WT_STAT_SET(session, stats, btree_maxintlkey, btree->maxintlkey);
	WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage);
	WT_STAT_SET(session, stats, btree_maxleafkey, btree->maxleafkey);
	WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage);
	WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue);

	WT_STAT_SET(session, stats, cache_bytes_inuse,
	    __wt_btree_bytes_inuse(session));

	if (F_ISSET(cst, WT_STAT_TYPE_CACHE_WALK))
		__wt_curstat_cache_walk(session);

	if (F_ISSET(cst, WT_STAT_TYPE_TREE_WALK))
		WT_RET(__stat_tree_walk(session));

	return (0);
}
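The btree_max* statistics set above echo the table's creation-time sizing configuration. A sketch using the public API, with placeholder sizes (not tuning recommendations):

#include <wiredtiger.h>

/*
 * create_sized_table --
 *	Sketch: the internal_page_max/leaf_page_max/leaf_key_max/
 * leaf_value_max configuration keys feed the btree_max* statistics.
 */
static int
create_sized_table(WT_SESSION *session)
{
	return (session->create(session, "table:example",
	    "key_format=S,value_format=S,"
	    "internal_page_max=16KB,leaf_page_max=32KB,"
	    "leaf_key_max=256,leaf_value_max=1MB"));
}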
Example #5
/*
 * __wt_compact_page_skip --
 *	Return if the block-manager wants us to re-write this page.
 */
int
__wt_compact_page_skip(
    WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref, int *skipp)
{
	WT_BM *bm;
	uint32_t addr_size;
	const uint8_t *addr;

	bm = S2BT(session)->bm;

	/*
	 * There's one compaction test we do before we read the page, to see
	 * if the block-manager thinks it useful to rewrite the page.  If a
	 * rewrite won't help, we don't want to do I/O for nothing.  For that
	 * reason, this check is done in a call from inside the tree-walking
	 * routine.
	 *
	 * Ignore everything but on-disk pages, we've already done a pass over
	 * the in-memory pages.
	 */
	if (ref->state != WT_REF_DISK) {
		*skipp = 1;
		return (0);
	}

	__wt_get_addr(parent, ref, &addr, &addr_size);
	if (addr == NULL) {
		*skipp = 1;
		return (0);
	}

	return (bm->compact_page_skip(bm, session, addr, addr_size, skipp));
}
Example #6
/*
 * __wt_compact_page_skip --
 *	Return if compaction requires we read this page.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_BM *bm;
    size_t addr_size;
    u_int type;
    const uint8_t *addr;

    *skipp = false;				/* Default to reading. */
    type = 0;				/* Keep compiler quiet. */

    bm = S2BT(session)->bm;

    /*
     * We aren't holding a hazard pointer, so we can't look at the page
     * itself, all we can look at is the WT_REF information.  If there's no
     * address, the page isn't on disk, but we have to read internal pages
     * to walk the tree regardless; throw up our hands and read it.
     */
    __wt_ref_info(ref, &addr, &addr_size, &type);
    if (addr == NULL)
        return (0);

    /*
     * Internal pages must be read to walk the tree; ask the block-manager
     * if it's useful to rewrite leaf pages, don't do the I/O if a rewrite
     * won't help.
     */
    return (type == WT_CELL_ADDR_INT ? 0 :
            bm->compact_page_skip(bm, session, addr, addr_size, skipp));
}
Example #7
/*
 * __compact_end --
 *	End object compaction.
 */
static int
__compact_end(WT_SESSION_IMPL *session)
{
	WT_BM *bm;

	bm = S2BT(session)->bm;
	return (bm->compact_end(bm, session));
}
Example #8
/*
 * __wt_compact --
 *	Compact a file.
 */
int
__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_BM *bm;
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	WT_PAGE *page;
	int trigger, skip;

	bm = S2BT(session)->bm;

	WT_DSTAT_INCR(session, session_compact);

	WT_RET(__wt_config_gets(session, cfg, "trigger", &cval));
	trigger = (int)cval.val;

	/* Check if compaction might be useful. */
	WT_RET(bm->compact_skip(bm, session, trigger, &skip));
	if (skip)
		return (0);

	/*
	 * Walk the cache reviewing in-memory pages to see if they need to be
	 * re-written.  This requires looking at page reconciliation results,
	 * which means the page cannot be reconciled at the same time as it's
	 * being reviewed for compaction.  The underlying functions ensure we
	 * don't collide with page eviction, but we need to make sure we don't
	 * collide with checkpoints either, they are the other operation that
	 * can reconcile a page.
	 */
	__wt_spin_lock(session, &S2C(session)->metadata_lock);
	WT_RET(__wt_bt_cache_op(session, NULL, WT_SYNC_COMPACT));
	__wt_spin_unlock(session, &S2C(session)->metadata_lock);

	/*
	 * Walk the tree, reviewing on-disk pages to see if they need to be
	 * re-written.
	 */
	for (page = NULL;;) {
		WT_RET(__wt_tree_walk(session, &page, WT_TREE_COMPACT));
		if (page == NULL)
			break;

		/*
		 * The only pages returned by the tree walk function are pages
		 * we want to re-write; mark the page and tree dirty.
		 */
		if ((ret = __wt_page_modify_init(session, page)) != 0) {
			WT_TRET(__wt_page_release(session, page));
			WT_RET(ret);
		}
		__wt_page_and_tree_modify_set(session, page);

		WT_DSTAT_INCR(session, btree_compact_rewrite);
	}

	return (0);
}
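The public entry point that drives __wt_compact; a sketch with a placeholder URI (the "timeout" key, in seconds, is available in recent releases):

#include <wiredtiger.h>

/*
 * compact_table --
 *	Sketch: WT_SESSION::compact blocks until compaction finishes or the
 * optional timeout expires.
 */
static int
compact_table(WT_SESSION *session)
{
	return (session->compact(session, "table:example", "timeout=120"));
}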
Example #9
/*
 * __ovfl_reuse_wrapup_err --
 *	Resolve the page's overflow reuse list after an error occurs.
 */
static int
__ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_OVFL_REUSE **e, **head, *reuse;
	size_t decr;
	int i;

	bm = S2BT(session)->bm;
	head = page->modify->ovfl_track->ovfl_reuse;

	/*
	 * Discard any overflow records that were just added, freeing underlying
	 * blocks.
	 *
	 * First, walk the overflow reuse lists (except for the lowest one),
	 * fixing up skiplist links.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
		for (e = &head[i]; (reuse = *e) != NULL;) {
			if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) {
				e = &reuse->next[i];
				continue;
			}
			*e = reuse->next[i];
		}

	/*
	 * Second, discard any overflow record with a just-added flag, clear the
	 * flags for the next run.
	 */
	decr = 0;
	for (e = &head[0]; (reuse = *e) != NULL;) {
		if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) {
			F_CLR(reuse, WT_OVFL_REUSE_INUSE);
			e = &reuse->next[0];
			continue;
		}
		*e = reuse->next[0];

		if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
			WT_RET(
			    __ovfl_reuse_verbose(session, page, reuse, "free"));

		WT_TRET(bm->free(
		    bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
		decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE);
		__wt_free(session, reuse);
	}

	if (decr != 0)
		__wt_cache_page_inmem_decr(session, page, decr);
	return (ret);
}
Example #10
/*
 * __compact_rewrite --
 *	Return if a page needs to be re-written.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	size_t addr_size;
	const uint8_t *addr;

	*skipp = true;					/* Default skip. */

	bm = S2BT(session)->bm;
	page = ref->page;
	mod = page->modify;

	/*
	 * Ignore the root: it may not have a replacement address, and besides,
	 * if anything else gets written, so will it.
	 */
	if (__wt_ref_is_root(ref))
		return (0);

	/* Ignore currently dirty pages, they will be written regardless. */
	if (__wt_page_is_modified(page))
		return (0);

	/*
	 * If the page is clean, test the original addresses.
	 * If the page is a 1-to-1 replacement, test the replacement addresses.
	 * Ignore empty pages, they get merged into the parent.
	 */
	if (mod == NULL || mod->rec_result == 0) {
		WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
		if (addr == NULL)
			return (0);
		WT_RET(
		    bm->compact_page_skip(bm, session, addr, addr_size, skipp));
	} else if (mod->rec_result == WT_PM_REC_REPLACE) {
		/*
		 * The page's modification information can change underfoot if
		 * the page is being reconciled, serialize with reconciliation.
		 */
		WT_RET(__wt_fair_lock(session, &page->page_lock));

		ret = bm->compact_page_skip(bm, session,
		    mod->mod_replace.addr, mod->mod_replace.size, skipp);

		WT_TRET(__wt_fair_unlock(session, &page->page_lock));
		WT_RET(ret);
	}
	return (0);
}
Example #11
/*
 * __wt_btree_stat_init --
 *	Initialize the Btree statistics.
 */
int
__wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_DSRC_STATS **stats;
	WT_REF *next_walk;

	btree = S2BT(session);
	bm = btree->bm;
	stats = btree->dhandle->stats;

	WT_RET(bm->stat(bm, session, stats[0]));

	WT_STAT_SET(session, stats, btree_fixed_len, btree->bitcnt);
	WT_STAT_SET(session, stats, btree_maximum_depth, btree->maximum_depth);
	WT_STAT_SET(session, stats, btree_maxintlkey, btree->maxintlkey);
	WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage);
	WT_STAT_SET(session, stats, btree_maxleafkey, btree->maxleafkey);
	WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage);
	WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue);

	/* Everything else is really, really expensive. */
	if (!F_ISSET(cst, WT_CONN_STAT_ALL))
		return (0);

	/*
	 * Clear the statistics we're about to count.
	 */
	WT_STAT_SET(session, stats, btree_column_deleted, 0);
	WT_STAT_SET(session, stats, btree_column_fix, 0);
	WT_STAT_SET(session, stats, btree_column_internal, 0);
	WT_STAT_SET(session, stats, btree_column_rle, 0);
	WT_STAT_SET(session, stats, btree_column_variable, 0);
	WT_STAT_SET(session, stats, btree_entries, 0);
	WT_STAT_SET(session, stats, btree_overflow, 0);
	WT_STAT_SET(session, stats, btree_row_internal, 0);
	WT_STAT_SET(session, stats, btree_row_leaf, 0);

	next_walk = NULL;
	while ((ret = __wt_tree_walk(
	    session, &next_walk, 0)) == 0 && next_walk != NULL) {
		WT_WITH_PAGE_INDEX(session,
		    ret = __stat_page(session, next_walk->page, stats));
		WT_RET(ret);
	}
	return (ret == WT_NOTFOUND ? 0 : ret);
}
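The WT_CONN_STAT_ALL test above ("really, really expensive") is controlled by connection configuration; a sketch with a placeholder home directory:

#include <wiredtiger.h>

/*
 * open_with_all_stats --
 *	Sketch: enable full statistics collection, including the tree-walk
 * counts cleared and recomputed above.
 */
static int
open_with_all_stats(WT_CONNECTION **connp)
{
	return (wiredtiger_open("WT_HOME", NULL,
	    "create,statistics=(all)", connp));
}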
Example #12
/*
 * __wt_addr_string --
 *	Load a buffer with a printable, nul-terminated representation of an
 * address.
 */
const char *
__wt_addr_string(
    WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, uint32_t size)
{
	WT_BM *bm;

	bm = S2BT(session)->bm;

	if (addr == NULL) {
		buf->data = "[NoAddr]";
		buf->size = WT_STORE_SIZE(strlen("[NoAddr]"));
	} else if (bm->addr_string(bm, session, buf, addr, size) != 0) {
		buf->data = "[Error]";
		buf->size = WT_STORE_SIZE(strlen("[Error]"));
	}
	return (buf->data);
}
Example #13
/*
 * __wt_ovfl_discard --
 *	Discard an on-page overflow value, and reset the page's cell.
 */
int
__wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_DECL_RET;

	btree = S2BT(session);
	bm = btree->bm;
	unpack = &_unpack;

	__wt_cell_unpack(cell, unpack);

	/*
	 * Finally remove overflow key/value objects, called when reconciliation
	 * finishes after successfully writing a page.
	 *
	 * Keys must have already been instantiated and value objects must have
	 * already been cached (if they might potentially still be read by any
	 * running transaction).
	 *
	 * Acquire the overflow lock to avoid racing with a thread reading the
	 * backing overflow blocks.
	 */
	WT_RET(__wt_writelock(session, btree->ovfl_lock));

	switch (unpack->raw) {
	case WT_CELL_KEY_OVFL:
		__wt_cell_type_reset(session,
		    unpack->cell, WT_CELL_KEY_OVFL, WT_CELL_KEY_OVFL_RM);
		break;
	case WT_CELL_VALUE_OVFL:
		__wt_cell_type_reset(session,
		    unpack->cell, WT_CELL_VALUE_OVFL, WT_CELL_VALUE_OVFL_RM);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	WT_TRET(__wt_writeunlock(session, btree->ovfl_lock));

	/* Free the backing disk blocks. */
	WT_TRET(bm->free(bm, session, unpack->data, unpack->size));

	return (ret);
}
Example #14
/*
 * __wt_debug_addr --
 *	Read and dump a disk page in debugging mode, using an addr/size pair.
 */
int
__wt_debug_addr(WT_SESSION_IMPL *session,
    const uint8_t *addr, size_t addr_size, const char *ofile)
{
	WT_BM *bm;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;

	bm = S2BT(session)->bm;

	WT_RET(__wt_scr_alloc(session, 1024, &buf));
	WT_ERR(bm->read(bm, session, buf, addr, addr_size));
	ret = __wt_debug_disk(session, buf->mem, ofile);

err:	__wt_scr_free(session, &buf);
	return (ret);
}
Example #15
/*
 * __wt_addr_string --
 *	Load a buffer with a printable, nul-terminated representation of an
 * address.
 */
const char *
__wt_addr_string(WT_SESSION_IMPL *session,
    const uint8_t *addr, size_t addr_size, WT_ITEM *buf)
{
	WT_BM *bm;
	WT_BTREE *btree;

	btree = S2BT_SAFE(session);

	if (addr == NULL) {
		buf->data = "[NoAddr]";
		buf->size = strlen("[NoAddr]");
	} else if (btree == NULL || (bm = btree->bm) == NULL ||
	    bm->addr_string(bm, session, buf, addr, addr_size) != 0) {
		buf->data = "[Error]";
		buf->size = strlen("[Error]");
	}
	return (buf->data);
}
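A sketch combining __wt_ref_info (Example #6's signature) with the __wt_addr_string variant above to print a WT_REF's on-disk location. These are internal interfaces, and both signatures changed across the releases excerpted here, so treat this as illustrative only:

/*
 * __debug_ref_location --
 *	Sketch: print a WT_REF's block address, "[NoAddr]" if the page isn't
 * backed by disk.
 */
static int
__debug_ref_location(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	size_t addr_size;
	u_int type;
	const uint8_t *addr;

	WT_RET(__wt_scr_alloc(session, 0, &buf));
	__wt_ref_info(ref, &addr, &addr_size, &type);
	ret = __wt_msg(session, "ref %p: %s", (void *)ref,
	    __wt_addr_string(session, addr, addr_size, buf));
	__wt_scr_free(session, &buf);
	return (ret);
}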
Example #16
/*
 * __cursor_size_chk --
 *	Return if an inserted item is too large.
 */
static inline int
__cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_RET;
	size_t size;

	btree = S2BT(session);
	bm = btree->bm;

	if (btree->type == BTREE_COL_FIX) {
		/* Fixed-size column-stores take a single byte. */
		if (kv->size != 1)
			WT_RET_MSG(session, EINVAL,
			    "item size of %" WT_SIZET_FMT " does not match "
			    "fixed-length file requirement of 1 byte",
			    kv->size);
		return (0);
	}

	/* Don't waste effort, 1GB is always cool. */
	if (kv->size <= WT_GIGABYTE)
		return (0);

	/*
	 * There are two checks: what we are willing to store in the tree, and
	 * what the block manager can actually write.
	 */
	if (kv->size > WT_BTREE_MAX_OBJECT_SIZE)
		ret = EINVAL;
	else {
		size = kv->size;
		ret = bm->write_size(bm, session, &size);
	}
	if (ret != 0)
		WT_RET_MSG(session, ret,
		    "item size of %" WT_SIZET_FMT " exceeds the maximum "
		    "supported size",
		    kv->size);
	return (0);
}
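From the application side this check surfaces as EINVAL from WT_CURSOR::insert. A sketch, assuming a cursor opened on a table created with key_format=S,value_format=u (both placeholders):

#include <errno.h>
#include <stdio.h>
#include <wiredtiger.h>

/*
 * try_big_insert --
 *	Sketch: insert a raw item; oversized items are rejected with EINVAL
 * by the size check above.
 */
static int
try_big_insert(WT_CURSOR *cursor, const void *data, size_t size)
{
	WT_ITEM value;
	int ret;

	value.data = data;
	value.size = size;
	cursor->set_key(cursor, "big-key");
	cursor->set_value(cursor, &value);
	if ((ret = cursor->insert(cursor)) == EINVAL)
		fprintf(stderr,
		    "item of %zu bytes exceeds the maximum supported size\n",
		    size);
	return (ret);
}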
Example #17
/*
 * __wt_block_manager_open --
 *	Open a file.
 */
int
__wt_block_manager_open(WT_SESSION_IMPL *session,
    const char *filename, const char *cfg[], int forced_salvage, WT_BM **bmp)
{
	WT_BM *bm;
	WT_DECL_RET;

	*bmp = NULL;

	WT_RET(__wt_calloc_def(session, 1, &bm));
	__bm_method_set(bm, 0);

	WT_ERR(__wt_block_open(
	    session, filename, cfg, forced_salvage, &bm->block));

	*bmp = bm;
	return (0);

err:	WT_TRET(bm->close(bm, session));
	return (ret);
}
Example #18
/*
 * __wt_block_manager_open --
 *	Open a file.
 */
int
__wt_block_manager_open(WT_SESSION_IMPL *session,
    const char *filename, const char *cfg[],
    bool forced_salvage, bool readonly, uint32_t allocsize, WT_BM **bmp)
{
	WT_BM *bm;
	WT_DECL_RET;

	*bmp = NULL;

	WT_RET(__wt_calloc_one(session, &bm));
	__bm_method_set(bm, false);

	WT_ERR(__wt_block_open(session, filename, cfg,
	    forced_salvage, readonly, allocsize, &bm->block));

	*bmp = bm;
	return (0);

err:	WT_TRET(bm->close(bm, session));
	return (ret);
}
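The readonly and allocsize arguments added in this later variant flow down from configuration: "readonly" at wiredtiger_open (available in recent releases) and "allocation_size" at WT_SESSION::create. A sketch with a placeholder home directory:

#include <wiredtiger.h>

/*
 * open_readonly --
 *	Sketch: open an existing database read-only; the block manager then
 * opens the underlying files read-only as well.
 */
static int
open_readonly(WT_CONNECTION **connp)
{
	return (wiredtiger_open("WT_HOME", NULL, "readonly=true", connp));
}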
Example #19
/*
 * __compact_rewrite --
 *	Return if a page needs to be re-written; set skip = 1 if the page's
 * file space cannot usefully be compacted.
 */
static int __compact_rewrite(WT_SESSION_IMPL* session, WT_REF* ref, int* skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	size_t addr_size;
	const uint8_t *addr;

	*skipp = 1;					/* Default skip. */

	bm = S2BT(session)->bm;
	page = ref->page;
	mod = page->modify;

	/* The root page cannot be compacted. */
	if (__wt_ref_is_root(ref))
		return (0);

	/* ref references a dirty page; don't compact it. */
	if (__wt_page_is_modified(page))
		return (0);

	/*
	 * If the page is clean (no reconciliation result), check directly
	 * whether its block space can be compacted.
	 */
	if (mod == NULL || F_ISSET(mod, WT_PM_REC_MASK) == 0) {
		WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
		if (addr == NULL)
			return (0);
		WT_RET(bm->compact_page_skip(bm, session, addr, addr_size, skipp));
	}
	else if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_REPLACE) { /* Page was replaced; check the replacement blocks. */
		WT_PAGE_LOCK(session, page);
		ret = bm->compact_page_skip(bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp);
		WT_PAGE_UNLOCK(session, page);
		WT_RET(ret);
	}

	return (0);
}
Example #20
/*
 * __wt_compact_page_skip --
 *	Check, as we're about to read the page referenced by ref, whether
 * compaction requires we read it.
 */
int __wt_compact_page_skip(WT_SESSION_IMPL* session, WT_REF* ref, int* skipp)
{
	WT_BM *bm;
	size_t addr_size;
	u_int type;
	const uint8_t *addr;

	*skipp = 0;				/* Default to reading. */
	type = 0;				/* Keep compiler quiet. */

	bm = S2BT(session)->bm;

	/*
	* We aren't holding a hazard pointer, so we can't look at the page
	* itself, all we can look at is the WT_REF information.  If there's no
	* address, the page isn't on disk, but we have to read internal pages
	* to walk the tree regardless; throw up our hands and read it.
	*/
	WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, &type));
	if (addr == NULL)
		return (0);

	return (type == WT_CELL_ADDR_INT ? 0 : bm->compact_page_skip(bm, session, addr, addr_size, skipp));
}
Example #21
/*
 * __wt_bt_write --
 *	Write a buffer into a block, returning the block's addr/size and
 * checksum.
 */
int
__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
    uint8_t *addr, size_t *addr_sizep, bool checkpoint, bool compressed)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_ITEM *ip;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE_HEADER *dsk;
	size_t dst_len, len, result_len, size, src_len;
	int compression_failed;		/* Extension API, so not a bool. */
	uint8_t *dst, *src;
	bool data_cksum;

	btree = S2BT(session);
	bm = btree->bm;

	/* Checkpoint calls are different than standard calls. */
	WT_ASSERT(session,
	    (!checkpoint && addr != NULL && addr_sizep != NULL) ||
	    (checkpoint && addr == NULL && addr_sizep == NULL));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * We're passed a table's disk image.  Decompress if necessary and
	 * verify the image.  Always check the in-memory length for accuracy.
	 */
	dsk = buf->mem;
	if (compressed) {
		WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &tmp));

		memcpy(tmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP,
		    buf->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
		    tmp->memsize - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));
		WT_ASSERT(session,
		    dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP);
		tmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
		ip = tmp;
	} else {
		WT_ASSERT(session, dsk->mem_size == buf->size);
		ip = buf;
	}
	WT_ERR(__wt_verify_dsk(session, "[write-check]", ip));
	__wt_scr_free(session, &tmp);
#endif

	/*
	 * Optionally stream-compress the data, but don't compress blocks that
	 * are already as small as they're going to get.
	 */
	if (btree->compressor == NULL ||
	    btree->compressor->compress == NULL || compressed)
		ip = buf;
	else if (buf->size <= btree->allocsize) {
		ip = buf;
		WT_STAT_FAST_DATA_INCR(session, compress_write_too_small);
	} else {
		/* Skip the header bytes of the source data. */
		src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
		src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;

		/*
		 * Compute the size needed for the destination buffer.  We only
		 * allocate enough memory for a copy of the original by default,
		 * if any compressed version is bigger than the original, we
		 * won't use it.  However, some compression engines (snappy is
		 * one example), may need more memory because they don't stop
		 * just because there's no more memory into which to compress.
		 */
		if (btree->compressor->pre_size == NULL)
			len = src_len;
		else
			WT_ERR(btree->compressor->pre_size(btree->compressor,
			    &session->iface, src, src_len, &len));

		size = len + WT_BLOCK_COMPRESS_SKIP;
		WT_ERR(bm->write_size(bm, session, &size));
		WT_ERR(__wt_scr_alloc(session, size, &tmp));

		/* Skip the header bytes of the destination data. */
		dst = (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP;
		dst_len = len;

		compression_failed = 0;
		WT_ERR(btree->compressor->compress(btree->compressor,
		    &session->iface,
		    src, src_len,
		    dst, dst_len,
		    &result_len, &compression_failed));
		result_len += WT_BLOCK_COMPRESS_SKIP;

		/*
		 * If compression fails, or doesn't gain us at least one unit of
		 * allocation, fallback to the original version.  This isn't
		 * unexpected: if compression doesn't work for some chunk of
		 * data for some reason (noting likely additional format/header
		 * information which compressed output requires), it just means
		 * the uncompressed version is as good as it gets, and that's
		 * what we use.
		 */
		if (compression_failed ||
		    buf->size / btree->allocsize <=
		    result_len / btree->allocsize) {
			ip = buf;
			WT_STAT_FAST_DATA_INCR(session, compress_write_fail);
		} else {
			compressed = true;
			WT_STAT_FAST_DATA_INCR(session, compress_write);

			/*
			 * Copy in the skipped header bytes, set the final data
			 * size.
			 */
			memcpy(tmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
			tmp->size = result_len;
			ip = tmp;
		}
	}
	dsk = ip->mem;

	/* If the buffer is compressed, set the flag. */
	if (compressed)
		F_SET(dsk, WT_PAGE_COMPRESSED);

	/*
	 * We increment the block's write generation so it's easy to identify
	 * newer versions of blocks during salvage.  (It's common in WiredTiger,
	 * at least for the default block manager, for multiple blocks to be
	 * internally consistent with identical first and last keys, so we need
	 * a way to know the most recent state of the block.  We could check
	 * which leaf is referenced by a valid internal page, but that implies
	 * salvaging internal pages, which I don't want to do, and it's not
	 * as good anyway, because the internal page may not have been written
	 * after the leaf page was updated.  So, write generations it is.
	 *
	 * Nothing is locked at this point but two versions of a page with the
	 * same generation is pretty unlikely, and if we did, they're going to
	 * be roughly identical for the purposes of salvage, anyway.
	 */
	dsk->write_gen = ++btree->write_gen;

	/*
	 * Checksum the data if the buffer isn't compressed or checksums are
	 * configured.
	 */
	switch (btree->checksum) {
	case CKSUM_ON:
		data_cksum = true;
		break;
	case CKSUM_OFF:
		data_cksum = false;
		break;
	case CKSUM_UNCOMPRESSED:
	default:
		data_cksum = !compressed;
		break;
	}

	/* Call the block manager to write the block. */
	WT_ERR(checkpoint ?
	    bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) :
	    bm->write(bm, session, ip, addr, addr_sizep, data_cksum));

	WT_STAT_FAST_CONN_INCR(session, cache_write);
	WT_STAT_FAST_DATA_INCR(session, cache_write);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, dsk->mem_size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, dsk->mem_size);

err:	__wt_scr_free(session, &tmp);
	return (ret);
}
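The compression and checksum behavior above is selected per table. A sketch, assuming the snappy extension is installed at the placeholder path; "checksum=uncompressed" selects the CKSUM_UNCOMPRESSED case above:

#include <wiredtiger.h>

/*
 * open_compressed --
 *	Sketch: load a block compressor at open time, then create a table
 * that compresses blocks and checksums only uncompressed ones.
 */
static int
open_compressed(WT_CONNECTION **connp, WT_SESSION **sessionp)
{
	WT_CONNECTION *conn;
	WT_SESSION *session;
	int ret;

	if ((ret = wiredtiger_open("WT_HOME", NULL,
	    "create,extensions=[/usr/local/lib/libwiredtiger_snappy.so]",
	    &conn)) != 0)
		return (ret);
	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
		(void)conn->close(conn, NULL);
		return (ret);
	}
	if ((ret = session->create(session, "table:example",
	    "key_format=S,value_format=S,"
	    "block_compressor=snappy,checksum=uncompressed")) != 0) {
		(void)conn->close(conn, NULL);
		return (ret);
	}
	*connp = conn;
	*sessionp = session;
	return (0);
}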
Example #22
/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(etmp);
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_ENCRYPTOR *encryptor;
	WT_ITEM *ip;
	const WT_PAGE_HEADER *dsk;
	const char *fail_msg;
	size_t result_len;

	btree = S2BT(session);
	bm = btree->bm;
	fail_msg = NULL;			/* -Wuninitialized */

	/*
	 * If anticipating a compressed or encrypted block, read into a scratch
	 * buffer and decompress into the caller's buffer.  Else, read directly
	 * into the caller's buffer.
	 */
	if (btree->compressor == NULL && btree->kencryptor == NULL) {
		WT_RET(bm->read(bm, session, buf, addr, addr_size));
		dsk = buf->data;
		ip = NULL;
	} else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
		dsk = tmp->data;
		ip = tmp;
	}

	/*
	 * If the block is encrypted, copy the skipped bytes of the original
	 * image into place, then decrypt.
	 */
	if (F_ISSET(dsk, WT_PAGE_ENCRYPTED)) {
		if (btree->kencryptor == NULL ||
		    (encryptor = btree->kencryptor->encryptor) == NULL ||
		    encryptor->decrypt == NULL) {
			fail_msg =
			    "encrypted block in file for which no encryption "
			    "configured";
			goto corrupt;
		}

		WT_ERR(__wt_scr_alloc(session, 0, &etmp));
		if ((ret = __wt_decrypt(session,
		    encryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp)) != 0) {
			fail_msg = "block decryption failed";
			goto corrupt;
		}

		ip = etmp;
		dsk = ip->data;
	} else if (btree->kencryptor != NULL) {
		fail_msg =
		    "unencrypted block in file for which encryption configured";
		goto corrupt;
	}

	if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
		if (btree->compressor == NULL ||
		    btree->compressor->decompress == NULL) {
			fail_msg =
			    "compressed block in file for which no compression "
			    "configured";
			goto corrupt;
		}

		/*
		 * Size the buffer based on the in-memory bytes we're expecting
		 * from decompression.
		 */
		WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size));

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 */
		memcpy(buf->mem, ip->data, WT_BLOCK_COMPRESS_SKIP);
		ret = btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)ip->data + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len);

		/*
		 * If checksums were turned off because we're depending on the
		 * decompression to fail on any corrupted data, we'll end up
		 * here after corruption happens.  If we're salvaging the file,
		 * it's OK, otherwise it's really, really bad.
		 */
		if (ret != 0 ||
		    result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) {
			fail_msg = "block decryption failed";
			goto corrupt;
		}
	} else
		/*
		 * If we uncompressed above, the page is in the correct buffer.
		 * If we get here the data may be in the wrong buffer and the
		 * buffer may be the wrong size.  If needed, get the page
		 * into the destination buffer.
		 */
		if (ip != NULL)
			WT_ERR(__wt_buf_set(
			    session, buf, ip->data, dsk->mem_size));

	/* If the handle is a verify handle, verify the physical page. */
	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
		if (tmp == NULL)
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
		WT_ERR(__wt_verify_dsk(session, tmp->data, buf));
	}

	WT_STAT_FAST_CONN_INCR(session, cache_read);
	WT_STAT_FAST_DATA_INCR(session, cache_read);
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
		WT_STAT_FAST_DATA_INCR(session, compress_read);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size);

	if (0) {
corrupt:	if (ret == 0)
			ret = WT_ERROR;
		if (!F_ISSET(btree, WT_BTREE_VERIFY) &&
		    !F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) {
			__wt_err(session, ret, "%s", fail_msg);
			ret = __wt_illegal_value(session, btree->dhandle->name);
		}
	}

err:	__wt_scr_free(session, &tmp);
	__wt_scr_free(session, &etmp);
	return (ret);
}
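The decryption path above is enabled by connection configuration. A sketch using the demonstration "rotn" encryptor shipped in the WiredTiger source tree; the extension path and keyid are placeholders:

#include <wiredtiger.h>

/*
 * open_encrypted --
 *	Sketch: load an encryptor and encrypt the database; blocks then carry
 * WT_PAGE_ENCRYPTED and take the decrypt path in __wt_bt_read.
 */
static int
open_encrypted(WT_CONNECTION **connp)
{
	return (wiredtiger_open("WT_HOME", NULL,
	    "create,"
	    "extensions=[/usr/local/lib/libwiredtiger_rotn.so],"
	    "encryption=(name=rotn,keyid=13)",
	    connp));
}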
Example #23
/*
 * __verify_tree --
 *	Verify a tree, recursively descending through it in depth-first fashion.
 * The page argument was physically verified (so we know it's correctly formed),
 * and the in-memory version built.  Our job is to check logical relationships
 * in the page and in the tree.
 */
static int
__verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
{
	WT_BM *bm;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_COL *cip;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_REF *child_ref;
	uint64_t recno;
	uint32_t entry, i;
	bool found;

	bm = S2BT(session)->bm;
	page = ref->page;

	unpack = &_unpack;
	WT_CLEAR(*unpack);	/* -Wuninitialized */

	WT_RET(__wt_verbose(session, WT_VERB_VERIFY, "%s %s",
	    __wt_page_addr_string(session, ref, vs->tmp1),
	    __wt_page_type_string(page->type)));

	/* Optionally dump the address. */
	if (vs->dump_address)
		WT_RET(__wt_msg(session, "%s %s",
		    __wt_page_addr_string(session, ref, vs->tmp1),
		    __wt_page_type_string(page->type)));

	/* Track the shape of the tree. */
	if (WT_PAGE_IS_INTERNAL(page))
		++vs->depth_internal[
		    WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)];
	else
		++vs->depth_leaf[
		    WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)];

	/*
	 * The page's physical structure was verified when it was read into
	 * memory by the read server thread, and then the in-memory version
	 * of the page was built. Now we make sure the page and tree are
	 * logically consistent.
	 *
	 * !!!
	 * The problem: (1) the read server has to build the in-memory version
	 * of the page because the read server is the thread that flags when
	 * any thread can access the page in the tree; (2) we can't build the
	 * in-memory version of the page until the physical structure is known
	 * to be OK, so the read server has to verify at least the physical
	 * structure of the page; (3) doing complete page verification requires
	 * reading additional pages (for example, overflow keys imply reading
	 * overflow pages in order to test the key's order in the page); (4)
	 * the read server cannot read additional pages because it will hang
	 * waiting on itself.  For this reason, we split page verification
	 * into a physical verification, which allows the in-memory version
	 * of the page to be built, and then a subsequent logical verification
	 * which happens here.
	 *
	 * Report progress occasionally.
	 */
#define	WT_VERIFY_PROGRESS_INTERVAL	100
	if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0)
		WT_RET(__wt_progress(session, NULL, vs->fcnt));

#ifdef HAVE_DIAGNOSTIC
	/* Optionally dump the blocks or page in debugging mode. */
	if (vs->dump_blocks)
		WT_RET(__wt_debug_disk(session, page->dsk, NULL));
	if (vs->dump_pages)
		WT_RET(__wt_debug_page(session, page, NULL));
#endif

	/*
	 * Column-store key order checks: check the page's record number and
	 * then update the total record count.
	 */
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		recno = page->pg_fix_recno;
		goto recno_chk;
	case WT_PAGE_COL_INT:
		recno = page->pg_intl_recno;
		goto recno_chk;
	case WT_PAGE_COL_VAR:
		recno = page->pg_var_recno;
recno_chk:	if (recno != vs->record_total + 1)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s has a starting record of %" PRIu64
			    " when the expected starting record is %" PRIu64,
			    __wt_page_addr_string(session, ref, vs->tmp1),
			    recno, vs->record_total + 1);
		break;
	}
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		vs->record_total += page->pg_fix_entries;
		break;
	case WT_PAGE_COL_VAR:
		recno = 0;
		WT_COL_FOREACH(page, cip, i)
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				++recno;
			else {
				__wt_cell_unpack(cell, unpack);
				recno += __wt_cell_rle(unpack);
			}
		vs->record_total += recno;
		break;
	}

	/*
	 * Row-store leaf page key order check: it's a depth-first traversal,
	 * the first key on this page should be larger than any key previously
	 * seen.
	 */
	switch (page->type) {
	case WT_PAGE_ROW_LEAF:
		WT_RET(__verify_row_leaf_key_order(session, ref, vs));
		break;
	}

	/* If it's not the root page, unpack the parent cell. */
	if (!__wt_ref_is_root(ref)) {
		__wt_cell_unpack(ref->addr, unpack);

		/* Compare the parent cell against the page type. */
		switch (page->type) {
		case WT_PAGE_COL_FIX:
			if (unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_COL_VAR:
			if (unpack->raw != WT_CELL_ADDR_LEAF &&
			    unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_ROW_LEAF:
			if (unpack->raw != WT_CELL_ADDR_DEL &&
			    unpack->raw != WT_CELL_ADDR_LEAF &&
			    unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_COL_INT:
		case WT_PAGE_ROW_INT:
			if (unpack->raw != WT_CELL_ADDR_INT)
celltype_err:			WT_RET_MSG(session, WT_ERROR,
				    "page at %s, of type %s, is referenced in "
				    "its parent by a cell of type %s",
				    __wt_page_addr_string(
					session, ref, vs->tmp1),
				    __wt_page_type_string(page->type),
				    __wt_cell_type_string(unpack->raw));
			break;
		}
	}

	/*
	 * Check overflow pages.  We check overflow cells separately from other
	 * tests that walk the page as it's simpler, and I don't care much how
	 * fast table verify runs.
	 */
	switch (page->type) {
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		WT_RET(__verify_overflow_cell(session, ref, &found, vs));
		if (__wt_ref_is_root(ref) || page->type == WT_PAGE_ROW_INT)
			break;

		/*
		 * Object if a leaf-no-overflow address cell references a page
		 * with overflow keys, but don't object if a leaf address cell
		 * references a page without overflow keys.  Reconciliation
		 * doesn't guarantee every leaf page without overflow items will
		 * be a leaf-no-overflow type.
		 */
		if (found && unpack->raw == WT_CELL_ADDR_LEAF_NO)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s, of type %s and referenced in its "
			    "parent by a cell of type %s, contains overflow "
			    "items",
			    __wt_page_addr_string(session, ref, vs->tmp1),
			    __wt_page_type_string(page->type),
			    __wt_cell_type_string(WT_CELL_ADDR_LEAF_NO));
		break;
	}

	/* Check tree connections and recursively descend the tree. */
	switch (page->type) {
	case WT_PAGE_COL_INT:
		/* For each entry in an internal page, verify the subtree. */
		entry = 0;
		WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
			/*
			 * It's a depth-first traversal: this entry's starting
			 * record number should be 1 more than the total records
			 * reviewed to this point.
			 */
			++entry;
			if (child_ref->key.recno != vs->record_total + 1) {
				WT_RET_MSG(session, WT_ERROR,
				    "the starting record number in entry %"
				    PRIu32 " of the column internal page at "
				    "%s is %" PRIu64 " and the expected "
				    "starting record number is %" PRIu64,
				    entry,
				    __wt_page_addr_string(
				    session, child_ref, vs->tmp1),
				    child_ref->key.recno,
				    vs->record_total + 1);
			}

			/* Verify the subtree. */
			++vs->depth;
			WT_RET(__wt_page_in(session, child_ref, 0));
			ret = __verify_tree(session, child_ref, vs);
			WT_TRET(__wt_page_release(session, child_ref, 0));
			--vs->depth;
			WT_RET(ret);

			__wt_cell_unpack(child_ref->addr, unpack);
			WT_RET(bm->verify_addr(
			    bm, session, unpack->data, unpack->size));
		} WT_INTL_FOREACH_END;
		break;
	case WT_PAGE_ROW_INT:
		/* For each entry in an internal page, verify the subtree. */
		entry = 0;
		WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
			/*
			 * It's a depth-first traversal: this entry's starting
			 * key should be larger than the largest key previously
			 * reviewed.
			 *
			 * The 0th key of any internal page is magic, and we
			 * can't test against it.
			 */
			++entry;
			if (entry != 1)
				WT_RET(__verify_row_int_key_order(
				    session, page, child_ref, entry, vs));

			/* Verify the subtree. */
			++vs->depth;
			WT_RET(__wt_page_in(session, child_ref, 0));
			ret = __verify_tree(session, child_ref, vs);
			WT_TRET(__wt_page_release(session, child_ref, 0));
			--vs->depth;
			WT_RET(ret);

			__wt_cell_unpack(child_ref->addr, unpack);
			WT_RET(bm->verify_addr(
			    bm, session, unpack->data, unpack->size));
		} WT_INTL_FOREACH_END;
Example #24
/*
 * __wt_verify --
 *	Verify a file.
 */
int
__wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CKPT *ckptbase, *ckpt;
	WT_DECL_RET;
	WT_VSTUFF *vs, _vstuff;
	size_t root_addr_size;
	uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
	bool bm_start, quit;

	btree = S2BT(session);
	bm = btree->bm;
	ckptbase = NULL;
	bm_start = false;

	WT_CLEAR(_vstuff);
	vs = &_vstuff;
	WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp3));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp4));

	/* Check configuration strings. */
	WT_ERR(__verify_config(session, cfg, vs));

	/* Optionally dump specific block offsets. */
	WT_ERR(__verify_config_offsets(session, cfg, &quit));
	if (quit)
		goto done;

	/* Get a list of the checkpoints for this file. */
	WT_ERR(
	    __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase));

	/* Inform the underlying block manager we're verifying. */
	WT_ERR(bm->verify_start(bm, session, ckptbase, cfg));
	bm_start = true;

	/* Loop through the file's checkpoints, verifying each one. */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		WT_ERR(__wt_verbose(session, WT_VERB_VERIFY,
		    "%s: checkpoint %s", btree->dhandle->name, ckpt->name));

		/* Fake checkpoints require no work. */
		if (F_ISSET(ckpt, WT_CKPT_FAKE))
			continue;

		/* House-keeping between checkpoints. */
		__verify_checkpoint_reset(vs);

		if (WT_VRFY_DUMP(vs))
			WT_ERR(__wt_msg(session, "%s: checkpoint %s",
			    btree->dhandle->name, ckpt->name));

		/* Load the checkpoint. */
		WT_ERR(bm->checkpoint_load(bm, session,
		    ckpt->raw.data, ckpt->raw.size,
		    root_addr, &root_addr_size, true));

		/*
		 * Ignore trees with no root page.
		 * Verify, then discard the checkpoint from the cache.
		 */
		if (root_addr_size != 0 &&
		    (ret = __wt_btree_tree_open(
		    session, root_addr, root_addr_size)) == 0) {
			if (WT_VRFY_DUMP(vs))
				WT_ERR(__wt_msg(session, "Root: %s %s",
				    __wt_addr_string(session,
				    root_addr, root_addr_size, vs->tmp1),
				    __wt_page_type_string(
				    btree->root.page->type)));

			WT_WITH_PAGE_INDEX(session,
			    ret = __verify_tree(session, &btree->root, vs));

			WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD));
		}

		/* Unload the checkpoint. */
		WT_TRET(bm->checkpoint_unload(bm, session));
		WT_ERR(ret);

		/* Display the tree shape. */
		if (vs->dump_shape)
			WT_ERR(__verify_tree_shape(session, vs));
	}
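A sketch of the public entry point that drives __wt_verify; the dump_* configuration keys mirror the vs->dump_* flags above (block and page dumps require a diagnostic build), and the URI is a placeholder:

#include <wiredtiger.h>

/*
 * verify_table --
 *	Sketch: verify a table, dumping page addresses as the tree is walked.
 */
static int
verify_table(WT_SESSION *session)
{
	return (session->verify(session, "table:example", "dump_address"));
}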
Example #25
/*
 * __ovfl_reuse_wrapup --
 *	Resolve the page's overflow reuse list after a page is written.
 */
static int
__ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BM *bm;
	WT_OVFL_REUSE **e, **head, *reuse;
	size_t decr;
	int i;

	bm = S2BT(session)->bm;
	head = page->modify->ovfl_track->ovfl_reuse;

	/*
	 * Discard any overflow records that aren't in-use, freeing underlying
	 * blocks.
	 *
	 * First, walk the overflow reuse lists (except for the lowest one),
	 * fixing up skiplist links.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
		for (e = &head[i]; (reuse = *e) != NULL;) {
			if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
				e = &reuse->next[i];
				continue;
			}
			*e = reuse->next[i];
		}

	/*
	 * Second, discard any overflow record without an in-use flag, clear
	 * the flags for the next run.
	 *
	 * As part of the pass through the lowest level, figure out how much
	 * space we added/subtracted from the page, and update its footprint.
	 * We don't get it exactly correct because we don't know the depth of
	 * the skiplist here, but it's close enough, and figuring out the
	 * memory footprint change in the reconciliation wrapup code means
	 * fewer atomic updates and less code overall.
	 */
	decr = 0;
	for (e = &head[0]; (reuse = *e) != NULL;) {
		if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
			F_CLR(reuse,
			    WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED);
			e = &reuse->next[0];
			continue;
		}
		*e = reuse->next[0];

		WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED));

		if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
			WT_RET(
			    __ovfl_reuse_verbose(session, page, reuse, "free"));

		WT_RET(bm->free(
		    bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
		decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE);
		__wt_free(session, reuse);
	}

	if (decr != 0)
		__wt_cache_page_inmem_decr(session, page, decr);
	return (0);
}
Example #26
/*
 * __wt_compact --
 *	Compact a file.
 */
int
__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_BM *bm;
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_REF *ref;
    bool block_manager_begin, skip;

    WT_UNUSED(cfg);

    btree = S2BT(session);
    bm = btree->bm;
    ref = NULL;
    block_manager_begin = false;

    WT_STAT_FAST_DATA_INCR(session, session_compact);

    /*
     * Check if compaction might be useful -- the API layer will quit trying
     * to compact the data source if we make no progress, set a flag if the
     * block layer thinks compaction is possible.
     */
    WT_RET(bm->compact_skip(bm, session, &skip));
    if (skip)
        return (0);

    /*
     * Reviewing in-memory pages requires looking at page reconciliation
     * results, because we care about where the page is stored now, not
     * where the page was stored when we first read it into the cache.
     * We need to ensure we don't race with page reconciliation as it's
     * writing the page modify information.
     *
     * There are three ways we call reconciliation: checkpoints, threads
     * writing leaf pages (usually in preparation for a checkpoint or if
     * closing a file), and eviction.
     *
     * We're holding the schema lock which serializes with checkpoints.
     */
    WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));

    /*
     * Get the tree handle's flush lock which blocks threads writing leaf
     * pages.
     */
    __wt_spin_lock(session, &btree->flush_lock);

    /* Start compaction. */
    WT_ERR(bm->compact_start(bm, session));
    block_manager_begin = true;

    /* Walk the tree reviewing pages to see if they should be re-written. */
    for (;;) {
        /*
         * Pages read for compaction aren't "useful"; don't update the
         * read generation of pages already in memory, and if a page is
         * read, set its generation to a low value so it is evicted
         * quickly.
         */
        WT_ERR(__wt_tree_walk(session, &ref,
                              WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
        if (ref == NULL)
            break;

        WT_ERR(__compact_rewrite(session, ref, &skip));
        if (skip)
            continue;

        session->compact_state = WT_COMPACT_SUCCESS;

        /* Rewrite the page: mark the page and tree dirty. */
        WT_ERR(__wt_page_modify_init(session, ref->page));
        __wt_page_modify_set(session, ref->page);

        WT_STAT_FAST_DATA_INCR(session, btree_compact_rewrite);
    }

err:
    if (ref != NULL)
        WT_TRET(__wt_page_release(session, ref, 0));

    if (block_manager_begin)
        WT_TRET(bm->compact_end(bm, session));

    /* Unblock threads writing leaf pages. */
    __wt_spin_unlock(session, &btree->flush_lock);

    return (ret);
}
Example #27
/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	const WT_PAGE_HEADER *dsk;
	size_t result_len;

	btree = S2BT(session);
	bm = btree->bm;

	/*
	 * If anticipating a compressed block, read into a scratch buffer and
	 * decompress into the caller's buffer.  Else, read directly into the
	 * caller's buffer.
	 */
	if (btree->compressor == NULL) {
		WT_RET(bm->read(bm, session, buf, addr, addr_size));
		dsk = buf->data;
	} else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
		dsk = tmp->data;
	}

	/*
	 * If the block is compressed, copy the skipped bytes of the original
	 * image into place, then decompress.
	 */
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
		if (btree->compressor == NULL ||
		    btree->compressor->decompress == NULL)
			WT_ERR_MSG(session, WT_ERROR,
			    "read compressed block where no compression engine "
			    "configured");

		/*
		 * We're allocating the exact number of bytes we're expecting
		 * from decompression.
		 */
		WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size));

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 */
		memcpy(buf->mem, tmp->data, WT_BLOCK_COMPRESS_SKIP);
		ret = btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len);

		/*
		 * If checksums were turned off because we're depending on the
		 * decompression to fail on any corrupted data, we'll end up
		 * here after corruption happens.  If we're salvaging the file,
		 * it's OK, otherwise it's really, really bad.
		 */
		if (ret != 0 ||
		    result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
			WT_ERR(
			    F_ISSET(btree, WT_BTREE_VERIFY) ||
			    F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
			    WT_ERROR :
			    __wt_illegal_value(session, btree->dhandle->name));
	} else
		if (btree->compressor == NULL)
			buf->size = dsk->mem_size;
		else
			/*
			 * We guessed wrong: there was a compressor, but this
			 * block was not compressed, and now the page is in the
			 * wrong buffer and the buffer may be of the wrong size.
			 * This should be rare, but happens with small blocks
			 * that aren't worth compressing.
			 */
			WT_ERR(__wt_buf_set(
			    session, buf, tmp->data, dsk->mem_size));

	/* If the handle is a verify handle, verify the physical page. */
	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
		if (tmp == NULL)
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
		WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
	}

	WT_STAT_FAST_CONN_INCR(session, cache_read);
	WT_STAT_FAST_DATA_INCR(session, cache_read);
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
		WT_STAT_FAST_DATA_INCR(session, compress_read);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size);

err:	__wt_scr_free(session, &tmp);
	return (ret);
}
Example #28
/*
 * __compact_rewrite --
 *	Return if a page needs to be re-written.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_BM *bm;
    WT_DECL_RET;
    WT_MULTI *multi;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    size_t addr_size;
    uint32_t i;
    const uint8_t *addr;

    *skipp = true;					/* Default skip. */

    bm = S2BT(session)->bm;
    page = ref->page;
    mod = page->modify;

    /*
     * Ignore the root: it may not have a replacement address, and besides,
     * if anything else gets written, so will it.
     */
    if (__wt_ref_is_root(ref))
        return (0);

    /* Ignore currently dirty pages, they will be written regardless. */
    if (__wt_page_is_modified(page))
        return (0);

    /*
     * If the page is clean, test the original addresses.
     * If the page is a replacement, test the replacement addresses.
     * Ignore empty pages, they get merged into the parent.
     */
    if (mod == NULL || mod->rec_result == 0) {
        __wt_ref_info(ref, &addr, &addr_size, NULL);
        if (addr == NULL)
            return (0);
        return (
                   bm->compact_page_skip(bm, session, addr, addr_size, skipp));
    }

    /*
     * The page's modification information can change underfoot if the page
     * is being reconciled, serialize with reconciliation.
     */
    if (mod->rec_result == WT_PM_REC_REPLACE ||
            mod->rec_result == WT_PM_REC_MULTIBLOCK)
        WT_RET(__wt_fair_lock(session, &page->page_lock));

    if (mod->rec_result == WT_PM_REC_REPLACE)
        ret = bm->compact_page_skip(bm, session,
                                    mod->mod_replace.addr, mod->mod_replace.size, skipp);

    if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
        for (multi = mod->mod_multi,
                i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
            if (multi->disk_image != NULL)
                continue;
            if ((ret = bm->compact_page_skip(bm, session,
                                             multi->addr.addr, multi->addr.size, skipp)) != 0)
                break;
            if (!*skipp)
                break;
        }

    if (mod->rec_result == WT_PM_REC_REPLACE ||
            mod->rec_result == WT_PM_REC_MULTIBLOCK)
        WT_TRET(__wt_fair_unlock(session, &page->page_lock));

    return (ret);
}
Example #29
/*
 * __verify_dsk_row --
 *	Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it.
 */
static int
__verify_dsk_row(
    WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_DECL_ITEM(current);
	WT_DECL_ITEM(last_ovfl);
	WT_DECL_ITEM(last_pfx);
	WT_DECL_RET;
	WT_ITEM *last;
	enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type;
	void *huffman;
	uint32_t cell_num, cell_type, i, key_cnt, prefix;
	uint8_t *end;
	int cmp;

	btree = S2BT(session);
	bm = btree->bm;
	unpack = &_unpack;
	huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key;

	WT_ERR(__wt_scr_alloc(session, 0, &current));
	WT_ERR(__wt_scr_alloc(session, 0, &last_pfx));
	WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl));
	last = last_ovfl;

	end = (uint8_t *)dsk + dsk->mem_size;

	last_cell_type = FIRST;
	cell_num = 0;
	key_cnt = 0;
	WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
		++cell_num;

		/* Carefully unpack the cell. */
		if (__wt_cell_unpack_safe(cell, unpack, end) != 0) {
			ret = __err_cell_corrupted(session, cell_num, tag);
			goto err;
		}

		/* Check the raw and collapsed cell types. */
		WT_ERR(__err_cell_type(
		    session, cell_num, tag, unpack->raw, dsk->type));
		WT_ERR(__err_cell_type(
		    session, cell_num, tag, unpack->type, dsk->type));
		cell_type = unpack->type;

		/*
		 * Check ordering relationships between the WT_CELL entries.
		 * For row-store internal pages, check for:
		 *	two values in a row,
		 *	two keys in a row,
		 *	a value as the first cell on a page.
		 * For row-store leaf pages, check for:
		 *	two values in a row,
		 *	a value as the first cell on a page.
		 */
		switch (cell_type) {
		case WT_CELL_KEY:
		case WT_CELL_KEY_OVFL:
			++key_cnt;
			switch (last_cell_type) {
			case FIRST:
			case WAS_VALUE:
				break;
			case WAS_KEY:
				if (dsk->type == WT_PAGE_ROW_LEAF)
					break;
				WT_ERR_VRFY(session,
				    "cell %" PRIu32 " on page at %s is the "
				    "first of two adjacent keys",
				    cell_num - 1, tag);
			}
			last_cell_type = WAS_KEY;
			break;
		case WT_CELL_ADDR_DEL:
		case WT_CELL_ADDR_INT:
		case WT_CELL_ADDR_LEAF:
		case WT_CELL_ADDR_LEAF_NO:
		case WT_CELL_VALUE:
		case WT_CELL_VALUE_OVFL:
			switch (last_cell_type) {
			case FIRST:
				WT_ERR_VRFY(session,
				    "page at %s begins with a value", tag);
			case WAS_KEY:
				break;
			case WAS_VALUE:
				WT_ERR_VRFY(session,
				    "cell %" PRIu32 " on page at %s is the "
				    "first of two adjacent values",
				    cell_num - 1, tag);
			}
			last_cell_type = WAS_VALUE;
			break;
		}

		/* Check if any referenced item has a valid address. */
		switch (cell_type) {
		case WT_CELL_ADDR_DEL:
		case WT_CELL_ADDR_INT:
		case WT_CELL_ADDR_LEAF:
		case WT_CELL_ADDR_LEAF_NO:
		case WT_CELL_KEY_OVFL:
		case WT_CELL_VALUE_OVFL:
			if (!bm->addr_valid(bm,
			    session, unpack->data, unpack->size))
				goto eof;
			break;
		}

		/*
		 * Remaining checks are for key order and prefix compression.
		 * If this cell isn't a key, we're done, move to the next cell.
		 * If this cell is an overflow item, instantiate the key and
		 * compare it with the last key. Otherwise, we have to deal with
		 * prefix compression.
		 */
		switch (cell_type) {
		case WT_CELL_KEY:
			break;
		case WT_CELL_KEY_OVFL:
			WT_ERR(__wt_dsk_cell_data_ref(
			    session, dsk->type, unpack, current));
			goto key_compare;
		default:
			/* Not a key -- continue with the next cell. */
			continue;
		}

		/*
		 * Prefix compression checks.
		 *
		 * Confirm the first non-overflow key on a page has a zero
		 * prefix compression count.
		 */
		prefix = unpack->prefix;
		if (last_pfx->size == 0 && prefix != 0)
			WT_ERR_VRFY(session,
			    "the %" PRIu32 " key on page at %s is the first "
			    "non-overflow key on the page and has a non-zero "
			    "prefix compression value",
			    cell_num, tag);

		/* Confirm the prefix compression count is possible. */
		if (cell_num > 1 && prefix > last->size)
			WT_ERR_VRFY(session,
			    "key %" PRIu32 " on page at %s has a prefix "
			    "compression count of %" PRIu32 ", larger than "
			    "the length of the previous key, %" WT_SIZET_FMT,
			    cell_num, tag, prefix, last->size);

		/*
		 * If Huffman decoding required, unpack the cell to build the
		 * key, then resolve the prefix.  Else, we can do it faster
		 * internally because we don't have to shuffle memory around as
		 * much.
		 */
		if (huffman != NULL) {
			WT_ERR(__wt_dsk_cell_data_ref(
			    session, dsk->type, unpack, current));

			/*
			 * If there's a prefix, make sure there's enough buffer
			 * space, then shift the decoded data past the prefix
			 * and copy the prefix into place.  Take care with the
			 * pointers: current->data may be pointing inside the
			 * buffer.
			 */
			if (prefix != 0) {
				WT_ERR(__wt_buf_grow(
				    session, current, prefix + current->size));
				memmove((uint8_t *)current->mem + prefix,
				    current->data, current->size);
				memcpy(current->mem, last->data, prefix);
				current->data = current->mem;
				current->size += prefix;
			}
		} else {
			/*
			 * Get the cell's data/length and make sure we have
			 * enough buffer space.
			 */
			WT_ERR(__wt_buf_init(
			    session, current, prefix + unpack->size));

			/* Copy the prefix then the data into place. */
			if (prefix != 0)
				memcpy(current->mem, last->data, prefix);
			memcpy((uint8_t *)current->mem + prefix, unpack->data,
			    unpack->size);
			current->size = prefix + unpack->size;
		}

key_compare:	/*
		 * Compare the current key against the last key.
		 *
		 * Be careful about the 0th key on internal pages: we only store
		 * the first byte and custom collators may not be able to handle
		 * truncated keys.
		 */
		if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) ||
		    (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) {
			WT_ERR(__wt_compare(
			    session, btree->collator, last, current, &cmp));
			if (cmp >= 0)
				WT_ERR_VRFY(session,
				    "the %" PRIu32 " and %" PRIu32 " keys on "
				    "page at %s are incorrectly sorted",
				    cell_num - 2, cell_num, tag);
		}

		/*
		 * Swap the buffers: last always references the last key entry,
		 * last_pfx and last_ovfl reference the last prefix-compressed
		 * and last overflow key entries.  Current gets pointed to the
		 * buffer we're not using this time around, which is where the
		 * next key goes.
		 */
		last = current;
		if (cell_type == WT_CELL_KEY) {
			current = last_pfx;
			last_pfx = last;
		} else {
			current = last_ovfl;
			last_ovfl = last;
		}
		WT_ASSERT(session, last != current);
	}
Example #30
/*
 * __meta_track_apply --
 *	Apply the changes in a metadata tracking record.
 */
static int
__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_RET;
	int tret;

	/*
	 * Unlock handles and complete checkpoints regardless of whether we are
	 * unrolling.
	 */
	if (!unroll && trk->op != WT_ST_CHECKPOINT && trk->op != WT_ST_LOCK)
		goto free;

	switch (trk->op) {
	case WT_ST_EMPTY:	/* Unused slot */
		break;
	case WT_ST_CHECKPOINT:	/* Checkpoint, see above */
		if (!unroll) {
			btree = trk->dhandle->handle;
			bm = btree->bm;
			WT_WITH_DHANDLE(session, trk->dhandle,
			    WT_TRET(bm->checkpoint_resolve(bm, session)));
		}
		break;
	case WT_ST_LOCK:	/* Handle lock, see above */
		if (unroll && trk->created)
			F_SET(trk->dhandle, WT_DHANDLE_DISCARD);
		WT_WITH_DHANDLE(session, trk->dhandle,
		    WT_TRET(__wt_session_release_btree(session)));
		break;
	case WT_ST_FILEOP:	/* File operation */
		/*
		 * For renames, both a and b are set.
		 * For creates, a is NULL.
		 * For removes, b is NULL.
		 */
		if (trk->a != NULL && trk->b != NULL &&
		    (tret = __wt_rename(session,
		    trk->b + strlen("file:"),
		    trk->a + strlen("file:"))) != 0) {
			__wt_err(session, tret,
			    "metadata unroll rename %s to %s",
			    trk->b, trk->a);
			WT_TRET(tret);
		} else if (trk->a == NULL) {
			if ((tret = __wt_remove(session,
			    trk->b + strlen("file:"))) != 0) {
				__wt_err(session, tret,
				    "metadata unroll create %s",
				    trk->b);
				WT_TRET(tret);
			}
		}
		/*
		 * We can't undo removes yet: that would imply
		 * some kind of temporary rename and remove in
		 * roll forward.
		 */
		break;
	case WT_ST_REMOVE:	/* Remove trk.a */
		if ((tret = __wt_metadata_remove(session, trk->a)) != 0) {
			__wt_err(session, tret,
			    "metadata unroll remove: %s",
			    trk->a);
			WT_TRET(tret);
		}
		break;
	case WT_ST_SET:		/* Set trk.a to trk.b */
		if ((tret = __wt_metadata_update(
		    session, trk->a, trk->b)) != 0) {
			__wt_err(session, tret,
			    "metadata unroll update %s to %s",
			    trk->a, trk->b);
			WT_TRET(tret);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

free:	trk->op = WT_ST_EMPTY;
	__wt_free(session, trk->a);
	__wt_free(session, trk->b);
	trk->dhandle = NULL;

	return (ret);
}