Example #1: __wt_block_close
/*
 * __wt_block_close --
 *	Close a block handle.
 */
int
__wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	if (block == NULL)				/* Safety check */
		return (0);

	conn = S2C(session);

	WT_VERBOSE_TRET(session,
	    block, "close: %s", block->name == NULL ? "" : block->name );

	__wt_spin_lock(session, &conn->block_lock);

	/* Reference count is initialized to 1. */
	if (block->ref == 0 || --block->ref == 0)
		WT_TRET(__block_destroy(session, block));

	__wt_spin_unlock(session, &conn->block_lock);

	return (ret);
}
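
The close path pairs with __wt_block_open (Example #3 below): block handles are shared between callers, the reference count starts at 1 on creation, and only the last close calls __block_destroy, all under the connection-wide block_lock. The following is a minimal standalone sketch of the same lock-protected reference-counting pattern; the handle structure, the pthread mutex, and the helper names are illustrative assumptions, not WiredTiger code.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for WT_BLOCK: a shared handle with a reference count. */
struct handle {
	unsigned int ref;			/* References held by openers */
};

/* Stand-in for the connection-wide block_lock. */
static pthread_mutex_t handle_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * handle_open --
 *	Create a handle; the reference count is initialized to 1.
 */
static struct handle *
handle_open(void)
{
	struct handle *h;

	if ((h = calloc(1, sizeof(*h))) != NULL)
		h->ref = 1;
	return (h);
}

/*
 * handle_close --
 *	Drop a reference; the last close destroys the handle, mirroring
 *	__wt_block_close.
 */
static int
handle_close(struct handle *h)
{
	if (h == NULL)				/* Safety check */
		return (0);

	pthread_mutex_lock(&handle_lock);
	if (h->ref == 0 || --h->ref == 0)
		free(h);			/* Stands in for __block_destroy */
	pthread_mutex_unlock(&handle_lock);
	return (0);
}

int
main(void)
{
	struct handle *h;

	if ((h = handle_open()) == NULL)
		return (1);
	++h->ref;				/* A second logical open of the same handle */
	(void)handle_close(h);			/* ref 2 -> 1, handle survives */
	(void)handle_close(h);			/* ref 1 -> 0, handle destroyed */
	printf("done\n");
	return (0);
}

As in __wt_block_close, the decrement and the destroy both happen while the list lock is held, so a concurrent opener cannot find a handle that is in the middle of being torn down.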
Example #2: __wt_merge_tree
/*
 * __wt_merge_tree --
 *	Attempt to collapse a stack of split-merge pages in memory into a
 *	shallow tree.  If enough keys are found, create a real internal node
 *	that can be evicted (and, if necessary, split further).
 *
 *	This code is designed to deal with workloads that otherwise create
 *	arbitrarily deep (and slow) trees in memory.
 */
int
__wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
{
	WT_DECL_RET;
	WT_PAGE *lchild, *newtop, *rchild;
	WT_REF *newref;
	WT_VISIT_STATE visit_state;
	uint32_t refcnt, split;
	int promote;
	u_int levels;
	uint8_t page_type;

	WT_CLEAR(visit_state);
	visit_state.session = session;
	lchild = newtop = rchild = NULL;
	page_type = top->type;

	WT_ASSERT(session, __wt_btree_mergeable(top));
	WT_ASSERT(session, top->ref->state == WT_REF_LOCKED);

	/*
	 * Walk the subtree, count the references at the bottom level and
	 * calculate the maximum depth.
	 */
	WT_RET(__merge_walk(session, top, 1, __merge_count, &visit_state));

	/* If there aren't enough useful levels, give up. */
	if (visit_state.maxdepth < WT_MERGE_STACK_MIN)
		return (EBUSY);

	/*
	 * Don't allow split merges to generate arbitrarily large pages.
	 * Ideally we would choose a size based on the internal_page_max
	 * setting for the btree, but we don't have the correct btree handle
	 * available.
	 */
	if (visit_state.refcnt > WT_MERGE_MAX_REFS)
		return (EBUSY);

	/*
	 * Now we either collapse the internal pages into one split-merge page,
	 * or if there are "enough" keys, we split into two equal internal
	 * pages, each of which can be evicted independently.
	 *
	 * We set a flag (WT_PM_REC_SPLIT_MERGE) on the created page if it
	 * isn't big enough to justify the cost of evicting it.  If splits
	 * continue, it will be merged again until it gets over this limit.
	 */
	promote = 0;
	refcnt = (uint32_t)visit_state.refcnt;
	if (refcnt >= WT_MERGE_FULL_PAGE && visit_state.seen_live) {
		/*
		 * In the normal case where there are live children spread
		 * through the subtree, create two child pages.
		 *
		 * Handle the case where the only live child is first / last
		 * specially: put the live child into the top-level page.
		 *
		 * Set SPLIT_MERGE on the internal pages if there are any live
		 * children: they can't be evicted, so there is no point
		 * permanently deepening the tree.
		 */
		if (visit_state.first_live == visit_state.last_live &&
		    (visit_state.first_live == 0 ||
		    visit_state.first_live == refcnt - 1))
			split = (visit_state.first_live == 0) ? 1 : refcnt - 1;
		else
			split = (refcnt + 1) / 2;

		/* Only promote if we can create a real page. */
		if (split == 1 || split == refcnt - 1)
			promote = 1;
		else if (split >= WT_MERGE_FULL_PAGE &&
		    visit_state.first_live >= split)
			promote = 1;
		else if (refcnt - split >= WT_MERGE_FULL_PAGE &&
		    visit_state.last_live < split)
			promote = 1;
	}

	if (promote) {
		/* Create a new top-level split-merge page with two entries. */
		WT_ERR(__merge_new_page(session, page_type, 2, 1, &newtop));

		visit_state.split = split;

		/* Left split. */
		if (split == 1)
			visit_state.first = newtop;
		else {
			WT_ERR(__merge_new_page(session, page_type, split,
			    visit_state.first_live < split, &lchild));
			visit_state.first = lchild;
		}

		/* Right split. */
		if (split == refcnt - 1) {
			visit_state.second = newtop;
			visit_state.second_ref = &newtop->u.intl.t[1];
		} else {
			WT_ERR(__merge_new_page(session, page_type,
			    refcnt - split, visit_state.last_live >= split,
			    &rchild));
			visit_state.second = rchild;
			visit_state.second_ref =
			    &visit_state.second->u.intl.t[0];
		}
	} else {
		/*
		 * Create a new split-merge page for small merges, or if the
		 * page above is a split merge page.  When we do a big enough
		 * merge, we create a real page at the top and don't consider
		 * it as a merge candidate again.  Over time with an insert
		 * workload the tree will grow deeper, but that's inevitable,
		 * and this keeps individual merges small.
		 */
		WT_ERR(__merge_new_page(session, page_type, refcnt,
		    refcnt < WT_MERGE_FULL_PAGE ||
		    __wt_btree_mergeable(top->parent),
		    &newtop));

		visit_state.first = newtop;
	}

	/*
	 * Copy the references into the new tree, but don't update anything in
	 * the locked tree in case there is an error and we need to back out.
	 * We do this in a separate pass so that we can figure out the key for
	 * the split point: that allocates memory and so it could still fail.
	 */
	visit_state.page = visit_state.first;
	visit_state.ref = visit_state.page->u.intl.t;
	visit_state.refcnt = 0;
	WT_ERR(__merge_walk(session, top, 0, __merge_copy_ref, &visit_state));

	if (promote) {
		/* Promote keys into the top-level page. */
		if (lchild != NULL) {
			newref = &newtop->u.intl.t[0];
			WT_LINK_PAGE(newtop, newref, lchild);
			newref->state = WT_REF_MEM;
			WT_ERR(__merge_promote_key(session, newref));
		}

		if (rchild != NULL) {
			newref = &newtop->u.intl.t[1];
			WT_LINK_PAGE(newtop, newref, rchild);
			newref->state = WT_REF_MEM;
			WT_ERR(__merge_promote_key(session, newref));
		}
	}

	/*
	 * We have copied everything into place and allocated all of the memory
	 * we need.  Now link all pages into the new tree and unlock them.
	 *
	 * The only way this could fail is if a reference state has been
	 * changed by another thread since they were locked.  Panic in that
	 * case: that should never happen.
	 */
	visit_state.page = visit_state.first;
	visit_state.ref = visit_state.page->u.intl.t;
	visit_state.refcnt = 0;
	ret = __merge_walk(session, top, 0, __merge_switch_page, &visit_state);

	if (ret != 0)
		WT_ERR(__wt_illegal_value(session, "__wt_merge_tree"));

	newtop->u.intl.recno = top->u.intl.recno;
	newtop->parent = top->parent;
	newtop->ref = top->ref;

#ifdef HAVE_DIAGNOSTIC
	/*
	 * Before swapping in the new tree, walk the pages we are discarding,
	 * check that everything looks right.
	 */
	__merge_check_discard(session, top);
#endif

	/*
	 * Set up the new top-level page as a split so that it will be swapped
	 * into place by our caller.
	 */
	top->modify->flags = WT_PM_REC_SPLIT;
	top->modify->u.split = newtop;

	WT_VERBOSE_ERR(session, evict,
	    "Successfully %s %" PRIu32
	    " split-merge pages containing %" PRIu32 " keys\n",
	    promote ? "promoted" : "merged", visit_state.maxdepth, refcnt);

	/* Evict new child pages as soon as possible. */
	if (lchild != NULL && !F_ISSET(lchild->modify, WT_PM_REC_SPLIT_MERGE))
		lchild->read_gen = WT_READ_GEN_OLDEST;
	if (rchild != NULL && !F_ISSET(rchild->modify, WT_PM_REC_SPLIT_MERGE))
		rchild->read_gen = WT_READ_GEN_OLDEST;

	/* Update statistics. */
	WT_CSTAT_INCR(session, cache_eviction_merge);
	WT_DSTAT_INCR(session, cache_eviction_merge);

	/* How many levels did we remove? */
	levels = visit_state.maxdepth - (promote ? 2 : 1);
	WT_CSTAT_INCRV(session, cache_eviction_merge_levels, levels);
	WT_DSTAT_INCRV(session, cache_eviction_merge_levels, levels);

	return (0);

err:	WT_VERBOSE_TRET(session, evict,
	    "Failed to merge %" PRIu32
	    " split-merge pages containing %" PRIu32 " keys\n",
	    visit_state.maxdepth, refcnt);

	WT_CSTAT_INCR(session, cache_eviction_merge_fail);
	WT_DSTAT_INCR(session, cache_eviction_merge_fail);

	if (newtop != NULL)
		__wt_page_out(session, &newtop);
	if (lchild != NULL)
		__wt_page_out(session, &lchild);
	if (rchild != NULL)
		__wt_page_out(session, &rchild);
	return (ret);
}
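
The promotion decision in the middle of __wt_merge_tree is pure arithmetic on the reference count and on where the live children sit: keep a lone first/last live child in the top-level page, otherwise split down the middle, and promote only when at least one half is both full-page sized and free of live children. The sketch below isolates that decision in a standalone function; the MERGE_FULL_PAGE constant, the function name, and the sample values are assumptions for illustration, not WiredTiger's actual limits.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed threshold for a "real" page; the actual value lives in WiredTiger. */
#define	MERGE_FULL_PAGE	100

/*
 * choose_split --
 *	Reproduce the split-point / promotion decision from __wt_merge_tree:
 *	pick where to split refcnt child references into two internal pages
 *	and report whether at least one half is big enough (and free of live
 *	children) to become a real, evictable page.
 */
static int
choose_split(uint32_t refcnt, int seen_live,
    uint32_t first_live, uint32_t last_live, uint32_t *splitp)
{
	uint32_t split;
	int promote;

	promote = 0;
	split = 0;
	if (refcnt >= MERGE_FULL_PAGE && seen_live) {
		/*
		 * If the only live child is first or last, keep it in the
		 * top-level page; otherwise split down the middle.
		 */
		if (first_live == last_live &&
		    (first_live == 0 || first_live == refcnt - 1))
			split = (first_live == 0) ? 1 : refcnt - 1;
		else
			split = (refcnt + 1) / 2;

		/* Only promote if one side can become a real page. */
		if (split == 1 || split == refcnt - 1)
			promote = 1;
		else if (split >= MERGE_FULL_PAGE && first_live >= split)
			promote = 1;
		else if (refcnt - split >= MERGE_FULL_PAGE && last_live < split)
			promote = 1;
	}
	*splitp = split;
	return (promote);
}

int
main(void)
{
	uint32_t split;

	/* 200 refs, all live children in the right half: the left page is real. */
	printf("promote=%d split=%" PRIu32 "\n",
	    choose_split(200, 1, 150, 199, &split), split);

	/* Only the first child is live: it stays in the top-level page. */
	printf("promote=%d split=%" PRIu32 "\n",
	    choose_split(200, 1, 0, 0, &split), split);
	return (0);
}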
Example #3: __wt_block_open
/*
 * __wt_block_open --
 *	Open a block handle.
 */
int
__wt_block_open(WT_SESSION_IMPL *session,
    const char *filename, const char *cfg[],
    int forced_salvage, uint32_t allocsize, WT_BLOCK **blockp)
{
	WT_BLOCK *block;
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	WT_VERBOSE_TRET(session, block, "open: %s", filename);

	conn = S2C(session);
	*blockp = NULL;

	__wt_spin_lock(session, &conn->block_lock);
	TAILQ_FOREACH(block, &conn->blockqh, q)
		if (strcmp(filename, block->name) == 0) {
			++block->ref;
			*blockp = block;
			__wt_spin_unlock(session, &conn->block_lock);
			return (0);
		}

	/* Basic structure allocation, initialization. */
	WT_ERR(__wt_calloc_def(session, 1, &block));
	block->ref = 1;
	TAILQ_INSERT_HEAD(&conn->blockqh, block, q);

	WT_ERR(__wt_strdup(session, filename, &block->name));
	block->allocsize = allocsize;

	WT_ERR(__wt_config_gets(session, cfg, "block_allocation", &cval));
	block->allocfirst =
	    WT_STRING_MATCH("first", cval.str, cval.len) ? 1 : 0;

	/* Configuration: optional OS buffer cache maximum size. */
	WT_ERR(__wt_config_gets(session, cfg, "os_cache_max", &cval));
	block->os_cache_max = cval.val;
#ifdef HAVE_POSIX_FADVISE
	if (conn->direct_io && block->os_cache_max)
		WT_ERR_MSG(session, EINVAL,
		    "os_cache_max not supported in combination with direct_io");
#else
	if (block->os_cache_max)
		WT_ERR_MSG(session, EINVAL,
		    "os_cache_max not supported if posix_fadvise not "
		    "available");
#endif

	/* Configuration: optional immediate write scheduling flag. */
	WT_ERR(__wt_config_gets(session, cfg, "os_cache_dirty_max", &cval));
	block->os_cache_dirty_max = cval.val;
#ifdef HAVE_SYNC_FILE_RANGE
	if (conn->direct_io && block->os_cache_dirty_max)
		WT_ERR_MSG(session, EINVAL,
		    "os_cache_dirty_max not supported in combination with "
		    "direct_io");
#else
	if (block->os_cache_dirty_max)
		WT_ERR_MSG(session, EINVAL,
		    "os_cache_dirty_max not supported if sync_file_range not "
		    "available");
#endif

	/* Open the underlying file handle. */
	WT_ERR(__wt_open(
	    session, filename, 0, 0, WT_FILE_TYPE_DATA, &block->fh));

	/* Initialize the live checkpoint's lock. */
	WT_ERR(__wt_spin_init(session, &block->live_lock, "block manager"));

	/*
	 * Read the description information from the first block.
	 *
	 * Salvage is a special case: if we're forcing the salvage, we don't
	 * look at anything, including the description information.
	 */
	if (!forced_salvage)
		WT_ERR(__desc_read(session, block));

	*blockp = block;
	__wt_spin_unlock(session, &conn->block_lock);
	return (0);

err:	WT_TRET(__block_destroy(session, block));
	__wt_spin_unlock(session, &conn->block_lock);
	return (ret);
}
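
The lookup at the top of __wt_block_open is a standard handle-cache pattern: walk the connection's block queue under a lock, reuse a matching handle by bumping its reference count, and otherwise allocate a new handle and insert it at the head of the queue. Below is a hypothetical, self-contained sketch of that pattern built on the standard sys/queue.h TAILQ macros and a pthread mutex; the handle type and function names are assumptions, not the WiredTiger API.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/queue.h>

/* Hypothetical handle cache mirroring the connection's block queue. */
struct handle {
	char *name;				/* File name: the lookup key */
	unsigned int ref;			/* Reference count */
	TAILQ_ENTRY(handle) q;			/* Queue linkage */
};

static TAILQ_HEAD(, handle) handleqh = TAILQ_HEAD_INITIALIZER(handleqh);
static pthread_mutex_t handle_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * handle_open --
 *	Return the cached handle for a name if there is one, bumping its
 *	reference count; otherwise allocate a new handle and insert it at the
 *	head of the queue, as __wt_block_open does for WT_BLOCK structures.
 */
static int
handle_open(const char *name, struct handle **handlep)
{
	struct handle *h;

	*handlep = NULL;

	pthread_mutex_lock(&handle_lock);
	TAILQ_FOREACH(h, &handleqh, q)
		if (strcmp(name, h->name) == 0) {
			++h->ref;
			*handlep = h;
			pthread_mutex_unlock(&handle_lock);
			return (0);
		}

	if ((h = calloc(1, sizeof(*h))) == NULL ||
	    (h->name = strdup(name)) == NULL) {
		free(h);
		pthread_mutex_unlock(&handle_lock);
		return (ENOMEM);
	}
	h->ref = 1;
	TAILQ_INSERT_HEAD(&handleqh, h, q);

	*handlep = h;
	pthread_mutex_unlock(&handle_lock);
	return (0);
}

int
main(void)
{
	struct handle *h1, *h2;

	if (handle_open("test.wt", &h1) != 0 || handle_open("test.wt", &h2) != 0)
		return (1);
	printf("same handle: %s, ref=%u\n", h1 == h2 ? "yes" : "no", h1->ref);
	return (0);
}

The second open of the same name returns the cached handle with its reference count at 2, which is what lets __wt_block_close defer __block_destroy until the final close.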