Example #1
/*
 * __session_commit_transaction --
 *	WT_SESSION->commit_transaction method.
 */
static int
__session_commit_transaction(WT_SESSION *wt_session, const char *config)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL(session, commit_transaction, config, cfg);
	WT_CSTAT_INCR(session, txn_commit);

	txn = &session->txn;
	if (F_ISSET(txn, TXN_ERROR)) {
		__wt_errx(session, "failed transaction requires rollback");
		ret = EINVAL;
	}

	WT_TRET(__session_reset_cursors(session));

	if (ret == 0)
		ret = __wt_txn_commit(session, cfg);
	else
		(void)__wt_txn_rollback(session, cfg);

err:	API_END(session);
	return (ret);
}
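These internal methods back the public WT_SESSION API. Below is a minimal application-side sketch of the same commit-or-rollback logic through the public interface; it is illustrative only, the do_updates callback is a hypothetical helper, and the NULL configuration strings simply accept the defaults.

#include <wiredtiger.h>

/*
 * update_transactionally --
 *	Illustrative sketch (not from the WiredTiger source): run a
 *	hypothetical update callback inside a transaction, committing on
 *	success and rolling back on error, mirroring the internal logic.
 */
static int
update_transactionally(WT_SESSION *session, int (*do_updates)(WT_SESSION *))
{
	int ret;

	if ((ret = session->begin_transaction(session, NULL)) != 0)
		return (ret);

	ret = do_updates(session);		/* Hypothetical cursor work. */

	if (ret == 0)
		ret = session->commit_transaction(session, NULL);
	else
		(void)session->rollback_transaction(session, NULL);
	return (ret);
}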
Example #2
/*
 * __wt_cond_wait --
 *	Wait on a mutex, optionally timing out.
 */
int
__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs)
{
	struct timespec ts;
	WT_DECL_RET;
	int locked;

	locked = 0;

	/*
	 * !!!
	 * This function MUST handle a NULL session handle.
	 */
	if (session != NULL) {
		WT_VERBOSE_RET(
		    session, mutex, "wait %s cond (%p)", cond->name, cond);
		WT_CSTAT_INCR(session, cond_wait);
	}

	WT_ERR(pthread_mutex_lock(&cond->mtx));
	locked = 1;

	while (!cond->signalled) {
		if (usecs > 0) {
			WT_ERR(__wt_epoch(session, &ts));
			ts.tv_sec += (ts.tv_nsec + 1000 * usecs) / WT_BILLION;
			ts.tv_nsec = (ts.tv_nsec + 1000 * usecs) % WT_BILLION;
			ret = pthread_cond_timedwait(
			    &cond->cond, &cond->mtx, &ts);
			if (ret == ETIMEDOUT) {
				ret = 0;
				break;
			}
		} else
			ret = pthread_cond_wait(&cond->cond, &cond->mtx);

		/*
		 * Check the pthread_cond_wait() return for EINTR, ETIME and
		 * ETIMEDOUT; some systems return these errors.
		 */
		if (ret == EINTR ||
#ifdef ETIME
		    ret == ETIME ||
#endif
		    ret == ETIMEDOUT)
			ret = 0;
		WT_ERR(ret);
	}

	cond->signalled = 0;

err:	if (locked)
		WT_TRET(pthread_mutex_unlock(&cond->mtx));
	if (ret == 0)
		return (0);
	WT_RET_MSG(session, ret, "pthread_cond_wait");
}
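The timed branch converts a relative timeout in microseconds into the absolute deadline pthread_cond_timedwait() expects, carrying nanosecond overflow into the seconds field. A self-contained sketch of the same arithmetic using plain POSIX calls, with clock_gettime() standing in for __wt_epoch() and the caller holding the mutex, as in the function above:

#include <errno.h>
#include <pthread.h>
#include <time.h>

/*
 * timed_wait --
 *	Wait on cond/mtx for at most "usecs" microseconds; return 0 on
 *	signal or timeout, an error number otherwise.  The caller must
 *	hold mtx.
 */
static int
timed_wait(pthread_cond_t *cond, pthread_mutex_t *mtx, long usecs)
{
	struct timespec ts;
	int ret;

	if (clock_gettime(CLOCK_REALTIME, &ts) != 0)
		return (errno);

	/* Add the timeout, carrying overflow from nanoseconds into seconds. */
	ts.tv_sec += (ts.tv_nsec + 1000 * usecs) / 1000000000;
	ts.tv_nsec = (ts.tv_nsec + 1000 * usecs) % 1000000000;

	ret = pthread_cond_timedwait(cond, mtx, &ts);
	return (ret == ETIMEDOUT ? 0 : ret);
}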
Example #3
/*
 * __wt_writelock --
 *	Wait to get an exclusive lock.
 */
int
__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
	WT_DECL_RET;

	WT_VERBOSE_RET(session, mutex,
	    "rwlock: writelock %s (%p)", rwlock->name, rwlock);
	WT_CSTAT_INCR(session, rwlock_write);

	if ((ret = pthread_rwlock_wrlock(&rwlock->rwlock)) == 0)
		return (0);
	WT_RET_MSG(session, ret, "pthread_rwlock_wrlock: %s", rwlock->name);
}
Example #4
/*
 * __wt_realloc --
 *	ANSI realloc function.
 */
int
__wt_realloc(WT_SESSION_IMPL *session,
    size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
{
	void *p;
	size_t bytes_allocated;

	/*
	 * !!!
	 * This function MUST handle a NULL WT_SESSION_IMPL handle.
	 */
	WT_ASSERT(session, bytes_to_allocate != 0);

	/*
	 * Sometimes we're allocating memory and we don't care about the
	 * final length -- bytes_allocated_ret may be NULL.
	 */
	bytes_allocated = (bytes_allocated_ret == NULL) ?
	    0 : *bytes_allocated_ret;
	WT_ASSERT(session, bytes_allocated < bytes_to_allocate);

	p = *(void **)retp;

	if (p == NULL && session != NULL && S2C(session)->stats != NULL)
		WT_CSTAT_INCR(session, memalloc);

	if ((p = realloc(p, bytes_to_allocate)) == NULL)
		WT_RET_MSG(session, __wt_errno(), "memory allocation");

	/*
	 * Clear the allocated memory -- an application might: allocate memory,
	 * write secret stuff into it, free the memory, then we re-allocate the
	 * memory and use it for a file page or log record, and then write it to
	 * disk.  That would result in the secret stuff being protected by the
	 * WiredTiger permission mechanisms, potentially inappropriate for the
	 * secret stuff.
	 */
	memset((uint8_t *)
	    p + bytes_allocated, 0, bytes_to_allocate - bytes_allocated);

	/* Update caller's bytes allocated value. */
	if (bytes_allocated_ret != NULL)
		*bytes_allocated_ret = bytes_to_allocate;

	*(void **)retp = p;
	return (0);
}
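The distinctive part of __wt_realloc is clearing the newly extended region, so stale heap contents can never end up in a file page or log record. A stripped-down sketch of the same pattern without the WiredTiger statistics and error macros; the function name is hypothetical, and new_len must exceed the old length, as the assertion above requires.

#include <errno.h>
#include <stdlib.h>
#include <string.h>

/*
 * zeroing_realloc --
 *	Grow *ptrp to new_len bytes and clear the added region, tracking the
 *	allocated length in *old_lenp (the same contract as __wt_realloc;
 *	new_len must be greater than *old_lenp).
 */
static int
zeroing_realloc(void **ptrp, size_t *old_lenp, size_t new_len)
{
	void *p;
	size_t old_len;

	old_len = *old_lenp;
	if ((p = realloc(*ptrp, new_len)) == NULL)
		return (ENOMEM);

	/* Clear the bytes beyond the previously allocated length. */
	memset((unsigned char *)p + old_len, 0, new_len - old_len);

	*ptrp = p;
	*old_lenp = new_len;
	return (0);
}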
Example #5
/*
 * __session_rollback_transaction --
 *	WT_SESSION->rollback_transaction method.
 */
static int
__session_rollback_transaction(WT_SESSION *wt_session, const char *config)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL(session, rollback_transaction, config, cfg);
	WT_CSTAT_INCR(session, txn_rollback);

	WT_TRET(__session_reset_cursors(session));

	WT_TRET(__wt_txn_rollback(session, cfg));

err:	API_END(session);
	return (ret);
}
Example #6
/*
 * __session_checkpoint --
 *	WT_SESSION->checkpoint method.
 */
static int
__session_checkpoint(WT_SESSION *wt_session, const char *config)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;

	session = (WT_SESSION_IMPL *)wt_session;
	txn = &session->txn;

	WT_CSTAT_INCR(session, checkpoint);
	SESSION_API_CALL(session, checkpoint, config, cfg);

	/*
	 * Checkpoints require a snapshot to write a transactionally consistent
	 * snapshot of the data.
	 *
	 * We can't use an application's transaction: if it has uncommitted
	 * changes, they will be written in the checkpoint and may appear after
	 * a crash.
	 *
	 * Use a real snapshot transaction: we don't want any chance of the
	 * snapshot being updated during the checkpoint.  Eviction is prevented
	 * from evicting anything newer than this because we track the oldest
	 * transaction ID in the system that is not visible to all readers.
	 */
	if (F_ISSET(txn, TXN_RUNNING))
		WT_ERR_MSG(session, EINVAL,
		    "Checkpoint not permitted in a transaction");

	/*
	 * Reset open cursors.
	 *
	 * We do this here explicitly even though it will happen implicitly in
	 * the call to begin_transaction for the checkpoint, in case some
	 * implementation of WT_CURSOR::reset needs the schema lock.
	 */
	WT_ERR(__session_reset_cursors(session));

	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_txn_checkpoint(session, cfg));

err:	API_END_NOTFOUND_MAP(session, ret);
}
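Applications reach this method through the public WT_SESSION::checkpoint call, which must happen outside any running transaction. A short illustrative sketch; the checkpoint name "nightly" is only an example.

#include <stdio.h>
#include <wiredtiger.h>

/* Illustrative only: take a named checkpoint through the public API. */
static int
take_checkpoint(WT_SESSION *session)
{
	int ret;

	/* Must be called with no transaction running on this session. */
	if ((ret = session->checkpoint(session, "name=nightly")) != 0)
		fprintf(stderr, "checkpoint: %s\n", wiredtiger_strerror(ret));
	return (ret);
}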
Example #7
/*
 * __wt_calloc --
 *	ANSI calloc function.
 */
int
__wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp)
{
	void *p;

	/*
	 * !!!
	 * This function MUST handle a NULL WT_SESSION_IMPL handle.
	 */
	WT_ASSERT(session, number != 0 && size != 0);

	if (session != NULL && S2C(session)->stats != NULL)
		WT_CSTAT_INCR(session, memalloc);

	if ((p = calloc(number, size)) == NULL)
		WT_RET_MSG(session, __wt_errno(), "memory allocation");

	*(void **)retp = p;
	return (0);
}
Example #8
/*
 * __session_begin_transaction --
 *	WT_SESSION->begin_transaction method.
 */
static int
__session_begin_transaction(WT_SESSION *wt_session, const char *config)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL(session, begin_transaction, config, cfg);
	WT_CSTAT_INCR(session, txn_begin);

	if (!F_ISSET(S2C(session), WT_CONN_TRANSACTIONAL))
		WT_ERR_MSG(session, EINVAL,
		    "Database not configured for transactions");
	if (F_ISSET(&session->txn, TXN_RUNNING))
		WT_ERR_MSG(session, EINVAL, "Transaction already running");

	WT_ERR(__session_reset_cursors(session));

	ret = __wt_txn_begin(session, cfg);

err:	API_END(session);
	return (ret);
}
Example #9
/*
 * __hazard_exclusive --
 *	Request exclusive access to a page.
 */
static int
__hazard_exclusive(WT_SESSION_IMPL *session, WT_REF *ref, int top)
{
	/*
	 * Make sure there is space to track exclusive access so we can unlock
	 * to clean up.
	 */
	if (session->excl_next * sizeof(WT_REF *) == session->excl_allocated)
		WT_RET(__wt_realloc(session, &session->excl_allocated,
		    (session->excl_next + 50) * sizeof(WT_REF *),
		    &session->excl));

	/*
	 * Hazard pointers are acquired down the tree, which means we can't
	 * deadlock.
	 *
	 * Request exclusive access to the page.  The top-level page should
	 * already be in the locked state; lock child pages that are in memory.
	 * If another thread already has this page, give up.
	 */
	if (!top && !WT_ATOMIC_CAS(ref->state, WT_REF_MEM, WT_REF_LOCKED))
		return (EBUSY);	/* We couldn't change the state. */
	WT_ASSERT(session, ref->state == WT_REF_LOCKED);

	session->excl[session->excl_next++] = ref;

	/* Check for a matching hazard pointer. */
	if (__wt_page_hazard_check(session, ref->page) == NULL)
		return (0);

	WT_DSTAT_INCR(session, cache_eviction_hazard);
	WT_CSTAT_INCR(session, cache_eviction_hazard);

	WT_VERBOSE_RET(
	    session, evict, "page %p hazard request failed", ref->page);
	return (EBUSY);
}
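WT_ATOMIC_CAS is an atomic compare-and-swap on the reference's state word: the page is locked only if no other thread changed the state first. A self-contained sketch of the same try-lock idiom using C11 atomics; the types here are illustrative stand-ins, not the WiredTiger WT_REF.

#include <errno.h>
#include <stdatomic.h>

enum page_state { STATE_MEM, STATE_LOCKED };

struct page_ref {
	_Atomic enum page_state state;
};

/*
 * try_lock_ref --
 *	Attempt to move a reference from "in memory" to "locked"; return
 *	EBUSY if another thread changed the state first.
 */
static int
try_lock_ref(struct page_ref *ref)
{
	enum page_state expected = STATE_MEM;

	if (!atomic_compare_exchange_strong(
	    &ref->state, &expected, STATE_LOCKED))
		return (EBUSY);	/* Another thread owns or is reading the page. */
	return (0);
}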
Example #10
/*
 * __wt_free_int --
 *	ANSI free function.
 */
void
__wt_free_int(WT_SESSION_IMPL *session, void *p_arg)
{
	void *p;

	/*
	 * !!!
	 * This function MUST handle a NULL WT_SESSION_IMPL handle.
	 */
	if (session != NULL && S2C(session)->stats != NULL)
		WT_CSTAT_INCR(session, memfree);

	/*
	 * If there's a serialization bug we might race with another thread.
	 * We can't avoid the race (and we aren't willing to flush memory),
	 * but we minimize the window by clearing the free address atomically,
	 * hoping a racing thread will see, and won't free, a NULL pointer.
	 */
	p = *(void **)p_arg;
	*(void **)p_arg = NULL;

	if (p != NULL)			/* ANSI C free semantics */
		free(p);
}
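A standalone sketch of the same free-and-clear idiom, minus the statistics macro; the names are hypothetical.

#include <stdlib.h>

/*
 * free_and_clear --
 *	Free the memory referenced by the caller's pointer, clearing the
 *	caller's pointer first to shrink the race window described above.
 */
static void
free_and_clear(void *ptrp_arg)
{
	void **ptrp = ptrp_arg;
	void *p;

	p = *ptrp;
	*ptrp = NULL;	/* Clear before freeing. */
	free(p);	/* free(NULL) is a no-op, per ANSI C. */
}

/* Usage: free_and_clear(&buf); leaves buf == NULL afterward. */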
Example #11
/*
 * __wt_block_read_off --
 *	Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
{
	WT_BLOCK_HEADER *blk;
	uint32_t alloc_size, page_cksum;

	WT_VERBOSE_RET(session, read,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
	/*
	 * In diagnostic mode, verify the block we're about to read isn't on
	 * either the available or discard lists.
	 *
	 * Don't check during salvage, it's possible we're reading an already
	 * freed overflow page.
	 */
	if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
		WT_RET(
		    __wt_block_misplaced(session, block, "read", offset, size));
#endif

	/*
	 * Grow the buffer as necessary and read the block.  Buffers should be
	 * aligned for reading, but there are lots of buffers (for example, file
	 * cursors have two buffers each, key and value), and it's difficult to
	 * be sure we've found all of them.  If the buffer isn't aligned, it's
	 * an easy fix: set the flag and guarantee we reallocate it.  (Most of
	 * the time on reads, the buffer memory has not yet been allocated, so
	 * we're not adding any additional processing time.)
	 */
	if (F_ISSET(buf, WT_ITEM_ALIGNED))
		alloc_size = size;
	else {
		F_SET(buf, WT_ITEM_ALIGNED);
		alloc_size = (uint32_t)WT_MAX(size, buf->memsize + 10);
	}
	WT_RET(__wt_buf_init(session, buf, alloc_size));
	WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
	buf->size = size;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	blk->cksum = 0;
	page_cksum = __wt_cksum(buf->mem,
	    F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP);
	if (cksum != page_cksum) {
		if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
			__wt_errx(session,
			    "read checksum error [%"
			    PRIu32 "B @ %" PRIuMAX ", %"
			    PRIu32 " != %" PRIu32 "]",
			    size, (uintmax_t)offset, cksum, page_cksum);
		return (WT_ERROR);
	}

	WT_CSTAT_INCR(session, block_read);
	WT_CSTAT_INCRV(session, block_byte_read, size);
	return (0);
}
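The verification step zeroes the checksum field embedded in the block before recomputing, because that field was zero when the checksum was originally taken (see the write path in Example #17). A self-contained sketch of the idea, checksumming the whole block for simplicity, with an FNV-1a placeholder standing in for __wt_cksum() and a hypothetical header layout.

#include <stddef.h>
#include <stdint.h>

/* Placeholder checksum (FNV-1a); WiredTiger uses a CRC (__wt_cksum). */
static uint32_t
checksum32(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint32_t h = 2166136261u;

	while (len-- > 0) {
		h ^= *p++;
		h *= 16777619u;
	}
	return (h);
}

/* Hypothetical on-disk header embedded at the start of every block. */
struct block_header {
	uint32_t disk_size;
	uint32_t cksum;
};

/*
 * verify_block --
 *	Zero the embedded checksum field, recompute over the block, and
 *	compare against the value carried in the address cookie.  Return 0
 *	on a match, -1 otherwise.
 */
static int
verify_block(void *block, size_t size, uint32_t expected)
{
	struct block_header *blk = block;

	blk->cksum = 0;
	return (checksum32(block, size) == expected ? 0 : -1);
}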
Example #12
/*
 * __rec_review --
 *	Get exclusive access to the page and review the page and its subtree
 *	for conditions that would block its eviction.
 *
 *	The ref and page arguments may appear to be redundant, because usually
 *	ref->page == page and page->ref == ref.  However, we need both because
 *	(a) there are cases where ref == NULL (e.g., for root page or during
 *	salvage), and (b) we can't safely look at page->ref until we have a
 *	hazard pointer.
 */
static int
__rec_review(WT_SESSION_IMPL *session,
    WT_REF *ref, WT_PAGE *page, int exclusive, int merge, int top)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE_MODIFY *mod;
	WT_PAGE *t;
	uint32_t i;

	btree = session->btree;

	/*
	 * Get exclusive access to the page if our caller doesn't have the tree
	 * locked down.
	 */
	if (!exclusive)
		WT_RET(__hazard_exclusive(session, ref, top));

	/*
	 * Recurse through the page's subtree: this happens first because we
	 * have to write pages in depth-first order, otherwise we'll dirty
	 * pages after we've written them.
	 */
	if (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)
		WT_REF_FOREACH(page, ref, i)
			switch (ref->state) {
			case WT_REF_DISK:		/* On-disk */
			case WT_REF_DELETED:		/* On-disk, deleted */
				break;
			case WT_REF_MEM:		/* In-memory */
				WT_RET(__rec_review(session,
				    ref, ref->page, exclusive, merge, 0));
				break;
			case WT_REF_EVICT_WALK:		/* Walk point */
			case WT_REF_EVICT_FORCE:	/* Forced evict */
			case WT_REF_LOCKED:		/* Being evicted */
			case WT_REF_READING:		/* Being read */
				return (EBUSY);
			}

	/*
	 * If the file is being checkpointed, we cannot evict dirty pages,
	 * because that may free a page that appears on an internal page in the
	 * checkpoint.  Don't rely on new updates being skipped by the
	 * transaction used for transaction reads: (1) there are paths that
	 * dirty pages for artificial reasons; (2) internal pages aren't
	 * transactional; and (3) if an update was skipped during the
	 * checkpoint (leaving the page dirty), then rolled back, we could
	 * still successfully overwrite a page and corrupt the checkpoint.
	 *
	 * Further, even for clean pages, the checkpoint's reconciliation of an
	 * internal page might race with us as we evict a child in the page's
	 * subtree.
	 *
	 * One half of that test is in the reconciliation code: the checkpoint
	 * thread waits for eviction-locked pages to settle before determining
	 * their status.  The other half of the test is here: after acquiring
	 * the exclusive eviction lock on a page, confirm no page in the page's
	 * stack of pages from the root is being reconciled in a checkpoint.
	 * This ensures we either see the checkpoint-walk state here, or the
	 * reconciliation of the internal page sees our exclusive lock on the
	 * child page and waits until we're finished evicting the child page
	 * (or give up if eviction isn't possible).
	 *
	 * We must check the full stack (we might be attempting to evict a leaf
	 * page multiple levels beneath the internal page being reconciled as
	 * part of the checkpoint, and all of the intermediate nodes are being
	 * merged into the internal page).
	 *
	 * There's no simple test for knowing if a page in our page stack is
	 * involved in a checkpoint.  The internal page's checkpoint-walk flag
	 * is the best test, but it's not set anywhere for the root page, so
	 * it's not a complete test.
	 *
	 * Quit for any page that's not a simple, in-memory page.  (Almost the
	 * same as checking for the checkpoint-walk flag.  I don't think there
	 * are code paths that change the page's status from checkpoint-walk,
	 * but these races are hard enough I'm not going to proceed if there's
	 * anything other than a vanilla, in-memory tree stack.)  Climb until
	 * we find a page that can't be merged into its parent, failing if we
	 * never find such a page.
	 */
	if (btree->checkpointing && !merge && __wt_page_is_modified(page)) {
ckpt:		WT_CSTAT_INCR(session, cache_eviction_checkpoint);
		WT_DSTAT_INCR(session, cache_eviction_checkpoint);
		return (EBUSY);
	}

	if (btree->checkpointing && top)
		for (t = page->parent;; t = t->parent) {
			if (t == NULL || t->ref == NULL)	/* root */
				goto ckpt;
			if (t->ref->state != WT_REF_MEM)	/* scary */
				goto ckpt;
			if (t->modify == NULL ||		/* not merged */
			    !F_ISSET(t->modify, WT_PM_REC_EMPTY |
			    WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))
				break;
		}

	/*
	 * If we are merging internal pages, we just need exclusive access, we
	 * don't need to write everything.
	 */
	if (merge)
		return (0);

	/*
	 * Fail if any page in the top-level page's subtree won't be merged into
	 * its parent: the page that cannot be merged must be evicted first.
	 * The test is necessary but should not fire much: the eviction code is
	 * biased toward leaf pages, so an internal page shouldn't be selected
	 * for eviction until its children have been evicted.
	 *
	 * We have to write dirty pages to know their final state: a page marked
	 * empty may have had records added since reconciliation, and a page
	 * marked split may have had records deleted and no longer need to
	 * split.  Split-merge pages are the exception: they can never be
	 * changed into anything other than a split-merge page and are merged
	 * regardless of being clean or dirty.
	 *
	 * Writing the page is expensive, do a cheap test first: if it doesn't
	 * appear a subtree page can be merged, quit.  It's possible the page
	 * has been emptied since it was last reconciled, and writing it before
	 * testing might be worthwhile, but it's more probable we're attempting
	 * to evict an internal page with live children, and that's a waste of
	 * time.
	 */
	mod = page->modify;
	if (!top && (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
		return (EBUSY);

	/*
	 * If the page is dirty and can possibly change state, write it so we
	 * know the final state.
	 */
	if (__wt_page_is_modified(page) &&
	    !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) {
		ret = __wt_rec_write(session, page,
		    NULL, WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_QUIT);

		/*
		 * Update the page's modification reference, reconciliation
		 * might have changed it.
		 */
		mod = page->modify;

		/* If there are unwritten changes on the page, give up. */
		if (ret == EBUSY) {
			WT_VERBOSE_RET(session, evict,
			    "eviction failed, reconciled page not clean");

			/* 
			 * We may be able to discard any "update" memory the
			 * page no longer needs.
			 */
			switch (page->type) {
			case WT_PAGE_COL_FIX:
			case WT_PAGE_COL_VAR:
				__wt_col_leaf_obsolete(session, page);
				break;
			case WT_PAGE_ROW_LEAF:
				__wt_row_leaf_obsolete(session, page);
				break;
			}
		}
		WT_RET(ret);

		WT_ASSERT(session, __wt_page_is_modified(page) == 0);
	}

	/*
	 * Repeat the test: fail if any page in the top-level page's subtree
	 * won't be merged into its parent.
	 */
	if (!top && (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
		return (EBUSY);
	return (0);
}
Example #13
/*
 * __wt_rec_evict --
 *	Reconciliation plus eviction.
 */
int
__wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive)
{
	WT_DECL_RET;
	WT_PAGE_MODIFY *mod;
	int merge;

	WT_VERBOSE_RET(session, evict,
	    "page %p (%s)", page, __wt_page_type_string(page->type));

	WT_ASSERT(session, session->excl_next == 0);

	/*
	 * If we get a split-merge page during normal eviction, try to collapse
	 * it.  During close, it will be merged into its parent.
	 */
	mod = page->modify;
	merge = __wt_btree_mergeable(page);
	if (merge && exclusive)
		return (EBUSY);

	WT_ASSERT(session, merge || mod == NULL ||
	    !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE));

	/*
	 * Get exclusive access to the page and review the page and its subtree
	 * for conditions that would block our eviction of the page.  If the
	 * check fails (for example, we find a child page that can't be merged),
	 * we're done.  We have to make this check for clean pages, too: while
	 * unlikely eviction would choose an internal page with children, it's
	 * not disallowed anywhere.
	 *
	 * Note that page->ref may be NULL in some cases (e.g., for root pages
	 * or during salvage).  That's OK if exclusive is set: we won't check
	 * hazard pointers in that case.
	 */
	WT_ERR(__rec_review(session, page->ref, page, exclusive, merge, 1));

	/* Try to merge internal pages. */
	if (merge)
		WT_ERR(__wt_merge_tree(session, page));

	/*
	 * Update the page's modification reference, reconciliation might have
	 * changed it.
	 */
	mod = page->modify;

	/* Count evictions of internal pages during normal operation. */
	if (!exclusive && !merge &&
	    (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) {
		WT_CSTAT_INCR(session, cache_eviction_internal);
		WT_DSTAT_INCR(session, cache_eviction_internal);
	}

	/*
	 * Update the parent and discard the page.
	 */
	if (mod == NULL || !F_ISSET(mod, WT_PM_REC_MASK)) {
		WT_ASSERT(session,
		    exclusive || page->ref->state == WT_REF_LOCKED);

		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			__rec_page_clean_update(session, page);

		/* Discard the page. */
		__rec_discard_page(session, page, exclusive);

		WT_CSTAT_INCR(session, cache_eviction_clean);
		WT_DSTAT_INCR(session, cache_eviction_clean);
	} else {
		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			WT_ERR(__rec_page_dirty_update(session, page));

		/* Discard the tree rooted in this page. */
		__rec_discard_tree(session, page, exclusive);

		WT_CSTAT_INCR(session, cache_eviction_dirty);
		WT_DSTAT_INCR(session, cache_eviction_dirty);
	}
	if (0) {
err:		/*
		 * If unable to evict this page, release exclusive reference(s)
		 * we've acquired.
		 */
		__rec_excl_clear(session);

		WT_CSTAT_INCR(session, cache_eviction_fail);
		WT_DSTAT_INCR(session, cache_eviction_fail);
	}
	session->excl_next = 0;

	return (ret);
}
Example #14
/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE_HEADER *dsk;
	size_t result_len;

	btree = session->btree;
	bm = btree->bm;

	/*
	 * If anticipating a compressed block, read into a scratch buffer and
	 * decompress into the caller's buffer.  Else, read directly into the
	 * caller's buffer.
	 */
	if (btree->compressor == NULL) {
		WT_RET(bm->read(bm, session, buf, addr, addr_size));
		dsk = buf->mem;
	} else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
		dsk = tmp->mem;
	}

	/*
	 * If the block is compressed, copy the skipped bytes of the original
	 * image into place, then decompress.
	 */
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
		if (btree->compressor == NULL ||
		    btree->compressor->decompress == NULL)
			WT_ERR_MSG(session, WT_ERROR,
			    "read compressed block where no compression engine "
			    "configured");

		/*
		 * We're allocating the exact number of bytes we're expecting
		 * from decompression.
		 */
		WT_ERR(__wt_buf_init(session, buf, dsk->mem_size));
		buf->size = dsk->mem_size;

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 */
		memcpy(buf->mem, tmp->mem, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));

		/*
		 * If checksums were turned off because we're depending on the
		 * decompression to fail on any corrupted data, we'll end up
		 * here after corruption happens.  If we're salvaging the file,
		 * it's OK, otherwise it's really, really bad.
		 */
		if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
			WT_ERR(
			    F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR) ?
			    WT_ERROR :
			    __wt_illegal_value(session, btree->name));
	} else
		if (btree->compressor == NULL)
			buf->size = dsk->mem_size;
		else
			/*
			 * We guessed wrong: there was a compressor, but this
			 * block was not compressed, and now the page is in the
			 * wrong buffer and the buffer may be of the wrong size.
			 * This should be rare, but happens with small blocks
			 * that aren't worth compressing.
			 */
			WT_ERR(__wt_buf_set(
			    session, buf, tmp->data, dsk->mem_size));

	/* If the handle is a verify handle, verify the physical page. */
	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
		if (tmp == NULL)
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
		WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
	}

	WT_CSTAT_INCR(session, cache_read);
	WT_DSTAT_INCR(session, cache_read);
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
		WT_DSTAT_INCR(session, compress_read);
	WT_CSTAT_INCRV(session, cache_bytes_read, addr_size);
	WT_DSTAT_INCRV(session, cache_bytes_read, addr_size);

err:	__wt_scr_free(&tmp);
	return (ret);
}
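The comment about the source length is the important detail: the decompress call is handed the length of the block just read, not a stored compressed-byte count, and the leading header bytes are copied across uncompressed. A sketch of that shape using zlib's uncompress() as a stand-in for the pluggable compressor; the 64-byte skip constant is an assumption for the sketch, not necessarily WiredTiger's WT_BLOCK_COMPRESS_SKIP.

#include <string.h>
#include <zlib.h>

#define	COMPRESS_SKIP	64	/* Hypothetical count of uncompressed header bytes. */

/*
 * read_compressed --
 *	Decompress a block whose first COMPRESS_SKIP bytes were stored
 *	uncompressed into a destination buffer of exactly mem_size bytes.
 */
static int
read_compressed(const void *src, size_t src_len, void *dst, size_t mem_size)
{
	uLongf result_len;

	/* Copy the skipped header bytes of the original image into place. */
	memcpy(dst, src, COMPRESS_SKIP);

	/* The engine is told the length of the block we read, nothing more. */
	result_len = (uLongf)(mem_size - COMPRESS_SKIP);
	if (uncompress((Bytef *)dst + COMPRESS_SKIP, &result_len,
	    (const Bytef *)src + COMPRESS_SKIP,
	    (uLong)(src_len - COMPRESS_SKIP)) != Z_OK)
		return (-1);

	/* A short result means the block was corrupted. */
	return (result_len == mem_size - COMPRESS_SKIP ? 0 : -1);
}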
Example #15
/*
 * __wt_bt_write --
 *	Write a buffer into a block, returning the block's addr/size and
 * checksum.
 */
int
__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
    uint8_t *addr, uint32_t *addr_size, int checkpoint, int compressed)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_ITEM *ip;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE_HEADER *dsk;
	size_t len, src_len, dst_len, result_len, size;
	int data_cksum, compression_failed;
	uint8_t *src, *dst;

	btree = session->btree;
	bm = btree->bm;

	/* Checkpoint calls are different than standard calls. */
	WT_ASSERT(session,
	    (checkpoint == 0 && addr != NULL && addr_size != NULL) ||
	    (checkpoint == 1 && addr == NULL && addr_size == NULL));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * We're passed a table's disk image.  Decompress if necessary and
	 * verify the image.  Always check the in-memory length for accuracy.
	 */
	dsk = buf->mem;
	if (compressed) {
		WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &tmp));

		memcpy(tmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP,
		    buf->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
		    tmp->memsize - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));
		WT_ASSERT(session,
		    dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP);
		tmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
		ip = tmp;
	} else {
		WT_ASSERT(session, dsk->mem_size == buf->size);
		ip = buf;
	}
	WT_ERR(__wt_verify_dsk(session, "[write-check]", ip));
	__wt_scr_free(&tmp);
#endif

	/*
	 * Optionally stream-compress the data, but don't compress blocks that
	 * are already as small as they're going to get.
	 */
	if (buf->size <= btree->allocsize ||
	    btree->compressor == NULL ||
	    btree->compressor->compress == NULL || compressed) {
		ip = buf;
		WT_DSTAT_INCR(session, compress_write_too_small);
	} else {
		/* Skip the header bytes of the source data. */
		src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
		src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;

		/*
		 * Compute the size needed for the destination buffer.  By
		 * default we only allocate enough memory for a copy of the
		 * original: if any compressed version is bigger than the
		 * original, we won't use it.  However, some compression
		 * engines (snappy is one example) may need more memory because
		 * they don't stop just because there's no more memory into
		 * which to compress.
		 */
		if (btree->compressor->pre_size == NULL)
			len = src_len;
		else
			WT_ERR(btree->compressor->pre_size(btree->compressor,
			    &session->iface, src, src_len, &len));

		size = len + WT_BLOCK_COMPRESS_SKIP;
		WT_ERR(bm->write_size(bm, session, &size));
		WT_ERR(__wt_scr_alloc(session, size, &tmp));

		/* Skip the header bytes of the destination data. */
		dst = (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP;
		dst_len = len;

		/*
		 * If compression fails, fall back to the original version.  This
		 * isn't unexpected: if compression doesn't work for some chunk
		 * of bytes for some reason (noting there's likely additional
		 * format/header information which compressed output requires),
		 * it just means the uncompressed version is as good as it gets,
		 * and that's what we use.
		 */
		compression_failed = 0;
		WT_ERR(btree->compressor->compress(btree->compressor,
		    &session->iface,
		    src, src_len,
		    dst, dst_len,
		    &result_len, &compression_failed));
		if (compression_failed) {
			ip = buf;
			WT_DSTAT_INCR(session, compress_write_fail);
		} else {
			compressed = 1;
			WT_DSTAT_INCR(session, compress_write);

			/*
			 * Copy in the skipped header bytes, set the final data
			 * size.
			 */
			memcpy(tmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
			tmp->size =
			    (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
			ip = tmp;
		}
	}
	dsk = ip->mem;

	/* If the buffer is compressed, set the flag. */
	if (compressed)
		F_SET(dsk, WT_PAGE_COMPRESSED);

	/*
	 * We increment the block's write generation so it's easy to identify
	 * newer versions of blocks during salvage.  (It's common in WiredTiger,
	 * at least for the default block manager, for multiple blocks to be
	 * internally consistent with identical first and last keys, so we need
	 * a way to know the most recent state of the block.  We could check
	 * which leaf is referenced by a valid internal page, but that implies
	 * salvaging internal pages, which I don't want to do, and it's not
	 * as good anyway, because the internal page may not have been written
	 * after the leaf page was updated.  So, write generations it is.
	 *
	 * Nothing is locked at this point but two versions of a page with the
	 * same generation is pretty unlikely, and if we did, they're going to
	 * be roughly identical for the purposes of salvage, anyway.
	 */
	dsk->write_gen = ++btree->write_gen;

	/*
	 * Checksum the data if the buffer isn't compressed or checksums are
	 * configured.
	 */
	switch (btree->checksum) {
	case CKSUM_ON:
		data_cksum = 1;
		break;
	case CKSUM_OFF:
		data_cksum = 0;
		break;
	case CKSUM_UNCOMPRESSED:
	default:
		data_cksum = !compressed;
		break;
	}

	/* Call the block manager to write the block. */
	WT_ERR(checkpoint ?
	    bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) :
	    bm->write(bm, session, ip, addr, addr_size, data_cksum));

	WT_CSTAT_INCR(session, cache_write);
	WT_DSTAT_INCR(session, cache_write);
	WT_CSTAT_INCRV(session, cache_bytes_write, ip->size);
	WT_DSTAT_INCRV(session, cache_bytes_write, ip->size);

err:	__wt_scr_free(&tmp);
	return (ret);
}
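The compression path keeps the uncompressed image whenever the engine reports failure or the result isn't smaller. A self-contained sketch of that decision using zlib's compressBound()/compress2() as stand-ins for the pluggable WT_COMPRESSOR interface, with header skipping omitted to keep it short.

#include <stdlib.h>
#include <zlib.h>

/*
 * maybe_compress --
 *	Compress src into a freshly allocated buffer; return 1 (without
 *	setting the output) if the result is not smaller than the original,
 *	so the caller writes the uncompressed image instead.
 */
static int
maybe_compress(const void *src, size_t src_len, void **dstp, size_t *dst_lenp)
{
	Bytef *dst;
	uLongf dst_len;

	dst_len = compressBound((uLong)src_len);
	if ((dst = malloc(dst_len)) == NULL)
		return (-1);

	if (compress2(dst, &dst_len, (const Bytef *)src,
	    (uLong)src_len, Z_DEFAULT_COMPRESSION) != Z_OK ||
	    dst_len >= src_len) {
		free(dst);		/* Not worth it: use the original. */
		return (1);
	}

	*dstp = dst;
	*dst_lenp = (size_t)dst_len;
	return (0);
}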
Example #16
/*
 * __wt_bm_read --
 *	Map or read address cookie referenced block into a buffer.
 */
int
__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
{
	WT_BLOCK *block;
	off_t offset;
	uint32_t size, cksum;
	int mapped;

	WT_UNUSED(addr_size);
	block = bm->block;

	/* Crack the cookie. */
	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));

	/*
	 * Clear buffers previously used for mapped memory, we may be forced
	 * to read into this buffer.
	 */
	if (F_ISSET(buf, WT_ITEM_MAPPED))
		__wt_buf_free(session, buf);

	/*
	 * If we're going to be able to return mapped memory and the buffer
	 * has allocated memory, discard it.
	 */
	mapped = bm->map != NULL && offset + size <= (off_t)bm->maplen;
	if (buf->mem != NULL && mapped)
		__wt_buf_free(session, buf);

	/* Map the block if it's possible. */
	if (mapped) {
		buf->mem = (uint8_t *)bm->map + offset;
		buf->memsize = size;
		buf->data = buf->mem;
		buf->size = size;
		F_SET(buf, WT_ITEM_MAPPED);

		WT_RET(__wt_mmap_preload(session, buf->mem, buf->size));

		WT_CSTAT_INCR(session, block_map_read);
		WT_CSTAT_INCRV(session, block_byte_map_read, size);
		return (0);
	}

	/* Read the block. */
	WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum));

#ifdef HAVE_POSIX_FADVISE
	/* Optionally discard blocks from the system's buffer cache. */
	if (block->os_cache_max != 0 &&
	    (block->os_cache += size) > block->os_cache_max) {
		WT_DECL_RET;

		block->os_cache = 0;
		if ((ret = posix_fadvise(block->fh->fd,
		    (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif
	return (0);
}
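The posix_fadvise() block implements a simple policy: count the bytes read and, once a configured limit is crossed, ask the kernel to drop the file from its buffer cache. A standalone sketch of the same policy; the struct is illustrative, mirroring block->os_cache and block->os_cache_max.

#define	_XOPEN_SOURCE	600
#include <fcntl.h>
#include <stddef.h>

/* Illustrative per-file state. */
struct file_cache_state {
	int	fd;
	size_t	bytes;		/* Bytes read since the last cache drop. */
	size_t	max;		/* Threshold; 0 disables the policy. */
};

/*
 * os_cache_track --
 *	Count I/O against the file and, once the threshold is crossed, ask
 *	the kernel to discard the file's pages from its buffer cache.
 */
static int
os_cache_track(struct file_cache_state *fcs, size_t len)
{
	if (fcs->max == 0 || (fcs->bytes += len) <= fcs->max)
		return (0);

	fcs->bytes = 0;
	/* An offset/length of 0 means "the whole file". */
	return (posix_fadvise(fcs->fd, 0, 0, POSIX_FADV_DONTNEED));
}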
Example #17
/*
 * __wt_block_write_off --
 *	Write a buffer into a block, returning the block's addr/size and
 * checksum.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    int data_cksum, int locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	off_t offset;
	uint32_t align_size;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	fh = block->fh;

	/* Buffers should be aligned for writing. */
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL,
		    "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Align the size to an allocation unit.
	 *
	 * The buffer must be big enough for us to zero to the next allocsize
	 * boundary; this is one of the reasons the btree layer must find out
	 * from the block-manager layer the maximum size of the eventual write.
	 */
	align_size = (uint32_t)WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer incorrectly allocated");
	}

	/* Zero out any unused bytes at the end of the buffer. */
	memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

	/*
	 * Set the disk size so we don't have to incrementally read blocks
	 * during salvage.
	 */
	blk->disk_size = align_size;

	/*
	 * Update the block's checksum: if our caller specifies, checksum the
	 * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
	 * bytes.  The assumption is applications with good compression support
	 * turn off checksums and assume corrupted blocks won't decompress
	 * correctly.  However, if compression failed to shrink the block, the
	 * block wasn't compressed, in which case our caller will tell us to
	 * checksum the data to detect corruption.  If compression succeeded,
	 * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
	 * because they're not compressed, both to give salvage a quick test
	 * of whether a block is useful and to give us a test so we don't lose
	 * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
	 */
	blk->flags = 0;
	if (data_cksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);
	blk->cksum = 0;
	blk->cksum = __wt_cksum(
	    buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);

	if (!locked)
		__wt_spin_lock(session, &block->live_lock);
	ret = __wt_block_alloc(session, block, &offset, (off_t)align_size);
	if (!locked)
		__wt_spin_unlock(session, &block->live_lock);
	WT_RET(ret);

#if defined(HAVE_POSIX_FALLOCATE) || defined(HAVE_FTRUNCATE)
	/*
	 * Extend the file in chunks.  We aren't holding a lock and we'd prefer
	 * to limit the number of threads extending the file at the same time,
	 * so choose the one thread that's crossing the extended boundary.  We
	 * don't extend newly created files, and it's theoretically possible we
	 * might wait so long that our extension of the file is passed by
	 * another thread writing single blocks; that's why there's a check in
	 * case the extended file size becomes too small: if the file size
	 * catches up, every thread will try to extend it.
	 */
	if (fh->extend_len != 0 &&
	    (fh->extend_size <= fh->size ||
	    (offset + fh->extend_len <= fh->extend_size &&
	    offset + fh->extend_len + align_size >= fh->extend_size))) {
		fh->extend_size = offset + fh->extend_len * 2;
#if defined(HAVE_POSIX_FALLOCATE)
		if ((ret =
		    posix_fallocate(fh->fd, offset, fh->extend_len * 2)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fallocate", fh->name);
#elif defined(HAVE_FTRUNCATE)
		if ((ret = ftruncate(fh->fd, fh->extend_size)) != 0)
			WT_RET_MSG(session, ret, "%s: ftruncate", fh->name);
#endif
	}
#endif
	if ((ret =
	    __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
		if (!locked)
			__wt_spin_lock(session, &block->live_lock);
		WT_TRET(
		    __wt_block_off_free(session, block, offset, align_size));
		if (!locked)
			__wt_spin_unlock(session, &block->live_lock);
		WT_RET(ret);
	}

#ifdef HAVE_SYNC_FILE_RANGE
	/*
	 * Optionally schedule writes for dirty pages in the system buffer
	 * cache.
	 */
	if (block->os_cache_dirty_max != 0 &&
	    (block->os_cache_dirty += align_size) > block->os_cache_dirty_max) {
		block->os_cache_dirty = 0;
		if ((ret = sync_file_range(fh->fd,
		    (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: sync_file_range", block->name);
	}
#endif
#ifdef HAVE_POSIX_FADVISE
	/* Optionally discard blocks from the system buffer cache. */
	if (block->os_cache_max != 0 &&
	    (block->os_cache += align_size) > block->os_cache_max) {
		block->os_cache = 0;
		if ((ret = posix_fadvise(fh->fd,
		    (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif
	WT_CSTAT_INCR(session, block_write);
	WT_CSTAT_INCRV(session, block_byte_write, align_size);

	WT_VERBOSE_RET(session, write,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, align_size, blk->cksum);

	*offsetp = offset;
	*sizep = align_size;
	*cksump = blk->cksum;

	return (ret);
}
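The file-extension logic is easier to see in isolation: pre-extend the file a chunk at a time so that only the thread crossing the boundary pays for the file-system metadata update. A simplified, self-contained sketch; the handle struct is illustrative and the boundary test is less precise than the code above.

#define	_XOPEN_SOURCE	600
#include <fcntl.h>
#include <sys/types.h>

/* Illustrative handle state, mirroring fh->size/extend_size/extend_len. */
struct file_state {
	int	fd;
	off_t	size;		/* Current file size. */
	off_t	extend_size;	/* Boundary we've already extended to. */
	off_t	extend_len;	/* Extension chunk size; 0 disables. */
};

/*
 * maybe_extend --
 *	If a write at "offset" reaches the previously extended boundary (or
 *	the file has caught up with it), pre-extend by another chunk so most
 *	writers never wait on metadata updates.  ftruncate() is the fallback
 *	the code above uses when posix_fallocate() isn't available.
 */
static int
maybe_extend(struct file_state *fs, off_t offset)
{
	if (fs->extend_len == 0 ||
	    (fs->extend_size > fs->size &&
	    offset + fs->extend_len < fs->extend_size))
		return (0);

	fs->extend_size = offset + fs->extend_len * 2;
	return (posix_fallocate(fs->fd, offset, fs->extend_len * 2));
}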
Example #18
/*
 * __wt_realloc_aligned --
 *	ANSI realloc function that aligns to buffer boundaries, configured with
 *	the "buffer_alignment" key to wiredtiger_open.
 */
int
__wt_realloc_aligned(WT_SESSION_IMPL *session,
    size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
{
#if defined(HAVE_POSIX_MEMALIGN)
	int ret;

	/*
	 * !!!
	 * This function MUST handle a NULL WT_SESSION_IMPL handle.
	 */
	if (session != NULL && S2C(session)->buffer_alignment > 0) {
		void *p, *newp;
		size_t bytes_allocated;

		WT_ASSERT(session, bytes_to_allocate != 0);

		/*
		 * Sometimes we're allocating memory and we don't care about the
		 * final length -- bytes_allocated_ret may be NULL.
		 */
		bytes_allocated = (bytes_allocated_ret == NULL) ?
		    0 : *bytes_allocated_ret;
		WT_ASSERT(session, bytes_allocated < bytes_to_allocate);

		p = *(void **)retp;

		WT_ASSERT(session, p == NULL || bytes_allocated != 0);

		if (p == NULL && session != NULL && S2C(session)->stats != NULL)
			WT_CSTAT_INCR(session, memalloc);

		if ((ret = posix_memalign(&newp,
		    S2C(session)->buffer_alignment,
		    bytes_to_allocate)) != 0)
			WT_RET_MSG(session, ret, "memory allocation");

		if (p != NULL)
			memcpy(newp, p, bytes_allocated);
		__wt_free(session, p);
		p = newp;

		/* Clear the allocated memory (see above). */
		memset((uint8_t *)p + bytes_allocated, 0,
		    bytes_to_allocate - bytes_allocated);

		/* Update caller's bytes allocated value. */
		if (bytes_allocated_ret != NULL)
			*bytes_allocated_ret = bytes_to_allocate;

		*(void **)retp = p;
		return (0);
	}
#endif
	/*
	 * If there is no posix_memalign function, or no alignment configured,
	 * fall back to realloc.
	 */
	return (__wt_realloc(
	    session, bytes_allocated_ret, bytes_to_allocate, retp));
}
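POSIX has no aligned realloc, which is why the function above allocates with posix_memalign(), copies the old contents, and frees the old block. A minimal sketch of that shape with hypothetical names, zeroing the new tail as __wt_realloc does; new_len must exceed old_len, and alignment must be a power-of-two multiple of sizeof(void *).

#include <stdlib.h>
#include <string.h>

/*
 * aligned_realloc --
 *	Emulate an aligned realloc: allocate an aligned buffer, copy the old
 *	contents, clear the added region, and free the old buffer.
 */
static int
aligned_realloc(void **ptrp, size_t old_len, size_t new_len, size_t alignment)
{
	void *newp;
	int ret;

	if ((ret = posix_memalign(&newp, alignment, new_len)) != 0)
		return (ret);

	if (*ptrp != NULL)
		memcpy(newp, *ptrp, old_len);
	memset((unsigned char *)newp + old_len, 0, new_len - old_len);

	free(*ptrp);
	*ptrp = newp;
	return (0);
}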
Example #19
/*
 * __wt_merge_tree --
 *	Attempt to collapse a stack of split-merge pages in memory into a
 *	shallow tree.  If enough keys are found, create a real internal node
 *	that can be evicted (and, if necessary, split further).
 *
 *	This code is designed to deal with workloads that otherwise create
 *	arbitrarily deep (and slow) trees in memory.
 */
int
__wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
{
	WT_DECL_RET;
	WT_PAGE *lchild, *newtop, *rchild;
	WT_REF *newref;
	WT_VISIT_STATE visit_state;
	uint32_t refcnt, split;
	int promote;
	u_int levels;
	uint8_t page_type;

	WT_CLEAR(visit_state);
	visit_state.session = session;
	lchild = newtop = rchild = NULL;
	page_type = top->type;

	WT_ASSERT(session, __wt_btree_mergeable(top));
	WT_ASSERT(session, top->ref->state == WT_REF_LOCKED);

	/*
	 * Walk the subtree, count the references at the bottom level and
	 * calculate the maximum depth.
	 */
	WT_RET(__merge_walk(session, top, 1, __merge_count, &visit_state));

	/* If there aren't enough useful levels, give up. */
	if (visit_state.maxdepth < WT_MERGE_STACK_MIN)
		return (EBUSY);

	/*
	 * Don't allow split merges to generate arbitrarily large pages.
	 * Ideally we would choose a size based on the internal_page_max
	 * setting for the btree, but we don't have the correct btree handle
	 * available.
	 */
	if (visit_state.refcnt > WT_MERGE_MAX_REFS)
		return (EBUSY);

	/*
	 * Now we either collapse the internal pages into one split-merge page,
	 * or if there are "enough" keys, we split into two equal internal
	 * pages, each of which can be evicted independently.
	 *
	 * We set a flag (WT_PM_REC_SPLIT_MERGE) on the created page if it
	 * isn't big enough to justify the cost of evicting it.  If splits
	 * continue, it will be merged again until it gets over this limit.
	 */
	promote = 0;
	refcnt = (uint32_t)visit_state.refcnt;
	if (refcnt >= WT_MERGE_FULL_PAGE && visit_state.seen_live) {
		/*
		 * In the normal case where there are live children spread
		 * through the subtree, create two child pages.
		 *
		 * Handle the case where the only live child is first / last
		 * specially: put the live child into the top-level page.
		 *
		 * Set SPLIT_MERGE on the internal pages if there are any live
		 * children: they can't be evicted, so there is no point
		 * permanently deepening the tree.
		 */
		if (visit_state.first_live == visit_state.last_live &&
		    (visit_state.first_live == 0 ||
		    visit_state.first_live == refcnt - 1))
			split = (visit_state.first_live == 0) ? 1 : refcnt - 1;
		else
			split = (refcnt + 1) / 2;

		/* Only promote if we can create a real page. */
		if (split == 1 || split == refcnt - 1)
			promote = 1;
		else if (split >= WT_MERGE_FULL_PAGE &&
		    visit_state.first_live >= split)
			promote = 1;
		else if (refcnt - split >= WT_MERGE_FULL_PAGE &&
		    visit_state.last_live < split)
			promote = 1;
	}

	if (promote) {
		/* Create a new top-level split-merge page with two entries. */
		WT_ERR(__merge_new_page(session, page_type, 2, 1, &newtop));

		visit_state.split = split;

		/* Left split. */
		if (split == 1)
			visit_state.first = newtop;
		else {
			WT_ERR(__merge_new_page(session, page_type, split,
			    visit_state.first_live < split, &lchild));
			visit_state.first = lchild;
		}

		/* Right split. */
		if (split == refcnt - 1) {
			visit_state.second = newtop;
			visit_state.second_ref = &newtop->u.intl.t[1];
		} else {
			WT_ERR(__merge_new_page(session, page_type,
			    refcnt - split, visit_state.last_live >= split,
			    &rchild));
			visit_state.second = rchild;
			visit_state.second_ref =
			    &visit_state.second->u.intl.t[0];
		}
	} else {
		/*
		 * Create a new split-merge page for small merges, or if the
		 * page above is a split merge page.  When we do a big enough
		 * merge, we create a real page at the top and don't consider
		 * it as a merge candidate again.  Over time with an insert
		 * workload the tree will grow deeper, but that's inevitable,
		 * and this keeps individual merges small.
		 */
		WT_ERR(__merge_new_page(session, page_type, refcnt,
		    refcnt < WT_MERGE_FULL_PAGE ||
		    __wt_btree_mergeable(top->parent),
		    &newtop));

		visit_state.first = newtop;
	}

	/*
	 * Copy the references into the new tree, but don't update anything in
	 * the locked tree in case there is an error and we need to back out.
	 * We do this in a separate pass so that we can figure out the key for
	 * the split point: that allocates memory and so it could still fail.
	 */
	visit_state.page = visit_state.first;
	visit_state.ref = visit_state.page->u.intl.t;
	visit_state.refcnt = 0;
	WT_ERR(__merge_walk(session, top, 0, __merge_copy_ref, &visit_state));

	if (promote) {
		/* Promote keys into the top-level page. */
		if (lchild != NULL) {
			newref = &newtop->u.intl.t[0];
			WT_LINK_PAGE(newtop, newref, lchild);
			newref->state = WT_REF_MEM;
			WT_ERR(__merge_promote_key(session, newref));
		}

		if (rchild != NULL) {
			newref = &newtop->u.intl.t[1];
			WT_LINK_PAGE(newtop, newref, rchild);
			newref->state = WT_REF_MEM;
			WT_ERR(__merge_promote_key(session, newref));
		}
	}

	/*
	 * We have copied everything into place and allocated all of the memory
	 * we need.  Now link all pages into the new tree and unlock them.
	 *
	 * The only way this could fail is if a reference state has been
	 * changed by another thread since they were locked.  Panic in that
	 * case: that should never happen.
	 */
	visit_state.page = visit_state.first;
	visit_state.ref = visit_state.page->u.intl.t;
	visit_state.refcnt = 0;
	ret = __merge_walk(session, top, 0, __merge_switch_page, &visit_state);

	if (ret != 0)
		WT_ERR(__wt_illegal_value(session, "__wt_merge_tree"));

	newtop->u.intl.recno = top->u.intl.recno;
	newtop->parent = top->parent;
	newtop->ref = top->ref;

#ifdef HAVE_DIAGNOSTIC
	/*
	 * Before swapping in the new tree, walk the pages we are discarding,
	 * check that everything looks right.
	 */
	__merge_check_discard(session, top);
#endif

	/*
	 * Set up the new top-level page as a split so that it will be swapped
	 * into place by our caller.
	 */
	top->modify->flags = WT_PM_REC_SPLIT;
	top->modify->u.split = newtop;

	WT_VERBOSE_ERR(session, evict,
	    "Successfully %s %" PRIu32
	    " split-merge pages containing %" PRIu32 " keys\n",
	    promote ? "promoted" : "merged", visit_state.maxdepth, refcnt);

	/* Evict new child pages as soon as possible. */
	if (lchild != NULL && !F_ISSET(lchild->modify, WT_PM_REC_SPLIT_MERGE))
		lchild->read_gen = WT_READ_GEN_OLDEST;
	if (rchild != NULL && !F_ISSET(rchild->modify, WT_PM_REC_SPLIT_MERGE))
		rchild->read_gen = WT_READ_GEN_OLDEST;

	/* Update statistics. */
	WT_CSTAT_INCR(session, cache_eviction_merge);
	WT_DSTAT_INCR(session, cache_eviction_merge);

	/* How many levels did we remove? */
	levels = visit_state.maxdepth - (promote ? 2 : 1);
	WT_CSTAT_INCRV(session, cache_eviction_merge_levels, levels);
	WT_DSTAT_INCRV(session, cache_eviction_merge_levels, levels);

	return (0);

err:	WT_VERBOSE_TRET(session, evict,
	    "Failed to merge %" PRIu32
	    " split-merge pages containing %" PRIu32 " keys\n",
	    visit_state.maxdepth, refcnt);

	WT_CSTAT_INCR(session, cache_eviction_merge_fail);
	WT_DSTAT_INCR(session, cache_eviction_merge_fail);

	if (newtop != NULL)
		__wt_page_out(session, &newtop);
	if (lchild != NULL)
		__wt_page_out(session, &lchild);
	if (rchild != NULL)
		__wt_page_out(session, &rchild);
	return (ret);
}
Example #20
/*
 * __wt_block_read_off --
 *	Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE_HEADER *dsk;
	size_t result_len;
	uint32_t page_cksum;

	WT_VERBOSE_RET(session, read,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
	/*
	 * In diagnostic mode, verify the block we're about to read isn't on
	 * either the available or discard lists.
	 *
	 * Don't check during salvage, it's possible we're reading an already
	 * freed overflow page.
	 */
	if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
		WT_RET(
		    __wt_block_misplaced(session, block, "read", offset, size));
#endif

	/*
	 * If we're compressing the file blocks, place the initial read into a
	 * scratch buffer, since we're going to have to re-allocate more memory
	 * for decompression.  Otherwise, check the caller's buffer size and
	 * grow it as necessary; there will only be one buffer.
	 */
	if (block->compressor == NULL) {
		F_SET(buf, WT_ITEM_ALIGNED);
		WT_RET(__wt_buf_init(session, buf, size));
		buf->size = size;
		dsk = buf->mem;
	} else {
		WT_RET(__wt_scr_alloc(session, size, &tmp));
		tmp->size = size;
		dsk = tmp->mem;
	}

	/* Read. */
	WT_ERR(__wt_read(session, block->fh, offset, size, dsk));
	blk = WT_BLOCK_HEADER_REF(dsk);

	/* Validate the checksum. */
	if (block->checksum &&
	    cksum != WT_BLOCK_CHECKSUM_NOT_SET &&
	    blk->cksum != WT_BLOCK_CHECKSUM_NOT_SET) {
		blk->cksum = 0;
		page_cksum = __wt_cksum(dsk, size);
		if (page_cksum == WT_BLOCK_CHECKSUM_NOT_SET)
			++page_cksum;
		if (cksum != page_cksum) {
			if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
				__wt_errx(session,
				    "read checksum error [%"
				    PRIu32 "B @ %" PRIuMAX ", %"
				    PRIu32 " != %" PRIu32 "]",
				    size, (uintmax_t)offset, cksum, page_cksum);
			WT_ERR(WT_ERROR);
		}
	}

	/*
	 * If the in-memory block size is larger than the on-disk block size,
	 * the block is compressed.  Size the user's buffer, copy the skipped
	 * bytes of the original image into place, then decompress.
	 *
	 * If the in-memory block size is less than or equal to the on-disk
	 * block size, the block is not compressed.
	 */
	if (blk->disk_size < dsk->size) {
		if (block->compressor == NULL)
			WT_ERR(__wt_illegal_value(session, block->name));

		WT_ERR(__wt_buf_init(session, buf, dsk->size));
		buf->size = dsk->size;

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 */
		memcpy(buf->mem, tmp->mem, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(block->compressor->decompress(
		    block->compressor, &session->iface,
		    (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->size - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));
		if (result_len != dsk->size - WT_BLOCK_COMPRESS_SKIP)
			WT_ERR(__wt_illegal_value(session, block->name));
	} else
		if (block->compressor == NULL)
			buf->size = dsk->size;
		else
			/*
			 * We guessed wrong: there was a compressor, but this
			 * block was not compressed, and now the page is in the
			 * wrong buffer and the buffer may be of the wrong size.
			 * This should be rare: why configure a compressor that
			 * doesn't work?  Allocate a buffer of the right size
			 * (we used a scratch buffer which might be large), and
			 * copy the data into place.
			 */
			WT_ERR(
			    __wt_buf_set(session, buf, tmp->data, dsk->size));

	WT_BSTAT_INCR(session, page_read);
	WT_CSTAT_INCR(session, block_read);

err:	__wt_scr_free(&tmp);
	return (ret);
}