Example #1
/*
 * __cursor_key_order_check_col --
 *	Check key ordering for column-store cursor movements.
 */
static int
__cursor_key_order_check_col(
    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next)
{
	int cmp;

	cmp = 0;			/* -Werror=maybe-uninitialized */

	if (cbt->lastrecno != WT_RECNO_OOB) {
		if (cbt->lastrecno < cbt->recno)
			cmp = -1;
		if (cbt->lastrecno > cbt->recno)
			cmp = 1;
	}

	if (cbt->lastrecno == WT_RECNO_OOB ||
	    (next && cmp < 0) || (!next && cmp > 0)) {
		cbt->lastrecno = cbt->recno;
		return (0);
	}

	WT_PANIC_RET(session, EINVAL,
	    "WT_CURSOR.%s out-of-order returns: returned key %" PRIu64 " then "
	    "key %" PRIu64,
	    next ? "next" : "prev", cbt->lastrecno, cbt->recno);
}
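The check reduces to a simple invariant: on a "next" movement the new record
number must be strictly greater than the last one returned, and strictly
smaller on a "prev" movement. Below is a minimal standalone sketch of that
invariant, using plain C types rather than WiredTiger's cursor structures
(recno_t, RECNO_OOB and check_recno_order are hypothetical names, for
illustration only):

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t recno_t;

#define	RECNO_OOB	0	/* No record returned yet; nothing to check. */

/*
 * check_recno_order --
 *	Return true if moving from the last record number returned to the
 *	current one is consistent with the direction of travel.
 */
static bool
check_recno_order(recno_t last, recno_t current, bool next)
{
	if (last == RECNO_OOB)
		return (true);
	return (next ? last < current : last > current);
}

Equal record numbers fail in either direction, matching the cmp == 0 case
above, which also panics.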
Example #2
/*
 * __wt_lsm_tree_readunlock --
 *	Release a shared lock on an LSM tree.
 */
int
__wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;

	F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);

	if ((ret = __wt_readunlock(session, lsm_tree->rwlock)) != 0)
		WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
	return (0);
}
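The pattern worth noting is that a failed unlock is treated as
unrecoverable: the lock's state is unknown afterward, so the code panics
rather than returning an error the caller might ignore. A sketch of the
same pattern in plain pthreads terms (fatal_error is a hypothetical
stand-in for WT_PANIC_RET, not a WiredTiger function):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for WT_PANIC_RET: report, then abort. */
static void
fatal_error(int err, const char *msg)
{
	fprintf(stderr, "PANIC: %s: error %d\n", msg, err);
	abort();
}

static void
tree_read_unlock(pthread_rwlock_t *lock)
{
	int ret;

	/* A failed unlock leaves the lock in an unknown state; don't continue. */
	if ((ret = pthread_rwlock_unlock(lock)) != 0)
		fatal_error(ret, "Unlocking an LSM tree");
}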
Example #3
/*
 * __wt_hazard_clear --
 *	Clear a hazard pointer.
 */
int
__wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BTREE *btree;
	WT_HAZARD *hp;

	btree = S2BT(session);

	/* If a file can never be evicted, hazard pointers aren't required. */
	if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
		return (0);

	/*
	 * Clear the caller's hazard pointer.
	 * The common pattern is LIFO, so do a reverse search.
	 */
	for (hp = session->hazard + session->hazard_size - 1;
	    hp >= session->hazard;
	    --hp)
		if (hp->page == page) {
			/*
			 * We don't publish the hazard pointer clear in the
			 * general case.  It's not required for correctness;
			 * it would give an eviction thread faster access to
			 * the page if the page were selected for eviction,
			 * but the generation number was just set, so it's
			 * unlikely the page will be selected for eviction.
			 */
			hp->page = NULL;

			/*
			 * If this was the last hazard pointer in the session,
			 * reset the size so that checks can skip this session.
			 */
			if (--session->nhazard == 0)
				WT_PUBLISH(session->hazard_size, 0);
			return (0);
		}

	/*
	 * A serious error, we should always find the hazard pointer.  Panic,
	 * because using a page we didn't have pinned down implies corruption.
	 */
	WT_PANIC_RET(session, EINVAL,
	    "session %p: clear hazard pointer: %p: not found",
	    (void *)session, (void *)page);
}
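Because hazard pointers are typically acquired and released in LIFO order,
scanning the array from the end finds the target slot on the first probe in
the common case. A simplified sketch of that reverse scan (hazard_t and
hazard_clear are illustrative names, not WiredTiger's actual layout):

#include <stdbool.h>
#include <stddef.h>

typedef struct {
	void *page;	/* Pinned page, or NULL for a free slot. */
} hazard_t;

/*
 * hazard_clear --
 *	Clear the slot pinning "page", scanning newest-first to match the
 *	common LIFO acquire/release pattern; return false if not found,
 *	which the caller should treat as corruption.
 */
static bool
hazard_clear(hazard_t *hazard, size_t size, void *page)
{
	size_t i;

	for (i = size; i > 0; --i)
		if (hazard[i - 1].page == page) {
			hazard[i - 1].page = NULL;
			return (true);
		}
	return (false);
}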
Example #4
/*
 * __wt_lsm_tree_switch --
 *	Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	uint32_t nchunks, new_id;
	int first_switch;

	WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));

	nchunks = lsm_tree->nchunks;

	first_switch = nchunks == 0 ? 1 : 0;
	/*
	 * Check if a switch is still needed: we may have raced while waiting
	 * for a lock.
	 */
	chunk = NULL;
	if (!first_switch &&
	    (chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
		goto err;

	/* Set the switch transaction in the previous chunk, if necessary. */
	if (chunk != NULL && chunk->switch_txn == WT_TXN_NONE)
		chunk->switch_txn = __wt_txn_new_id(session);

	/* Update the throttle time. */
	__wt_lsm_tree_throttle(session, lsm_tree, 0);

	new_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);

	WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
	    nchunks + 1, &lsm_tree->chunk));

	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
	    "Tree %s switch to: %" PRIu32 ", checkpoint throttle %ld, "
	    "merge throttle %ld", lsm_tree->name,
	    new_id, lsm_tree->ckpt_throttle, lsm_tree->merge_throttle));

	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = new_id;
	chunk->switch_txn = WT_TXN_NONE;
	lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
	F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
	++lsm_tree->dsk_gen;

	lsm_tree->modified = 1;

err:	WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	/*
	 * Errors that happen during a tree switch leave the tree in a state
	 * where we can't make progress. Error out of WiredTiger.
	 */
	if (ret != 0)
		WT_PANIC_RET(session, ret, "Failed doing LSM switch");
	else if (!first_switch)
		WT_RET(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
	return (ret);
}
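The top of the function is the classic re-check-under-lock pattern: the
decision to switch was made without the lock held, so it has to be
re-validated once the lock is acquired. A minimal sketch of the pattern
using a pthread mutex and a hypothetical generation counter (none of these
names are WiredTiger's):

#include <pthread.h>

struct tree {
	pthread_mutex_t lock;
	unsigned long generation;	/* Bumped on every switch. */
};

/*
 * tree_switch --
 *	Switch, unless another thread already did the work while we waited
 *	for the lock: the caller observed seen_generation without the lock
 *	held, so the decision must be re-validated here.
 */
static void
tree_switch(struct tree *t, unsigned long seen_generation)
{
	(void)pthread_mutex_lock(&t->lock);
	if (t->generation == seen_generation) {
		/* ... allocate and install the new in-memory chunk ... */
		++t->generation;
	}
	(void)pthread_mutex_unlock(&t->lock);
}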
Example #5
/*
 * __wt_block_read_off --
 *	Read the block referenced by an addr/size pair into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t checksum)
{
	WT_BLOCK_HEADER *blk, swap;
	size_t bufsize;
	uint32_t page_checksum;

	__wt_verbose(session, WT_VERB_READ,
	    "off %" PRIuMAX ", size %" PRIu32 ", checksum %#" PRIx32,
	    (uintmax_t)offset, size, checksum);

	WT_STAT_CONN_INCR(session, block_read);
	WT_STAT_CONN_INCRV(session, block_byte_read, size);

	/*
	 * Grow the buffer as necessary and read the block.  Buffers should be
	 * aligned for reading, but there are lots of buffers (for example, file
	 * cursors have two buffers each, key and value), and it's difficult to
	 * be sure we've found all of them.  If the buffer isn't aligned, it's
	 * an easy fix: set the flag and guarantee we reallocate it.  (Most of
	 * the time on reads, the buffer memory has not yet been allocated, so
	 * we're not adding any additional processing time.)
	 */
	if (F_ISSET(buf, WT_ITEM_ALIGNED))
		bufsize = size;
	else {
		F_SET(buf, WT_ITEM_ALIGNED);
		bufsize = WT_MAX(size, buf->memsize + 10);
	}
	WT_RET(__wt_buf_init(session, buf, bufsize));
	WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
	buf->size = size;

	/*
	 * We incrementally read through the structure before doing a checksum,
	 * do little- to big-endian handling early on, and then select from the
	 * original or swapped structure as needed.
	 */
	blk = WT_BLOCK_HEADER_REF(buf->mem);
	__wt_block_header_byteswap_copy(blk, &swap);
	if (swap.checksum == checksum) {
		blk->checksum = 0;
		page_checksum = __wt_checksum(buf->mem,
		    F_ISSET(&swap, WT_BLOCK_DATA_CKSUM) ?
		    size : WT_BLOCK_COMPRESS_SKIP);
		if (page_checksum == checksum) {
			/*
			 * Swap the page-header as needed; this doesn't belong
			 * here, but it's the best place to catch all callers.
			 */
			__wt_page_header_byteswap(buf->mem);
			return (0);
		}

		if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
			__wt_errx(session,
			    "%s: read checksum error for %" PRIu32 "B block at "
			    "offset %" PRIuMAX ": calculated block checksum "
			    "of %#" PRIx32 " doesn't match expected checksum "
			    "of %#" PRIx32,
			    block->name,
			    size, (uintmax_t)offset, page_checksum, checksum);
	} else
		if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
			__wt_errx(session,
			    "%s: read checksum error for %" PRIu32 "B block at "
			    "offset %" PRIuMAX ": block header checksum "
			    "of %#" PRIx32 " doesn't match expected checksum "
			    "of %#" PRIx32,
			    block->name,
			    size, (uintmax_t)offset, swap.checksum, checksum);

	if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
		WT_IGNORE_RET(
		    __wt_bm_corrupt_dump(session, buf, offset, size, checksum));

	/* Panic if a checksum fails during an ordinary read. */
	F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
	if (block->verify || F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
		return (WT_ERROR);
	WT_PANIC_RET(session, WT_ERROR, "%s: fatal read error", block->name);
}
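The verification is two-staged: first the checksum stored in the block
header is compared against the expected value from the address cookie, then
the header field is zeroed and the checksum recomputed over the buffer,
because the on-disk value was computed with that field zeroed. A
self-contained sketch of the recompute step (the header layout and
checksum32 are placeholders; WiredTiger's __wt_checksum is a CRC32C):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical on-disk header; the real WT_BLOCK_HEADER differs. */
struct block_header {
	uint32_t checksum;
};

/* Stand-in checksum for the sketch, not WiredTiger's CRC32C. */
static uint32_t
checksum32(const void *p, size_t len)
{
	const uint8_t *b;
	uint32_t sum;

	for (b = p, sum = 0; len > 0; --len)
		sum = sum * 31 + *b++;
	return (sum);
}

/*
 * block_verify --
 *	Check the stored header checksum, then zero the field and recompute
 *	over the buffer, mirroring the two-stage check above.
 */
static bool
block_verify(void *buf, size_t size, uint32_t expected)
{
	struct block_header *blk;

	blk = buf;
	if (blk->checksum != expected)
		return (false);
	blk->checksum = 0;
	return (checksum32(buf, size) == expected);
}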
Example #6
/*
 * __wt_meta_track_off --
 *	Turn off metadata operation tracking, unrolling on error.
 */
int
__wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll)
{
	WT_DECL_RET;
	WT_META_TRACK *trk, *trk_orig;
	WT_SESSION_IMPL *ckpt_session;
	int saved_ret;
	bool did_drop;

	saved_ret = 0;

	WT_ASSERT(session,
	    WT_META_TRACKING(session) && session->meta_track_nest > 0);

	trk_orig = session->meta_track;
	trk = session->meta_track_next;

	/* If it was a nested transaction, there is nothing to do. */
	if (--session->meta_track_nest != 0)
		return (0);

	/* Turn off tracking for unroll. */
	session->meta_track_next = session->meta_track_sub = NULL;

	/*
	 * If there were no operations logged, skip unnecessary metadata
	 * checkpoints.  For example, this happens if attempting to create a
	 * data source that already exists (or drop one that doesn't).
	 */
	if (trk == trk_orig)
		goto err;

	/* Unrolling doesn't require syncing the metadata. */
	if (unroll)
		goto err;

	if (F_ISSET(session, WT_SESSION_SCHEMA_TXN)) {
		F_CLR(session, WT_SESSION_SCHEMA_TXN);
#ifdef WT_ENABLE_SCHEMA_TXN
		WT_ERR(__wt_txn_commit(session, NULL));
		__wt_errx(session, "TRACK: Commit internal schema txn");
#endif
	}

	/*
	 * If we don't have the metadata cursor (e.g., we're in the process of
	 * creating the metadata), we can't sync it.
	 */
	if (!need_sync || session->meta_cursor == NULL ||
	    F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
		goto err;

	/* If we're logging, make sure the metadata update was flushed. */
	if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED))
		WT_WITH_DHANDLE(session,
		    WT_SESSION_META_DHANDLE(session),
		    ret = __wt_txn_checkpoint_log(
		    session, false, WT_TXN_LOG_CKPT_SYNC, NULL));
	else {
		WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
		ckpt_session = S2C(session)->meta_ckpt_session;
		/*
		 * If this operation is part of a running transaction, that
		 * should be included in the checkpoint.
		 */
		ckpt_session->txn.id = session->txn.id;
		WT_ASSERT(session,
		    !F_ISSET(session, WT_SESSION_LOCKED_METADATA));
		WT_WITH_DHANDLE(ckpt_session, WT_SESSION_META_DHANDLE(session),
		    WT_WITH_METADATA_LOCK(ckpt_session,
			ret = __wt_checkpoint(ckpt_session, NULL)));
		ckpt_session->txn.id = WT_TXN_NONE;
		if (ret == 0)
			WT_WITH_DHANDLE(session,
			    WT_SESSION_META_DHANDLE(session),
			    ret = __wt_checkpoint_sync(session, NULL));
	}

err:	/*
	 * Undo any tracked operations on failure.
	 * Apply any tracked operations post-commit.
	 */
	did_drop = false;
	if (unroll || ret != 0) {
		saved_ret = ret;
		ret = 0;
		while (--trk >= trk_orig) {
			did_drop = did_drop || trk->op == WT_ST_DROP_COMMIT;
			WT_TRET(__meta_track_unroll(session, trk));
		}
	} else
		for (; trk_orig < trk; trk_orig++) {
			did_drop = did_drop ||
			    trk_orig->op == WT_ST_DROP_COMMIT;
			WT_TRET(__meta_track_apply(session, trk_orig));
		}

	if (F_ISSET(session, WT_SESSION_SCHEMA_TXN)) {
		F_CLR(session, WT_SESSION_SCHEMA_TXN);
		/*
		 * We should have committed above unless we're unrolling,
		 * there was an error, or the operation was a noop.
		 */
		WT_ASSERT(session, unroll || saved_ret != 0 ||
		    session->txn.mod_count == 0);
#ifdef WT_ENABLE_SCHEMA_TXN
		__wt_err(session, saved_ret,
		    "TRACK: Abort internal schema txn");
		WT_TRET(__wt_txn_rollback(session, NULL));
#endif
	}

	/*
	 * Wake up the sweep thread: particularly for the in-memory
	 * storage engine, we want to reclaim space immediately.
	 */
	if (did_drop && S2C(session)->sweep_cond != NULL)
		__wt_cond_signal(session, S2C(session)->sweep_cond);

	if (ret != 0)
		WT_PANIC_RET(session, ret,
		    "failed to apply or unroll all tracked operations");
	return (saved_ret == 0 ? 0 : saved_ret);
}
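The error-handling tail implements a commit/rollback discipline over the
tracked operations: apply them oldest-first on success, undo them
newest-first on failure, and keep going past individual errors while
preserving the first error seen, which is what WT_TRET does. A standalone
sketch of that loop pair (op_t and its function pointers are hypothetical):

#include <stdbool.h>

/* A tracked operation; "what" stands in for real operation state. */
typedef struct {
	int what;
	int (*apply)(int);	/* Redo the operation post-commit. */
	int (*unroll)(int);	/* Undo the operation on failure. */
} op_t;

/*
 * track_off --
 *	On success apply tracked operations oldest-first; on failure undo
 *	them newest-first.  Keep the first error but process every entry.
 */
static int
track_off(op_t *base, op_t *next, bool failed)
{
	op_t *trk;
	int ret, tret;

	ret = 0;
	if (failed)
		for (trk = next; trk > base;) {
			--trk;
			if ((tret = trk->unroll(trk->what)) != 0 && ret == 0)
				ret = tret;
		}
	else
		for (trk = base; trk < next; ++trk)
			if ((tret = trk->apply(trk->what)) != 0 && ret == 0)
				ret = tret;
	return (ret);
}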
Example #7
/*
 * __thread_group_resize --
 *	Resize an array of utility threads; the caller must hold the lock.
 */
static int
__thread_group_resize(
    WT_SESSION_IMPL *session, WT_THREAD_GROUP *group,
    uint32_t new_min, uint32_t new_max, uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_THREAD *thread;
	size_t alloc;
	uint32_t i, session_flags;

	conn = S2C(session);
	session_flags = 0;

	WT_ASSERT(session,
	    group->current_threads <= group->alloc &&
	    __wt_rwlock_islocked(session, group->lock));

	if (new_min == group->min && new_max == group->max)
		return (0);

	/*
	 * Call shrink to reduce the number of thread structures and running
	 * threads if required by the change in group size.
	 */
	WT_RET(__thread_group_shrink(session, group, new_max));

	/*
	 * Only reallocate the thread array if it is the largest ever, since
	 * our realloc doesn't support shrinking the allocated size.
	 */
	if (group->alloc < new_max) {
		alloc = group->alloc * sizeof(*group->threads);
		WT_RET(__wt_realloc(session, &alloc,
		    new_max * sizeof(*group->threads), &group->threads));
		group->alloc = new_max;
	}

	/*
	 * Initialize the structures based on the previous group size, not
	 * the previous allocated size.
	 */
	for (i = group->max; i < new_max; i++) {
		WT_ERR(__wt_calloc_one(session, &thread));
		/*
		 * Threads get their own session and lookaside table cursor
		 * if the lookaside table is open. Note that threads are
		 * started during recovery, before the lookaside table is
		 * created.
		 */
		if (LF_ISSET(WT_THREAD_CAN_WAIT))
			session_flags = WT_SESSION_CAN_WAIT;
		if (F_ISSET(conn, WT_CONN_LAS_OPEN))
			FLD_SET(session_flags, WT_SESSION_LOOKASIDE_CURSOR);
		WT_ERR(__wt_open_internal_session(conn, group->name,
		    false, session_flags, &thread->session));
		if (LF_ISSET(WT_THREAD_PANIC_FAIL))
			F_SET(thread, WT_THREAD_PANIC_FAIL);
		thread->id = i;
		thread->run_func = group->run_func;
		WT_ASSERT(session, group->threads[i] == NULL);
		group->threads[i] = thread;
	}

	if (group->current_threads < new_min)
		WT_ERR(__thread_group_grow(session, group, new_min));

err:	/*
	 * Update the thread group information even on failure to improve our
	 * chances of cleaning up properly.
	 */
	group->max = new_max;
	group->min = new_min;

	/*
	 * An error resizing a thread array is fatal; it should only happen
	 * in an out-of-memory situation.
	 */
	if (ret != 0) {
		WT_TRET(__wt_thread_group_destroy(session, group));
		WT_PANIC_RET(session, ret, "Error while resizing thread group");
	}
	return (ret);
}
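The reallocation step relies on a grow-only array: the allocated size is
tracked separately from the logical min/max and only ever increases, which
is why the code can assert that slots above the old maximum are NULL. A
sketch of that pattern with plain realloc (struct group here is an
illustration, not WiredTiger's WT_THREAD_GROUP):

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct group {
	void **threads;		/* One slot per thread structure. */
	uint32_t alloc;		/* High-water slot count ever allocated. */
};

/*
 * group_reserve --
 *	Grow the thread array to at least new_max slots; never shrink.
 *	Newly exposed slots are zeroed, so unused slots are always NULL.
 */
static int
group_reserve(struct group *g, uint32_t new_max)
{
	void **p;

	if (g->alloc >= new_max)
		return (0);
	if ((p = realloc(g->threads, new_max * sizeof(*p))) == NULL)
		return (ENOMEM);
	memset(p + g->alloc, 0, (new_max - g->alloc) * sizeof(*p));
	g->threads = p;
	g->alloc = new_max;
	return (0);
}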