Example #1
File: rec_track.c Project: Andiry/mongo
/*
 * __wt_ovfl_txnc_add --
 *	Add a new entry to the page's list of transaction-cached overflow
 * records.
 */
int
__wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page,
    const uint8_t *addr, size_t addr_size,
    const void *value, size_t value_size)
{
	WT_OVFL_TXNC **head, **stack[WT_SKIP_MAXDEPTH], *txnc;
	size_t size;
	u_int i, skipdepth;
	uint8_t *p;

	/* Lazily create the page's overflow-tracking structure. */
	if (page->modify->ovfl_track == NULL)
		WT_RET(__ovfl_track_init(session, page));

	head = page->modify->ovfl_track->ovfl_txnc;

	/* Choose a skiplist depth for this insert. */
	skipdepth = __wt_skip_choose_depth(session);

	/*
	 * Allocate the WT_OVFL_TXNC structure, next pointers for the skip
	 * list, room for the address and value, then copy everything into
	 * place.
	 *
	 * To minimize the WT_OVFL_TXNC structure size, the address offset
	 * and size are single bytes: that's safe because the address follows
	 * the structure (which can't be more than about 100B), and address
	 * cookies are limited to 255B.
	 */
	size = sizeof(WT_OVFL_TXNC) +
	    skipdepth * sizeof(WT_OVFL_TXNC *) + addr_size + value_size;
	WT_RET(__wt_calloc(session, 1, size, &txnc));
	p = (uint8_t *)txnc +
	    sizeof(WT_OVFL_TXNC) + skipdepth * sizeof(WT_OVFL_TXNC *);
	txnc->addr_offset = (uint8_t)WT_PTRDIFF(p, txnc);
	txnc->addr_size = (uint8_t)addr_size;
	memcpy(p, addr, addr_size);
	p += addr_size;
	txnc->value_offset = WT_PTRDIFF32(p, txnc);
	txnc->value_size = WT_STORE_SIZE(value_size);
	memcpy(p, value, value_size);
	/* Stamp the entry with a newly allocated transaction ID. */
	txnc->current = __wt_txn_new_id(session);

	__wt_cache_page_inmem_incr(
	    session, page, WT_OVFL_SIZE(txnc, WT_OVFL_TXNC));

	/* Insert the new entry into the skiplist. */
	__ovfl_txnc_skip_search_stack(head, stack, addr, addr_size);
	for (i = 0; i < skipdepth; ++i) {
		txnc->next[i] = *stack[i];
		*stack[i] = txnc;
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
		WT_RET(__ovfl_txnc_verbose(session, page, txnc, "add"));

	return (0);
}
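
The comment inside __wt_ovfl_txnc_add describes a compact single-allocation layout: the structure, the variable-depth skiplist pointers, the address, and the value all live in one calloc'd block, with byte offsets locating the trailing data. A minimal standalone sketch of the same pattern, where the PACKED_REC type and rec_pack() helper are invented for illustration and are not WiredTiger code:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/*
 * Hypothetical packed record: a fixed header followed, in the same
 * allocation, by the key bytes and then the value bytes.
 */
typedef struct {
	uint8_t key_offset;	/* Byte offset of the key from the header */
	uint8_t key_size;
	uint32_t val_offset;	/* Byte offset of the value from the header */
	uint32_t val_size;
} PACKED_REC;

#define	REC_KEY(r)	((uint8_t *)(r) + (r)->key_offset)
#define	REC_VAL(r)	((uint8_t *)(r) + (r)->val_offset)

static PACKED_REC *
rec_pack(const void *key, uint8_t key_size,
    const void *val, uint32_t val_size)
{
	PACKED_REC *r;
	uint8_t *p;

	/* One allocation holds the header and both byte strings. */
	if ((r = calloc(1, sizeof(PACKED_REC) + key_size + val_size)) == NULL)
		return (NULL);
	p = (uint8_t *)r + sizeof(PACKED_REC);
	r->key_offset = (uint8_t)(p - (uint8_t *)r);
	r->key_size = key_size;
	memcpy(p, key, key_size);
	p += key_size;
	r->val_offset = (uint32_t)(p - (uint8_t *)r);
	r->val_size = val_size;
	memcpy(p, val, val_size);

	/* A single free(r) releases the header and both copies. */
	return (r);
}

The skiplist splice at the end of the example operates on the same block: __ovfl_txnc_skip_search_stack records the address of each level's next pointer in stack[], and the loop then links the new node in one level at a time.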
Example #2
/*
 * __wt_lsm_tree_switch --
 *	Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	uint32_t nchunks, new_id;
	int first_switch;

	WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));

	nchunks = lsm_tree->nchunks;

	first_switch = nchunks == 0 ? 1 : 0;
	/*
	 * Check if a switch is still needed: we may have raced while waiting
	 * for a lock.
	 */
	chunk = NULL;
	if (!first_switch &&
	    (chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
		goto err;

	/* Set the switch transaction in the previous chunk, if necessary. */
	if (chunk != NULL && chunk->switch_txn == WT_TXN_NONE)
		chunk->switch_txn = __wt_txn_new_id(session);

	/* Update the throttle time. */
	__wt_lsm_tree_throttle(session, lsm_tree, 0);

	/* Atomically allocate the new chunk's ID. */
	new_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);

	WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
	    nchunks + 1, &lsm_tree->chunk));

	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
	    "Tree %s switch to: %" PRIu32 ", checkpoint throttle %ld, "
	    "merge throttle %ld", lsm_tree->name,
	    new_id, lsm_tree->ckpt_throttle, lsm_tree->merge_throttle));

	/* Allocate and install the new in-memory chunk. */
	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = new_id;
	chunk->switch_txn = WT_TXN_NONE;
	lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
	F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
	++lsm_tree->dsk_gen;

	lsm_tree->modified = 1;

err:	WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	/*
	 * Errors that happen during a tree switch leave the tree in a state
	 * where we can't make progress. Error out of WiredTiger.
	 */
	if (ret != 0)
		WT_PANIC_RET(session, ret, "Failed doing LSM switch");
	else if (!first_switch)
		WT_RET(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
	return (ret);
}
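
__wt_lsm_tree_switch is shaped around a check-under-lock idiom: take the write lock, re-test whether the switch is still needed (another thread may have completed it while we waited), and funnel every exit through the err: label so the lock is always released. A minimal sketch of that control flow using POSIX locks, where the tree_state type and switch_needed flag are invented for illustration:

#include <pthread.h>

/* Hypothetical tree state guarded by a reader/writer lock. */
struct tree_state {
	pthread_rwlock_t lock;
	int switch_needed;
};

static int
tree_switch(struct tree_state *t)
{
	int ret;

	if ((ret = pthread_rwlock_wrlock(&t->lock)) != 0)
		return (ret);

	/*
	 * Re-check under the lock: another thread may have done the
	 * switch while we were waiting.
	 */
	if (!t->switch_needed)
		goto err;

	/* ... allocate and install the new chunk here ... */
	t->switch_needed = 0;

err:	(void)pthread_rwlock_unlock(&t->lock);
	return (ret);
}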
Example #3
/*
 * __wt_lsm_compact --
 *	Compact an LSM tree; called via __wt_schema_worker.
 */
int
__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	time_t begin, end;
	uint64_t progress;
	int i, compacting, flushing, locked, ref;

	compacting = flushing = locked = ref = 0;
	chunk = NULL;
	/*
	 * This function is applied to all matching sources: ignore anything
	 * that is not an LSM tree.
	 */
	if (!WT_PREFIX_MATCH(name, "lsm:"))
		return (0);

	/* Tell __wt_schema_worker not to look inside the LSM tree. */
	*skip = 1;

	WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));

	if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
		WT_ERR_MSG(session, EINVAL,
		    "LSM compaction requires active merge threads");

	WT_ERR(__wt_seconds(session, &begin));

	/*
	 * Compacting has two distinct phases.
	 * 1.  All in-memory chunks up to and including the current
	 * chunk must be flushed.  Normally, the flush code
	 * does not flush the last, in-use chunk, so we set a force
	 * flag to include that last chunk.  We monitor the state of the
	 * last chunk and periodically push another forced flush work
	 * unit until it is complete.
	 * 2.  After all flushing is done, we move onto the merging
	 * phase for compaction.  Again, we monitor the state and
	 * continue to push merge work units until all merging is done.
	 */

	/* Lock the tree: single-thread compaction. */
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = 1;

	/* Clear any merge throttle: compact throws out that calculation. */
	lsm_tree->merge_throttle = 0;
	lsm_tree->merge_aggressiveness = 0;
	progress = lsm_tree->merge_progressing;

	/* If another thread started a compact on this tree, we're done. */
	if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
		goto err;

	/*
	 * Set the switch transaction on the current chunk, if it
	 * hasn't been set before.  This prevents further writes, so it
	 * can be flushed by the checkpoint worker.
	 */
	if (lsm_tree->nchunks > 0 &&
	    (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL) {
		if (chunk->switch_txn == WT_TXN_NONE)
			chunk->switch_txn = __wt_txn_new_id(session);
		/*
		 * If we have a chunk, we will be waiting for it to reach
		 * disk, so add a reference to keep it available.
		 */
		(void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
		ref = 1;
	}

	locked = 0;
	WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));

	if (chunk != NULL) {
		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
		    "Compact force flush %s flags 0x%" PRIx32
		    " chunk %u flags 0x%"
		    PRIx32, name, lsm_tree->flags, chunk->id, chunk->flags));
		flushing = 1;
		/*
		 * Make sure the in-memory chunk gets flushed, but do not
		 * push a switch: we don't want to create a new in-memory
		 * chunk if the tree is being used read-only now.
		 */
		WT_ERR(__wt_lsm_manager_push_entry(session,
		    WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
	} else {
		/*
		 * If there is no chunk to flush, go straight to the
		 * compacting state.
		 */
		compacting = 1;
		progress = lsm_tree->merge_progressing;
		F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
		    "COMPACT: Start compacting %s", lsm_tree->name));
	}

	/* Wait for the work unit queues to drain. */
	while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
		/*
		 * The flush flag is cleared when the chunk has been flushed.
		 * Continue to push forced flushes until the chunk is on disk.
		 * Once it is on disk move to the compacting phase.
		 */
		if (flushing) {
			WT_ASSERT(session, chunk != NULL);
			if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
				WT_ERR(__wt_verbose(session,
				    WT_VERB_LSM,
				    "Compact flush done %s chunk %u.  "
				    "Start compacting progress %" PRIu64,
				    name, chunk->id,
				    lsm_tree->merge_progressing));
				(void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
				flushing = ref = 0;
				compacting = 1;
				F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
				progress = lsm_tree->merge_progressing;
			} else {
				WT_ERR(__wt_verbose(session, WT_VERB_LSM,
				    "Compact flush retry %s chunk %u",
				    name, chunk->id));
				WT_ERR(__wt_lsm_manager_push_entry(session,
				    WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE,
				    lsm_tree));
			}
		}

		/*
		 * The compacting flag is cleared when no merges can be done.
		 * Ensure that we push through some aggressive merges before
		 * stopping; otherwise, we might not do merges that would
		 * span chunks with different generations.
		 */
		if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) {
			if (lsm_tree->merge_aggressiveness < 10 ||
			    (progress < lsm_tree->merge_progressing) ||
			    lsm_tree->merge_syncing) {
				progress = lsm_tree->merge_progressing;
				F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
				lsm_tree->merge_aggressiveness = 10;
			} else
				break;
		}
		__wt_sleep(1, 0);
		WT_ERR(__wt_seconds(session, &end));
		if (session->compact->max_time > 0 &&
		    session->compact->max_time < (uint64_t)(end - begin)) {
			WT_ERR(ETIMEDOUT);
		}
		/*
		 * Push merge operations while they are still getting work
		 * done. If we are pushing merges, make sure they are
		 * aggressive, to avoid duplicating effort.
		 */
		if (compacting)
#define	COMPACT_PARALLEL_MERGES	5
			for (i = lsm_tree->queue_ref;
			    i < COMPACT_PARALLEL_MERGES; i++) {
				lsm_tree->merge_aggressiveness = 10;
				WT_ERR(__wt_lsm_manager_push_entry(
				    session, WT_LSM_WORK_MERGE, 0, lsm_tree));
			}
	}
err:
	/* Ensure anything we set is cleared. */
	if (ref)
		(void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
	if (compacting) {
		F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
		lsm_tree->merge_aggressiveness = 0;
	}
	if (locked)
		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));

	WT_TRET(__wt_verbose(session, WT_VERB_LSM,
	    "Compact %s complete, return %d", name, ret));

	__wt_lsm_tree_release(session, lsm_tree);
	return (ret);
}
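
The wait loop above reduces to a generic pattern: keep pushing work units, sleep, poll for completion, and give up with ETIMEDOUT once session->compact->max_time elapses. A standalone sketch of that control flow, where the work_push() and work_done() helpers are hypothetical stand-ins for the LSM manager calls:

#include <errno.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>

/* Hypothetical work-queue hooks standing in for the LSM manager calls. */
extern int work_push(void);
extern int work_done(void);

/*
 * Push work units and poll until the job completes or the deadline
 * (max_time seconds, 0 meaning no limit) expires.
 */
static int
wait_for_work(uint64_t max_time)
{
	time_t begin, end;
	int ret;

	(void)time(&begin);
	while (!work_done()) {
		if ((ret = work_push()) != 0)
			return (ret);
		sleep(1);
		(void)time(&end);
		if (max_time > 0 && (uint64_t)(end - begin) > max_time)
			return (ETIMEDOUT);
	}
	return (0);
}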