Example #1
/*
 * __split_stash_add --
 *	Add a new entry into the session's split stash list.
 */
static int
__split_stash_add(WT_SESSION_IMPL *session, void *p, size_t len)
{
	WT_SPLIT_STASH *stash;

	WT_ASSERT(session, p != NULL);

	/* Grow the list as necessary. */
	WT_RET(__wt_realloc_def(session, &session->split_stash_alloc,
	    session->split_stash_cnt + 1, &session->split_stash));

	stash = session->split_stash + session->split_stash_cnt++;
	stash->split_gen = WT_ATOMIC_ADD(S2C(session)->split_gen, 1);
	stash->p = p;
	stash->len = len;

	WT_STAT_FAST_CONN_ATOMIC_INCRV(session, rec_split_stashed_bytes, len);
	WT_STAT_FAST_CONN_ATOMIC_INCR(session, rec_split_stashed_objects);

	/* See if we can free any previous entries. */
	if (session->split_stash_cnt > 1)
		__wt_split_stash_discard(session);

	return (0);
}
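
Every snippet in this listing is an example of WT_ATOMIC_ADD, which atomically increments a shared counter and returns the new value; here it hands out a unique split generation for each stashed entry. The following is a minimal sketch of that fetch-and-add idiom, not the WiredTiger macro itself, assuming the GCC-style __sync_add_and_fetch builtin and a hypothetical example_split_gen counter.

/*
 * Sketch only: an atomic fetch-and-add counter of the kind WT_ATOMIC_ADD
 * is used for above.  Assumes the GCC-style __sync_add_and_fetch builtin;
 * example_split_gen is a hypothetical shared counter, not WiredTiger code.
 */
#include <stdint.h>

static uint64_t example_split_gen;

static uint64_t
example_next_split_gen(void)
{
	/* Atomically increment and return the new generation number. */
	return (__sync_add_and_fetch(&example_split_gen, 1));
}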
Example #2
/*
 * __wt_lsm_tree_switch --
 *	Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	uint32_t new_id;

	new_id = WT_ATOMIC_ADD(lsm_tree->last, 1); 
	WT_VERBOSE_RET(session, lsm, "Tree switch to: %" PRIu32, new_id);

	if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) >
	    lsm_tree->chunk_alloc)
		WT_ERR(__wt_realloc(session,
		    &lsm_tree->chunk_alloc,
		    WT_MAX(10 * sizeof(*lsm_tree->chunk),
		    2 * lsm_tree->chunk_alloc),
		    &lsm_tree->chunk));

	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = new_id;
	lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	++lsm_tree->dsk_gen;
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

err:	/* TODO: mark lsm_tree bad on error(?) */
	return (ret);
}
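
The growth logic above keeps room for at least ten chunks and otherwise doubles the byte allocation, which is what the WT_MAX(10 * sizeof(*lsm_tree->chunk), 2 * lsm_tree->chunk_alloc) argument to __wt_realloc expresses. Below is a minimal standalone sketch of the same policy, with hypothetical names and plain realloc() standing in for __wt_realloc().

/*
 * Sketch only (hypothetical names): grow an array allocation by doubling,
 * with a floor of ten entries, and clear the newly added bytes so fresh
 * slots start out zeroed.
 */
#include <stdlib.h>
#include <string.h>

static int
example_grow_array(void **arrayp, size_t *alloc_bytesp, size_t entry_size)
{
	size_t new_bytes;
	void *p;

	new_bytes = 2 * *alloc_bytesp;
	if (new_bytes < 10 * entry_size)
		new_bytes = 10 * entry_size;
	if ((p = realloc(*arrayp, new_bytes)) == NULL)
		return (-1);
	memset((char *)p + *alloc_bytesp, 0, new_bytes - *alloc_bytesp);
	*arrayp = p;
	*alloc_bytesp = new_bytes;
	return (0);
}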
Example #3
/*
 * __wt_lsm_tree_switch --
 *	Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk, **cp;
	uint32_t in_memory, new_id;

	new_id = WT_ATOMIC_ADD(lsm_tree->last, 1); 

	if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) >
	    lsm_tree->chunk_alloc)
		WT_ERR(__wt_realloc(session,
		    &lsm_tree->chunk_alloc,
		    WT_MAX(10 * sizeof(*lsm_tree->chunk),
		    2 * lsm_tree->chunk_alloc),
		    &lsm_tree->chunk));

	/*
	 * In the steady state, we expect that the checkpoint worker thread
	 * will keep up with inserts.  If not, we throttle the insert rate to
	 * avoid filling the cache with in-memory chunks.  Threads sleep every
	 * 100 operations, so take that into account in the calculation.
	 */
	for (in_memory = 1, cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
	    in_memory < lsm_tree->nchunks && !F_ISSET(*cp, WT_LSM_CHUNK_ONDISK);
	    ++in_memory, --cp)
		;
	if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 2)
		lsm_tree->throttle_sleep = 0;
	else if (in_memory == lsm_tree->nchunks ||
	    F_ISSET(*cp, WT_LSM_CHUNK_STABLE)) {
		/*
		 * No checkpoint has completed this run.  Keep slowing down
		 * inserts until one does.
		 */
		lsm_tree->throttle_sleep =
		    WT_MAX(20, 2 * lsm_tree->throttle_sleep);
	} else {
		chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
		lsm_tree->throttle_sleep = (long)((in_memory - 2) *
		    WT_TIMEDIFF(chunk->create_ts, (*cp)->create_ts) /
		    (20 * in_memory * chunk->count));
	}

	WT_VERBOSE_ERR(session, lsm, "Tree switch to: %d, throttle %d",
	    new_id, (int)lsm_tree->throttle_sleep);

	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = new_id;
	lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	++lsm_tree->dsk_gen;
	F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

err:	/* TODO: mark lsm_tree bad on error(?) */
	return (ret);
}
Example #4
/*
 * __wt_lsm_tree_truncate --
 *	Truncate an LSM tree.
 */
int
__wt_lsm_tree_truncate(
    WT_SESSION_IMPL *session, const char *name, const char *cfg[])
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	int locked;

	WT_UNUSED(cfg);
	chunk = NULL;
	locked = 0;

	/* Get the LSM tree. */
	WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));

	/* Shut down the LSM worker. */
	WT_RET(__lsm_tree_close(session, lsm_tree));

	/* Prevent any new opens. */
	WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 1));
	locked = 1;

	/* Create the new chunk. */
	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = WT_ATOMIC_ADD(lsm_tree->last, 1);
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	/* Mark all chunks old. */
	WT_ERR(__wt_lsm_merge_update_tree(
	    session, lsm_tree, 0, lsm_tree->nchunks, chunk));

	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

	WT_ERR(__lsm_tree_start_worker(session, lsm_tree));
	locked = 0;
	WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
	__wt_lsm_tree_release(session, lsm_tree);

err:	if (locked) 
		WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
	if (ret != 0) {
		if (chunk != NULL) {
			(void)__wt_schema_drop(session, chunk->uri, NULL);
			__wt_free(session, chunk);
		}
		/*
		 * Discard the LSM tree structure on error. This will force the
		 * LSM tree to be re-opened the next time it is accessed and
		 * the last good version of the metadata will be used, resulting
		 * in a valid (not truncated) tree.
		 */
		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	return (ret);
}
Example #5
/*
 * __wt_cond_wait --
 *	Wait on a mutex, optionally timing out.
 */
int
__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, long usecs)
{
	struct timespec ts;
	WT_DECL_RET;
	int locked;

	locked = 0;
	WT_ASSERT(session, usecs >= 0);

	/* Fast path if already signalled. */
	if (WT_ATOMIC_ADD(cond->waiters, 1) == 0)
		return (0);

	/*
	 * !!!
	 * This function MUST handle a NULL session handle.
	 */
	if (session != NULL) {
		WT_VERBOSE_RET(
		    session, mutex, "wait %s cond (%p)", cond->name, cond);
		WT_STAT_FAST_CONN_INCR(session, cond_wait);
	}

	WT_ERR(pthread_mutex_lock(&cond->mtx));
	locked = 1;

	if (usecs > 0) {
		WT_ERR(__wt_epoch(session, &ts));
		ts.tv_sec += (ts.tv_nsec + 1000 * usecs) / WT_BILLION;
		ts.tv_nsec = (ts.tv_nsec + 1000 * usecs) % WT_BILLION;
		ret = pthread_cond_timedwait(&cond->cond, &cond->mtx, &ts);
	} else
		ret = pthread_cond_wait(&cond->cond, &cond->mtx);

	/*
	 * Check pthread_cond_wait() return for EINTR, ETIME and
	 * ETIMEDOUT, some systems return these errors.
	 */
	if (ret == EINTR ||
#ifdef ETIME
	    ret == ETIME ||
#endif
	    ret == ETIMEDOUT)
		ret = 0;

	(void)WT_ATOMIC_SUB(cond->waiters, 1);

err:	if (locked)
		WT_TRET(pthread_mutex_unlock(&cond->mtx));
	if (ret == 0)
		return (0);
	WT_RET_MSG(session, ret, "pthread_cond_wait");
}
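
The timeout path above converts a relative wait in microseconds into the absolute deadline pthread_cond_timedwait() expects, carrying nanosecond overflow into the seconds field. Here is a minimal sketch of that conversion as a hypothetical standalone helper, with clock_gettime() standing in for __wt_epoch().

/*
 * Sketch only (hypothetical helper): build the absolute deadline for
 * pthread_cond_timedwait() from a relative timeout in microseconds,
 * using the same second/nanosecond split as __wt_cond_wait above.
 */
#include <time.h>

#define	EXAMPLE_BILLION	1000000000

static int
example_abs_deadline(struct timespec *ts, long usecs)
{
	if (clock_gettime(CLOCK_REALTIME, ts) != 0)
		return (-1);
	ts->tv_sec += (ts->tv_nsec + 1000 * usecs) / EXAMPLE_BILLION;
	ts->tv_nsec = (ts->tv_nsec + 1000 * usecs) % EXAMPLE_BILLION;
	return (0);
}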
Example #6
/*
 * __wt_lsm_tree_truncate --
 *	Truncate an LSM tree.
 */
int
__wt_lsm_tree_truncate(
    WT_SESSION_IMPL *session, const char *name, const char *cfg[])
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	int locked;

	WT_UNUSED(cfg);
	locked = 0;

	/* Get the LSM tree. */
	WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));

	/* Shut down the LSM worker. */
	WT_RET(__lsm_tree_close(session, lsm_tree));

	/* Prevent any new opens. */
	WT_RET(__wt_try_writelock(session, lsm_tree->rwlock));
	locked = 1;

	/* Create the new chunk. */
	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = WT_ATOMIC_ADD(lsm_tree->last, 1);
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	/* Mark all chunks old. */
	WT_ERR(__wt_lsm_merge_update_tree(
	    session, lsm_tree, 0, lsm_tree->nchunks, chunk));

	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

	WT_ERR(__lsm_tree_start_worker(session, lsm_tree));
	ret = __wt_rwunlock(session, lsm_tree->rwlock);
	locked = 0;
	if (ret == 0)
		__wt_lsm_tree_release(session, lsm_tree);

err:	if (locked) 
		WT_TRET(__wt_rwunlock(session, lsm_tree->rwlock));
	/*
	 * Don't discard the LSM tree structure unless there has been an
	 * error. The handle remains valid for future operations.
	 */
	if (ret != 0)
		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	return (ret);
}
Example #7
/*
 * __wt_lsm_tree_switch --
 *	Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	uint32_t nchunks, new_id;

	WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 1));

	/*
	 * Check if a switch is still needed: we may have raced while waiting
	 * for a lock.
	 */
	if ((nchunks = lsm_tree->nchunks) != 0 &&
	    (chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
	    !F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK) &&
	    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
		goto err;

	/* Update the throttle time. */
	__wt_lsm_tree_throttle(session, lsm_tree);

	new_id = WT_ATOMIC_ADD(lsm_tree->last, 1);

	WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
	    nchunks + 1, &lsm_tree->chunk));

	WT_VERBOSE_ERR(session, lsm,
	    "Tree switch to: %" PRIu32 ", throttle %ld",
	    new_id, lsm_tree->throttle_sleep);

	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = new_id;
	chunk->txnid_max = WT_TXN_NONE;
	lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
	F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
	++lsm_tree->dsk_gen;

	lsm_tree->modified = 1;

err:	/* TODO: mark lsm_tree bad on error(?) */
	WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
	return (ret);
}
Example #8
/*
 * __lsm_copy_chunks --
 *	 Take a copy of part of the LSM tree chunk array so that we can work on
 *	 the contents without holding the LSM tree handle lock long term.
 */
static int
__lsm_copy_chunks(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_WORKER_COOKIE *cookie, int old_chunks)
{
	WT_DECL_RET;
	u_int i, nchunks;
	size_t alloc;

	/* Always return zero chunks on error. */
	cookie->nchunks = 0;

	WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0));
	if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING))
		return (__wt_lsm_tree_unlock(session, lsm_tree));

	/* Take a copy of the current state of the LSM tree. */
	nchunks = old_chunks ? lsm_tree->nold_chunks : lsm_tree->nchunks;
	alloc = old_chunks ? lsm_tree->old_alloc : lsm_tree->chunk_alloc;

	/*
	 * If the tree array of active chunks is larger than our current buffer,
	 * increase the size of our current buffer to match.
	 */
	if (cookie->chunk_alloc < alloc)
		WT_ERR(__wt_realloc(session,
		    &cookie->chunk_alloc, alloc, &cookie->chunk_array));
	if (nchunks > 0)
		memcpy(cookie->chunk_array,
		    old_chunks ? lsm_tree->old_chunks : lsm_tree->chunk,
		    nchunks * sizeof(*cookie->chunk_array));

	/*
	 * Mark each chunk as active, so we don't drop it until after we know
	 * it's safe.
	 */
	for (i = 0; i < nchunks; i++)
		(void)WT_ATOMIC_ADD(cookie->chunk_array[i]->refcnt, 1);

err:	WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

	if (ret == 0)
		cookie->nchunks = nchunks;
	return (ret);
}
Example #9
/*
 * __wt_lsm_tree_get --
 *	get an LSM tree structure for the given name.
 */
int
__wt_lsm_tree_get(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, WT_LSM_TREE **treep)
{
	WT_LSM_TREE *lsm_tree;

	/* See if the tree is already open. */
	TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
		if (strcmp(uri, lsm_tree->name) == 0) {
			if (exclusive && lsm_tree->refcnt)
				return (EBUSY);

			(void)WT_ATOMIC_ADD(lsm_tree->refcnt, 1);
			*treep = lsm_tree;
			return (0);
		}

	/* Open a new tree. */
	return (__lsm_tree_open(session, uri, treep));
}
Example #10
/*
 * __wt_lsm_tree_get --
 *	get an LSM tree structure for the given name.
 */
int
__wt_lsm_tree_get(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, WT_LSM_TREE **treep)
{
	WT_LSM_TREE *lsm_tree;

	TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
		if (strcmp(uri, lsm_tree->name) == 0) {
			if (exclusive && lsm_tree->refcnt)
				return (EBUSY);

			(void)WT_ATOMIC_ADD(lsm_tree->refcnt, 1);
			*treep = lsm_tree;
			return (0);
		}

	/*
	 * If we don't already hold the schema lock, get it now so that we
	 * can find and/or open the handle.
	 */
	return (__lsm_tree_open(session, uri, treep));
}
Example #11
/*
 * __wt_page_alloc --
 *	Create or read a page into the cache.
 */
int
__wt_page_alloc(WT_SESSION_IMPL *session,
    uint8_t type, uint32_t alloc_entries, WT_PAGE **pagep)
{
	WT_CACHE *cache;
	WT_PAGE *page;
	size_t size;
	void *p;

	*pagep = NULL;

	cache = S2C(session)->cache;

	/*
	 * Allocate a page, and for most page types, the additional information
	 * it needs to describe the disk image.
	 */
	size = sizeof(WT_PAGE);
	switch (type) {
	case WT_PAGE_COL_FIX:
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		size += alloc_entries * sizeof(WT_REF);
		break;
	case WT_PAGE_COL_VAR:
		size += alloc_entries * sizeof(WT_COL);
		break;
	case WT_PAGE_ROW_LEAF:
		size += alloc_entries * sizeof(WT_ROW);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	WT_RET(__wt_calloc(session, 1, size, &page));
	p = (uint8_t *)page + sizeof(WT_PAGE);

	switch (type) {
	case WT_PAGE_COL_FIX:
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		page->u.intl.t = p;
		break;
	case WT_PAGE_COL_VAR:
		page->u.col_var.d = p;
		break;
	case WT_PAGE_ROW_LEAF:
		page->u.row.d = p;
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Increment the cache statistics. */
	__wt_cache_page_inmem_incr(session, page, size);
	(void)WT_ATOMIC_ADD(cache->pages_inmem, 1);

	/* The one page field we set is the type. */
	page->type = type;

	*pagep = page;
	return (0);
}
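
__wt_page_alloc sizes a single allocation to hold both the WT_PAGE header and the per-entry array the page type needs, then points the matching union member just past the header, so one allocation (and one free) covers the whole page. A minimal sketch of that single-allocation layout, using hypothetical types rather than WiredTiger's:

/*
 * Sketch only (hypothetical types): carve a header and its trailing
 * per-entry array out of one calloc() so they share a single lifetime.
 */
#include <stdint.h>
#include <stdlib.h>

struct example_page {
	uint32_t entries;
	void *array;		/* points into this same allocation */
};

static struct example_page *
example_page_alloc(uint32_t entries, size_t entry_size)
{
	struct example_page *page;

	if ((page = calloc(1, sizeof(struct example_page) +
	    (size_t)entries * entry_size)) == NULL)
		return (NULL);
	page->entries = entries;
	page->array = (uint8_t *)page + sizeof(struct example_page);
	return (page);
}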
Example #12
File: txn.c  Project: niumowm/wiredtiger
/*
 * __wt_txn_begin --
 *	Begin a transaction.
 */
int
__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *s, *txn_state;
	wt_txnid_t id, oldest_snap_min;
	uint32_t i, n, session_cnt;

	conn = S2C(session);
	txn = &session->txn;
	txn_global = &conn->txn_global;
	txn_state = &txn_global->states[session->id];

	WT_ASSERT(session, txn_state->id == WT_TXN_NONE);

	WT_RET(__wt_config_gets_defno(session, cfg, "isolation", &cval));
	if (cval.len == 0)
		txn->isolation = session->isolation;
	else
		txn->isolation =
		    WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
		    TXN_ISO_SNAPSHOT :
		    WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
		    TXN_ISO_READ_COMMITTED : TXN_ISO_READ_UNCOMMITTED;

	F_SET(txn, TXN_RUNNING);

	do {
		/*
		 * Allocate a transaction ID.
		 *
		 * We use an atomic increment to ensure that we get a unique
		 * ID, then publish that to the global state table.
		 *
		 * If two threads race to allocate an ID, only the latest ID
		 * will proceed.  The winning thread can be sure its snapshot
		 * contains all of the earlier active IDs.  Threads that race
		 * and get an earlier ID may not appear in the snapshot,
		 * but they will loop and allocate a new ID before proceeding
		 * to make any updates.
		 *
		 * This potentially wastes transaction IDs when threads race to
		 * begin transactions, but that is the price we pay to keep
		 * this path latch free.
		 */
		do {
			txn->id = WT_ATOMIC_ADD(txn_global->current, 1);
		} while (txn->id == WT_TXN_NONE || txn->id == WT_TXN_ABORTED);
		WT_PUBLISH(txn_state->id, txn->id);

		/*
		 * If we are starting a snapshot isolation transaction, get
		 * a snapshot of the running transactions.
		 *
		 * If we already have a snapshot (e.g., for an auto-commit
		 * operation), update it so that the newly-allocated ID is
		 * visible.
		 */
		if (txn->isolation == TXN_ISO_SNAPSHOT) {
			txn->last_gen = txn->last_oldest_gen = txn_global->gen;
			oldest_snap_min = txn->id;

			/* Copy the array of concurrent transactions. */
			WT_ORDERED_READ(session_cnt, conn->session_cnt);
			for (i = n = 0, s = txn_global->states;
			    i < session_cnt;
			    i++, s++) {
				if ((id = s->snap_min) != WT_TXN_NONE)
					if (TXNID_LT(id, oldest_snap_min))
						oldest_snap_min = id;
				if ((id = s->id) == WT_TXN_NONE)
					continue;
				else
					txn->snapshot[n++] = id;
			}

			__txn_sort_snapshot(
			    session, n, txn->id, oldest_snap_min);
			txn_state->snap_min = txn->snap_min;
		}

		/*
		 * Ensure the snapshot reads are complete before re-checking
		 * the global current ID.
		 */
		WT_READ_BARRIER();
	} while (txn->id != txn_global->current);

	return (0);
}
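
The inner loop above is the lock-free core of transaction-ID allocation: atomically take the next value of the global counter and retry if it lands on a reserved value. Stripped to its essentials, with hypothetical names and the GCC-style __sync_add_and_fetch builtin standing in for WT_ATOMIC_ADD:

/*
 * Sketch only (hypothetical names and reserved values): allocate a unique
 * ID by atomic increment, skipping "none" and "aborted", as in the inner
 * loop of __wt_txn_begin above.
 */
#include <stdint.h>

#define	EXAMPLE_TXN_NONE	0
#define	EXAMPLE_TXN_ABORTED	UINT64_MAX

static uint64_t example_txn_current;

static uint64_t
example_txn_id_alloc(void)
{
	uint64_t id;

	do {
		id = __sync_add_and_fetch(&example_txn_current, 1);
	} while (id == EXAMPLE_TXN_NONE || id == EXAMPLE_TXN_ABORTED);
	return (id);
}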