Exemplo n.º 1
0
Arquivo: mutex.c Projeto: 3rf/mongo
/*
 * __spin_lock_next_id --
 *	Hand out a spinlock caller ID the first time a location asks for one.
 *
 *	On entry *idp holds either a previously assigned ID or the
 *	WT_SPINLOCK_REGISTER sentinel.  Returns 0 when the location has an ID
 *	(old or newly assigned) and ENOMEM once WT_SPINLOCK_MAX_LOCATION_ID
 *	IDs have been handed out.
 */
static int
__spin_lock_next_id(WT_SESSION_IMPL *session, int *idp)
{
	static int id_lock = 0, id_counter = 0;
	WT_DECL_RET;

	/* Fast path: the location was registered by an earlier call. */
	if (*idp != WT_SPINLOCK_REGISTER)
		return (0);

	/*
	 * The global spinlock can't protect ID allocation, so access to the
	 * function-local counter is serialized by spinning on a CAS of a
	 * function-local flag.  This only happens once per library
	 * instantiation, performance isn't a concern.
	 */
	while (!WT_ATOMIC_CAS(id_lock, 0, 1))
		__wt_yield();

	/* Re-check under the flag: nothing to do if an ID is now in place. */
	if (*idp != WT_SPINLOCK_REGISTER)
		goto err;

	/* No IDs left: report it, the flag is dropped below. */
	if (id_counter >= WT_SPINLOCK_MAX_LOCATION_ID)
		WT_ERR_MSG(session, ENOMEM,
		    "spinlock caller location registry failed, "
		    "increase the connection's blocking matrix size");

	/* Allocate a blocking ID for this location. */
	*idp = id_counter++;

err:	WT_PUBLISH(id_lock, 0);
	return (ret);
}
Exemplo n.º 2
0
/*
 * __wt_schema_get_table_uri --
 *	Look up the table handle for a URI.
 *
 *	On success *tablep is set to the table; on any failure it is left
 *	NULL.  Unless ok_incomplete is set, a table whose column groups are
 *	not all created is released again and EINVAL is returned.  The
 *	session's current data handle is restored before returning, whatever
 *	the outcome.
 */
int
__wt_schema_get_table_uri(WT_SESSION_IMPL *session,
    const char *uri, bool ok_incomplete, uint32_t flags, WT_TABLE **tablep)
{
	WT_DATA_HANDLE *prev_dhandle;
	WT_DECL_RET;
	WT_TABLE *table;

	/* Remember the caller's handle, the lookup replaces it. */
	prev_dhandle = session->dhandle;
	*tablep = NULL;

	WT_ERR(__wt_session_get_dhandle(session, uri, NULL, NULL, flags));
	table = (WT_TABLE *)session->dhandle;

	/* Usable table: hand it back and go restore the saved handle. */
	if (ok_incomplete || table->cg_complete) {
		*tablep = table;
		goto err;
	}

	/* Incomplete table the caller can't use: give it up and complain. */
	WT_ERR(__wt_session_release_dhandle(session));
	ret = __wt_set_return(session, EINVAL);
	WT_ERR_MSG(session, ret, "'%s' cannot be used "
	    "until all column groups are created",
	    table->iface.name);

err:	session->dhandle = prev_dhandle;
	return (ret);
}
Exemplo n.º 3
0
/*
 * __curfile_equals --
 *	WT_CURSOR->equals method for the btree cursor type.
 *
 *	Fails with EINVAL unless the internal URIs of both cursors start with
 *	"file:".  Otherwise the comparison is delegated to __wt_btcur_equals,
 *	which is handed equalp, and its return value is passed back through
 *	the API exit macro.
 */
static int
__curfile_equals(WT_CURSOR *a, WT_CURSOR *b, int *equalp)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)a;
	/* API entry is done on behalf of the first cursor and its btree. */
	CURSOR_API_CALL(a, session, equals, cbt->btree);

	/*
	 * Check both cursors are a "file:" type then call the underlying
	 * function, it can handle cursors pointing to different objects.
	 *
	 * NOTE(review): the message talks about "the same object" although
	 * only the URI prefix is tested here -- confirm the wording is
	 * intended.
	 */
	if (!WT_PREFIX_MATCH(a->internal_uri, "file:") ||
	    !WT_PREFIX_MATCH(b->internal_uri, "file:"))
		WT_ERR_MSG(session, EINVAL,
		    "Cursors must reference the same object");

	/*
	 * Key checks for both cursors; presumably these leave through the
	 * err label when a cursor has no key -- confirm against the macro.
	 */
	WT_CURSOR_CHECKKEY(a);
	WT_CURSOR_CHECKKEY(b);

	ret = __wt_btcur_equals(
	    (WT_CURSOR_BTREE *)a, (WT_CURSOR_BTREE *)b, equalp);

err:	API_END_RET(session, ret);
}
Exemplo n.º 4
0
/*
 * __wt_dlopen --
 *	Open a dynamic library.
 *
 *	A NULL path asks for a handle on the running binary.  Loading a named
 *	library is not implemented here: that case breaks into the debugger
 *	and then fails with ENOTSUP.
 *
 *	Returns 0 and stores the new handle in *dlhp on success.  On failure a
 *	non-zero error is returned, the partially built handle is freed and
 *	*dlhp is not written.
 */
int
__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
{
    WT_DECL_RET;
    WT_DLH *dlh;

    WT_RET(__wt_calloc_one(session, &dlh));
    WT_ERR(__wt_strdup(session, path, &dlh->name));

    if (path == NULL) {
        /*
         * NULL means load from the current binary.
         *
         * Windows reports failure as FALSE (0) while this function reports
         * success as 0, so the Win32 result is tested directly rather than
         * being stored in ret and inverted afterwards.
         *
         * NOTE(review): Win32 calls report their error through
         * GetLastError(); confirm __wt_errno() is the right source here.
         */
        if (GetModuleHandleExA(0, NULL, (HMODULE *)&dlh->handle) == FALSE)
            WT_ERR_MSG(session, __wt_errno(),
                       "GetModuleHandleEx: unable to get a handle for the "
                       "current binary");
    } else {
        // TODO: load dll here
        DebugBreak();

        /*
         * No handle was loaded: fail explicitly so the caller never gets
         * a handle structure without a module behind it.
         */
        WT_ERR_MSG(session, ENOTSUP,
                   "dlopen(%s): loading a named library is not supported",
                   path);
    }

    *dlhp = dlh;
    if (0) {
err:
        __wt_free(session, dlh->name);
        __wt_free(session, dlh);
    }
    return (ret);
}
Exemplo n.º 5
0
/*
 * __wt_cache_create --
 *	Create the connection's eviction cache.
 *
 *	Allocates conn->cache, applies the cache configuration, then sets up
 *	the eviction condition variables, spinlocks and LRU queue.  Returns 0
 *	on success; failures after the configuration step go through
 *	__wt_cache_destroy before the error is returned.
 */
int
__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	conn = S2C(session);

	WT_RET(__wt_calloc_one(session, &conn->cache));
	cache = conn->cache;

	/* Configure the cache. */
	WT_RET(__wt_cache_config(session, 0, cfg));

	/* The trigger has to sit strictly above the target. */
	if (cache->eviction_trigger <= cache->eviction_target)
		WT_ERR_MSG(session, EINVAL, "eviction target must be lower than the eviction trigger");

	/* Create the eviction condition variables and locks. */
	WT_ERR(__wt_cond_alloc(
	    session, "cache eviction server", 0, &cache->evict_cond));
	WT_ERR(__wt_cond_alloc(
	    session, "eviction waiters", 0, &cache->evict_waiter_cond));
	WT_ERR(__wt_spin_init(
	    session, &cache->evict_lock, "cache eviction"));
	WT_ERR(__wt_spin_init(
	    session, &cache->evict_walk_lock, "cache walk"));

	/* Allocate the LRU eviction queue. */
	cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
	WT_ERR(__wt_calloc_def(
	    session, cache->evict_slots, &cache->evict));

	/* Initialize the cache statistics. */
	__wt_cache_stats_update(session);
	return (0);

err:	WT_RET(__wt_cache_destroy(session));
	return (ret);
}
Exemplo n.º 6
0
/*
 * __bulk_row_keycmp_err --
 *	Report row-store keys presented to a bulk load out of order.
 *
 *	Formats the cursor's current key and the previously inserted key into
 *	scratch buffers for the message.  Never returns 0: the result is
 *	EINVAL, or the scratch-allocation error if one of the buffers could
 *	not be obtained.
 */
static int
__bulk_row_keycmp_err(WT_CURSOR_BULK *cbulk)
{
	WT_CURSOR *cursor;
	WT_DECL_ITEM(cur_key);
	WT_DECL_ITEM(prev_key);
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cursor = &cbulk->cbt.iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	/* One scratch buffer per key being printed. */
	WT_ERR(__wt_scr_alloc(session, 512, &cur_key));
	WT_ERR(__wt_scr_alloc(session, 512, &prev_key));

	WT_ERR_MSG(session, EINVAL,
	    "bulk-load presented with out-of-order keys: %s compares smaller "
	    "than previously inserted key %s",
	    __wt_buf_set_printable(
	    session, cursor->key.data, cursor->key.size, cur_key),
	    __wt_buf_set_printable(
	    session, cbulk->last.data, cbulk->last.size, prev_key));

	/* Both buffers are released on every path. */
err:	__wt_scr_free(session, &cur_key);
	__wt_scr_free(session, &prev_key);
	return (ret);
}
Exemplo n.º 7
0
/*
 * __session_open_cursor --
 *	WT_SESSION->open_cursor method.
 *
 *	Exactly one of uri and to_dup must be supplied, anything else fails
 *	with EINVAL.  A URI is opened directly.  A cursor is duplicated when
 *	its URI is a colgroup, index, file, lsm or table object, any other
 *	type is reported through __wt_bad_object_type.
 */
static int
__session_open_cursor(WT_SESSION *wt_session,
    const char *uri, WT_CURSOR *to_dup, const char *config, WT_CURSOR **cursorp)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL(session, open_cursor, config, cfg);

	/* Both missing or both present are equally wrong. */
	if ((to_dup == NULL) == (uri == NULL))
		WT_ERR_MSG(session, EINVAL,
		    "should be passed either a URI or a cursor to duplicate, "
		    "but not both");

	if (to_dup == NULL)
		ret = __wt_open_cursor(session, uri, NULL, cfg, cursorp);
	else {
		/* Duplicating: the object type comes from the source cursor. */
		uri = to_dup->uri;
		if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
		    !WT_PREFIX_MATCH(uri, "index:") &&
		    !WT_PREFIX_MATCH(uri, "file:") &&
		    !WT_PREFIX_MATCH(uri, "lsm:") &&
		    !WT_PREFIX_MATCH(uri, "table:"))
			ret = __wt_bad_object_type(session, uri);
		else
			ret = __wt_cursor_dup(session, to_dup, cfg, cursorp);
	}

err:	API_END_NOTFOUND_MAP(session, ret);
}
Exemplo n.º 8
0
/*
 * __curfile_compare --
 *	WT_CURSOR->compare method for the btree cursor type.
 *
 *	Fails with EINVAL when the two cursors have different URIs.  Otherwise
 *	the comparison is delegated to __wt_btcur_compare, which is handed
 *	cmpp, and its return value is returned.
 */
static int
__curfile_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)a;
	/* API entry is done on behalf of the first cursor and its btree. */
	CURSOR_API_CALL(a, session, compare, cbt->btree);

	/*
	 * Confirm both cursors refer to the same source and have keys, then
	 * call the underlying object to compare them.
	 */
	if (strcmp(a->uri, b->uri) != 0)
		WT_ERR_MSG(session, EINVAL,
		    "comparison method cursors must reference the same object");

	/*
	 * Presumably these leave through the err label when a cursor has no
	 * key set -- confirm against the macro definition.
	 */
	WT_CURSOR_NEEDKEY(a);
	WT_CURSOR_NEEDKEY(b);

	ret = __wt_btcur_compare(
	    (WT_CURSOR_BTREE *)a, (WT_CURSOR_BTREE *)b, cmpp);
err:	API_END(session);

	return (ret);
}
Exemplo n.º 9
0
/*
 * __curfile_modify --
 *	WT_CURSOR->modify method for the btree cursor type.
 *
 *	Applies the nentries modifications in entries through
 *	__wt_btcur_modify.  A count of zero or less is rejected with EINVAL
 *	before the btree layer is called.
 */
static int
__curfile_modify(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_UPDATE_API_CALL_BTREE(cursor, session, modify, cbt->btree);
	/* The key check runs first, ahead of the argument validation. */
	WT_ERR(__cursor_checkkey(cursor));

	/* Check for a rational modify vector count. */
	if (nentries <= 0)
		WT_ERR_MSG(session, EINVAL,
		    "Illegal modify vector with %d entries", nentries);

	WT_ERR(__wt_btcur_modify(cbt, entries, nentries));

	/*
	 * Modify maintains a position, key and value. Unlike update, it's not
	 * always an internal value.
	 *
	 * The assertions below check exactly that: the key flags must equal
	 * WT_CURSTD_KEY_INT, while any value flag is accepted.
	 */
	WT_ASSERT(session,
	    F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);
	WT_ASSERT(session, F_MASK(cursor, WT_CURSTD_VALUE_SET) != 0);

err:	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}
Exemplo n.º 10
0
/*
 * __curjoin_get_value --
 *	WT_CURSOR->get_value for join cursors.
 *
 *	The variable arguments are forwarded to the get-value routine of the
 *	iterator's underlying cursor: the index variant when the iterator's
 *	entry has an index, the table variant otherwise.  Fails with EINVAL
 *	if the join cursor is not initialized or its iterator is not ready.
 */
static int
__curjoin_get_value(WT_CURSOR *cursor, ...)
{
	WT_CURSOR_JOIN *cjoin;
	WT_CURSOR_JOIN_ITER *iter;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	va_list ap;

	cjoin = (WT_CURSOR_JOIN *)cursor;
	iter = cjoin->iter;

	/*
	 * The argument list is started before the API entry macro, so the
	 * va_end at the err label always has a matching va_start.
	 */
	va_start(ap, cursor);
	CURSOR_API_CALL(cursor, session, get_value, NULL);

	/* The initialized flag is tested first, then the iterator. */
	if (!F_ISSET(cjoin, WT_CURJOIN_INITIALIZED) ||
	    !__curjoin_entry_iter_ready(iter))
		WT_ERR_MSG(session, EINVAL,
		    "join cursor must be advanced with next()");
	if (iter->entry->index != NULL)
		WT_ERR(__wt_curindex_get_valuev(iter->cursor, ap));
	else
		WT_ERR(__wt_curtable_get_valuev(iter->cursor, ap));

err:	va_end(ap);
	API_END_RET(session, ret);
}
Exemplo n.º 11
0
/*
 * __clsm_compare --
 *	WT_CURSOR->compare implementation for the LSM cursor type.
 *
 *	Fails with EINVAL when the two cursors have different URIs.  Otherwise
 *	the two keys are compared with the LSM tree's collator and the result
 *	is stored in *cmpp; *cmpp is only written when the comparison itself
 *	did not fail.
 */
static int
__clsm_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
	WT_CURSOR_LSM *alsm;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	int cmp;

	/* There's no need to sync with the LSM tree, avoid WT_LSM_ENTER. */
	alsm = (WT_CURSOR_LSM *)a;
	CURSOR_API_CALL(a, session, compare, NULL);

	/*
	 * Confirm both cursors refer to the same source and have keys, then
	 * compare the keys.
	 */
	if (strcmp(a->uri, b->uri) != 0)
		WT_ERR_MSG(session, EINVAL,
		    "comparison method cursors must reference the same object");

	WT_CURSOR_NEEDKEY(a);
	WT_CURSOR_NEEDKEY(b);

	/* The collator comes from the first cursor's tree; cmp is the output. */
	WT_ERR(WT_LEX_CMP(
	    session, alsm->lsm_tree->collator, &a->key, &b->key, cmp));
	*cmpp = cmp;

err:	API_END(session);
	return (ret);
}
Exemplo n.º 12
0
/*
 * __backup_uri --
 *	Backup a list of objects.
 *
 *	Walks the "target" configuration list and appends every named object
 *	to the backup list.  "log:" targets are appended directly, all other
 *	targets go through the schema worker.
 *
 *	*foundp is set to 1 if at least one target was listed, 0 otherwise.
 *	*log_only is set to 1 only when the list consists of a single "log:"
 *	target; any further target of either kind resets it to 0.
 *
 *	Returns 0 on success (including an empty target list), EINVAL for a
 *	target that carries a value, or the error from a failing helper.
 */
static int
__backup_uri(WT_SESSION_IMPL *session,
             WT_CURSOR_BACKUP *cb, const char *cfg[], int *foundp, int *log_only)
{
    WT_CONFIG targetconf;
    WT_CONFIG_ITEM cval, k, v;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    int target_list;
    const char *uri;

    *foundp = 0;
    *log_only = 0;

    /*
     * If we find a non-empty target configuration string, we have a job,
     * otherwise it's not our problem.
     */
    WT_RET(__wt_config_gets(session, cfg, "target", &cval));
    WT_RET(__wt_config_subinit(session, &targetconf, &cval));
    for (cb->list_next = 0, target_list = 0;
            (ret = __wt_config_next(&targetconf, &k, &v)) == 0; ++target_list) {
        /* If it is our first time through, allocate. */
        if (target_list == 0) {
            *foundp = 1;
            WT_ERR(__wt_scr_alloc(session, 512, &tmp));
        }

        /* Copy the key out, it isn't NUL-terminated in place. */
        WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
        uri = tmp->data;
        if (v.len != 0)
            WT_ERR_MSG(session, EINVAL,
                       "%s: invalid backup target: URIs may need quoting",
                       uri);

        /*
         * Handle log targets.  We do not need to go through the
         * schema worker, just call the function to append them.
         * Set log_only only if it is our only URI target.
         */
        if (WT_PREFIX_MATCH(uri, "log:")) {
            *log_only = (target_list == 0);
            WT_ERR(__wt_backup_list_uri_append(
                       session, uri, NULL));
        } else {
            /*
             * A non-log target means this isn't a log-only backup,
             * even if an earlier target in the list was a log.
             */
            *log_only = 0;
            WT_ERR(__wt_schema_worker(session,
                                      uri, NULL, __wt_backup_list_uri_append, cfg, 0));
        }
    }
    /* Running off the end of the list is the normal way out of the loop. */
    WT_ERR_NOTFOUND_OK(ret);

err:
    __wt_scr_free(session, &tmp);
    return (ret);
}
Exemplo n.º 13
0
/*
 * __wt_config_concat --
 *	Given a NULL-terminated list of configuration strings, concatenate them
 *	into a newly allocated buffer.  Nothing special is assumed about any
 *	of the config strings, they are simply combined in order.
 *
 *	This code deals with the case where some of the config strings are
 *	wrapped in brackets but others aren't: the resulting string does not
 *	have brackets.
 *
 *	On success returns 0 and stores the NUL-terminated result in
 *	*config_ret; the memory is not freed here, so it belongs to the
 *	caller.  On failure the buffer is freed, *config_ret is not written
 *	and the error is returned (EINVAL for a key that is neither a string
 *	nor an identifier).
 */
int
__wt_config_concat(
    WT_SESSION_IMPL *session, const char **cfg, const char **config_ret)
{
	WT_CONFIG cparser;
	WT_CONFIG_ITEM k, v;
	WT_ITEM buf;
	int ret;
	const char **cp;

	WT_CLEAR(buf);
	ret = 0;

	for (cp = cfg; *cp != NULL; ++cp) {
		WT_ERR(__wt_config_init(session, &cparser, *cp));
		while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
			/*
			 * Keys are length-delimited slices of the source
			 * string: bound the print by the key length so the
			 * message shows the key, not the rest of the string.
			 */
			if (k.type != ITEM_STRING && k.type != ITEM_ID)
				WT_ERR_MSG(session, EINVAL,
				    "Invalid configuration key found: '%.*s'\n",
				    (int)k.len, k.str);
			/* Include the quotes around string keys/values. */
			if (k.type == ITEM_STRING) {
				--k.str;
				k.len += 2;
			}
			if (v.type == ITEM_STRING) {
				--v.str;
				v.len += 2;
			}
			/* The '=' is only emitted when there is a value. */
			WT_ERR(__wt_buf_catfmt(session, &buf, "%.*s%s%.*s,",
			    (int)k.len, k.str,
			    (v.len > 0) ? "=" : "",
			    (int)v.len, v.str));
		}
		/* WT_NOTFOUND ends a string, anything else is an error. */
		if (ret != WT_NOTFOUND)
			goto err;
	}

	/*
	 * If the caller passes us no valid configuration strings, we end up
	 * here with no allocated memory to return.  Check the final buffer
	 * size: empty configuration strings are possible, and paranoia is
	 * good.
	 */
	if (buf.size == 0)
		WT_RET(__wt_buf_initsize(session, &buf, 1));

	/* Strip the trailing comma and NUL-terminate */
	((char *)buf.data)[buf.size - 1] = '\0';

	*config_ret = buf.data;
	return (0);

err:	__wt_buf_free(session, &buf);
	return (ret);
}
Exemplo n.º 14
0
/*
 * __wt_cache_create --
 *	Allocate and set up the connection's cache.
 *
 *	Expects conn->cache to be NULL on entry (asserted).  Returns 0 on
 *	success; failures after the configuration step go through
 *	__wt_cache_destroy before the error is returned.
 */
int
__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	conn = S2C(session);
	WT_ASSERT(session, conn->cache == NULL);

	WT_RET(__wt_calloc_one(session, &conn->cache));
	cache = conn->cache;

	/* Run-time configuration options are applied by a common routine. */
	WT_RET(__wt_cache_config(session, false, cfg));

	/*
	 * Don't start from the lowest possible page read-generation: that
	 * value has a special meaning, it marks a page for forcible eviction.
	 */
	cache->read_gen = WT_READGEN_START_VALUE;

	/*
	 * No work would ever get done unless the trigger size is above the
	 * target size.
	 */
	if (cache->eviction_trigger <= cache->eviction_target)
		WT_ERR_MSG(session, EINVAL,
		    "eviction target must be lower than the eviction trigger");

	/* Condition variables, then the eviction locks. */
	WT_ERR(__wt_cond_auto_alloc(session,
	    "cache eviction server", false, 10000, WT_MILLION,
	    &cache->evict_cond));
	WT_ERR(__wt_cond_alloc(
	    session, "eviction waiters", false, &cache->evict_waiter_cond));
	WT_ERR(__wt_spin_init(
	    session, &cache->evict_lock, "cache eviction"));
	WT_ERR(__wt_spin_init(
	    session, &cache->evict_walk_lock, "cache walk"));

	/* Allocate the LRU eviction queue. */
	cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
	WT_ERR(__wt_calloc_def(
	    session, cache->evict_slots, &cache->evict_queue));

	/*
	 * Some values live in the cache statistics rather than being kept
	 * twice, configure them.
	 */
	__wt_cache_stats_update(session);
	return (0);

err:	WT_RET(__wt_cache_destroy(session));
	return (ret);
}
Exemplo n.º 15
0
/*
 * __compact_file --
 *	Compact a btree file.
 *
 *	Alternates forced checkpoints of the file with compaction passes, for
 *	at most 100 passes, stopping early when a pass makes no progress.
 *	Returns 0 on success, EINVAL when called inside a running transaction
 *	while files are queued for compaction, or the first error reported by
 *	a checkpoint, the schema worker or the timeout check.
 */
static int __compact_file(WT_SESSION_IMPL* session, const char* uri, const char* cfg[])
{
	WT_DECL_RET;
	WT_DECL_ITEM(t);
	WT_SESSION *wt_session;
	WT_TXN *txn;
	int i;
	struct timespec start_time;

	txn = &session->txn;
	wt_session = &session->iface;

	/*
	 * File compaction requires checkpoints, which will fail in a
	 * transactional context.  Check now so the error message isn't
	 * confusing.
	 */
	if(session->compact->file_count != 0 && F_ISSET(txn, TXN_RUNNING))
		WT_ERR_MSG(session, EINVAL, " File compaction not permitted in a transaction");

	/*
	 * Force the checkpoint: we don't want to skip it because the work we
	 * need to have done is done in the underlying block manager.
	 */
	WT_ERR(__wt_scr_alloc(session, 128, &t));
	WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri));

	/* The start time is what the timeout check is measured against. */
	WT_ERR(__wt_epoch(session, &start_time));

	/*
	 * We compact 10% of the file on each pass (but the overall size of the
	 * file is decreasing each time, so we're not compacting 10% of the
	 * original file each time). Try 100 times (which is clearly more than
	 * we need); quit if we make no progress and check for a timeout each
	 * time through the loop.
	 */
	for (i = 0; i < 100; ++i) {
		WT_ERR(wt_session->checkpoint(wt_session, t->data));

		/* The flag is cleared here and tested after the pass. */
		session->compaction = 0;
		WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(session, uri, __wt_compact, NULL, cfg, 0));
		WT_ERR(ret);
		if (!session->compaction)
			break;

		WT_ERR(wt_session->checkpoint(wt_session, t->data));
		WT_ERR(wt_session->checkpoint(wt_session, t->data));
		WT_ERR(__session_compact_check_timeout(session, start_time));
	}

err:
	__wt_scr_free(session, &t);

	/*
	 * The function is declared to return int: hand the status back,
	 * falling off the end would leave the caller with an undefined value.
	 */
	return (ret);
}
Exemplo n.º 16
0
/*
 * __wt_config_collapse --
 *	Collapse a set of configuration strings into newly allocated memory.
 *
 * This function takes a NULL-terminated list of configuration strings (where
 * the first one contains all the defaults and the values are in order from
 * least to most preferred, that is, the default values are least preferred),
 * and collapses them into newly allocated memory.  The algorithm is to walk
 * the first of the configuration strings, and for each entry, search all of
 * the configuration strings for a final value, keeping the last value found.
 *
 * Notes:
 *	Any key not appearing in the first configuration string is discarded
 *	from the final result, because we'll never search for it.
 *
 *	Nested structures aren't parsed.  For example, imagine a configuration
 *	string contains "key=(k2=v2,k3=v3)", and a subsequent string has
 *	"key=(k4=v4)", the result will be "key=(k4=v4)", as we search for and
 *	use the final value of "key", regardless of field overlap or missing
 *	fields in the nested value.
 *
 * On success returns 0 with the collapsed string in *config_ret.  On failure
 * the error is returned (EINVAL for a key that is neither a string nor an
 * identifier) and *config_ret is left NULL unless the final copy itself set
 * it.  The scratch buffer is released on every path.
 */
int
__wt_config_collapse(
    WT_SESSION_IMPL *session, const char **cfg, char **config_ret)
{
	WT_CONFIG cparser;
	WT_CONFIG_ITEM k, v;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;

	*config_ret = NULL;

	WT_RET(__wt_scr_alloc(session, 0, &tmp));

	/* Only the first string is walked, the others are searched. */
	__wt_config_init(session, &cparser, cfg[0]);
	while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
		/*
		 * Keys are length-delimited slices of the source string:
		 * bound the print by the key length so the message shows the
		 * key, not the remainder of the configuration string.
		 */
		if (k.type != WT_CONFIG_ITEM_STRING &&
		    k.type != WT_CONFIG_ITEM_ID)
			WT_ERR_MSG(session, EINVAL,
			    "Invalid configuration key found: '%.*s'",
			    (int)k.len, k.str);
		/* Replace the value just parsed with the key's final value. */
		WT_ERR(__wt_config_get(session, cfg, &k, &v));
		/* Include the quotes around string keys/values. */
		if (k.type == WT_CONFIG_ITEM_STRING) {
			--k.str;
			k.len += 2;
		}
		if (v.type == WT_CONFIG_ITEM_STRING) {
			--v.str;
			v.len += 2;
		}
		WT_ERR(__wt_buf_catfmt(session, tmp, "%.*s=%.*s,",
		    (int)k.len, k.str, (int)v.len, v.str));
	}

	/* We loop until error, and the expected error is WT_NOTFOUND. */
	if (ret != WT_NOTFOUND)
		goto err;

	/*
	 * If the caller passes us no valid configuration strings, we get here
	 * with no bytes to copy -- that's OK, the underlying string copy can
	 * handle empty strings.
	 *
	 * Strip any trailing comma.
	 */
	if (tmp->size != 0)
		--tmp->size;
	ret = __wt_strndup(session, tmp->data, tmp->size, config_ret);

err:	__wt_scr_free(session, &tmp);
	return (ret);
}
Exemplo n.º 17
0
/*
 * __session_begin_transaction --
 *	WT_SESSION->begin_transaction method.
 *
 *	Fails with EINVAL if the connection is not flagged transactional or if
 *	the session already has a running transaction.  Otherwise the
 *	session's cursors are reset and the result of __wt_txn_begin is
 *	returned.
 */
static int
__session_begin_transaction(WT_SESSION *wt_session, const char *config)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL(session, begin_transaction, config, cfg);
	/* Counted before validation, so rejected calls are counted as well. */
	WT_CSTAT_INCR(session, txn_begin);

	if (!F_ISSET(S2C(session), WT_CONN_TRANSACTIONAL))
		WT_ERR_MSG(session, EINVAL,
		    "Database not configured for transactions");
	if (F_ISSET(&session->txn, TXN_RUNNING))
		WT_ERR_MSG(session, EINVAL, "Transaction already running");

	/* Cursors are reset before the transaction is started. */
	WT_ERR(__session_reset_cursors(session));

	ret = __wt_txn_begin(session, cfg);

err:	API_END(session);
	return (ret);
}
Exemplo n.º 18
0
/*
 * __curindex_set_value --
 *	WT_CURSOR->set_value implementation for index cursors.
 *
 *	Setting a value is not supported on index cursors, so this always
 *	fails.  The function returns void: the error is stored in the
 *	cursor's saved_err field and the cursor's value-set flags are cleared.
 */
static void
__curindex_set_value(WT_CURSOR *cursor, ...)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	JOINABLE_CURSOR_API_CALL(cursor, session, set_value, NULL);
	WT_ERR_MSG(session, ENOTSUP,
	    "WT_CURSOR.set_value not supported for index cursors");

	/*
	 * ENOTSUP from the message above; presumably a different error if
	 * the API entry macro itself failed -- confirm against the macro.
	 */
err:	cursor->saved_err = ret;
	F_CLR(cursor, WT_CURSTD_VALUE_SET);
	API_END(session, ret);
}
Exemplo n.º 19
0
/*
 * __wt_cache_create --
 *	Build the cache that sits under a connection.
 *
 *	Expects conn->cache to be NULL on entry (asserted).  Returns 0 on
 *	success; failures after the configuration step go through
 *	__wt_cache_destroy before the error is returned.
 */
int
__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	conn = S2C(session);
	WT_ASSERT(session, conn->cache == NULL);

	WT_RET(__wt_calloc_one(session, &conn->cache));
	cache = conn->cache;

	/* Run-time configuration options are applied by a common routine. */
	WT_RET(__wt_cache_config(session, 0, cfg));

	/*
	 * No work would ever get done unless the trigger size is above the
	 * target size.
	 */
	if (cache->eviction_trigger <= cache->eviction_target)
		WT_ERR_MSG(session, EINVAL,
		    "eviction target must be lower than the eviction trigger");

	/* Condition variables first, then the two eviction locks. */
	WT_ERR(__wt_cond_alloc(
	    session, "cache eviction server", 0, &cache->evict_cond));
	WT_ERR(__wt_cond_alloc(
	    session, "eviction waiters", 0, &cache->evict_waiter_cond));
	WT_ERR(__wt_spin_init(
	    session, &cache->evict_lock, "cache eviction"));
	WT_ERR(__wt_spin_init(
	    session, &cache->evict_walk_lock, "cache walk"));

	/* Allocate the LRU eviction queue. */
	cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
	WT_ERR(__wt_calloc_def(
	    session, cache->evict_slots, &cache->evict));

	/*
	 * Some values live in the cache statistics rather than being kept
	 * twice, configure them.
	 */
	__wt_cache_stats_update(session);
	return (0);

err:	WT_RET(__wt_cache_destroy(session));
	return (ret);
}
Exemplo n.º 20
0
/*
 * __curjoin_next --
 *	WT_CURSOR::next for join cursors.
 *
 *	Pulls keys from the join's iterator until one is a member of every
 *	join entry, or until the iterator returns non-zero.  A cursor that
 *	has recorded an earlier error fails immediately with WT_ERROR.  Any
 *	error taken through the err label marks the cursor with
 *	WT_CURJOIN_ERROR.
 */
static int
__curjoin_next(WT_CURSOR *cursor)
{
	WT_CURSOR_JOIN *cjoin;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	bool skip_left;
	u_int i;

	cjoin = (WT_CURSOR_JOIN *)cursor;

	CURSOR_API_CALL(cursor, session, next, NULL);

	if (F_ISSET(cjoin, WT_CURJOIN_ERROR))
		WT_ERR_MSG(session, WT_ERROR,
		    "join cursor encountered previous error");
	/* The iterator is set up on first use. */
	if (!F_ISSET(cjoin, WT_CURJOIN_INITIALIZED))
		WT_ERR(__curjoin_init_iter(session, cjoin));

	/* Re-entered whenever a candidate key fails a membership test. */
nextkey:
	if ((ret = __curjoin_entry_iter_next(cjoin->iter, &cursor->key,
	    &cursor->recno)) == 0) {
		F_SET(cursor, WT_CURSTD_KEY_EXT);

		/*
		 * We may have already established membership for the
		 * 'left' case for the first entry, since we're
		 * using that in our iteration.
		 */
		skip_left = F_ISSET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT);
		for (i = 0; i < cjoin->entries_next; i++) {
			ret = __curjoin_entry_member(session, cjoin,
			    &cjoin->entries[i], skip_left);
			/* Not a member of this entry: try the next key. */
			if (ret == WT_NOTFOUND)
				goto nextkey;
			/* The skip only applies to the first entry. */
			skip_left = false;
			WT_ERR(ret);
		}
	}

	/*
	 * The block below is only entered through the err label; the normal
	 * path skips it, so a non-zero return from the iterator itself does
	 * not set the error flag.
	 */
	if (0) {
err:		F_SET(cjoin, WT_CURJOIN_ERROR);
	}
	API_END_RET(session, ret);
}
Exemplo n.º 21
0
/*
 * __session_checkpoint --
 *	WT_SESSION->checkpoint method.
 *
 *	Fails with EINVAL when the session has a running transaction.
 *	Otherwise the session's cursors are reset and __wt_txn_checkpoint is
 *	run with the schema lock held; its result leaves through the API exit
 *	macro.
 */
static int
__session_checkpoint(WT_SESSION *wt_session, const char *config)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;

	session = (WT_SESSION_IMPL *)wt_session;
	txn = &session->txn;

	/* Counted before API entry, so every call is counted. */
	WT_CSTAT_INCR(session, checkpoint);
	SESSION_API_CALL(session, checkpoint, config, cfg);

	/*
	 * Checkpoints require a snapshot to write a transactionally consistent
	 * snapshot of the data.
	 *
	 * We can't use an application's transaction: if it has uncommitted
	 * changes, they will be written in the checkpoint and may appear after
	 * a crash.
	 *
	 * Use a real snapshot transaction: we don't want any chance of the
	 * snapshot being updated during the checkpoint.  Eviction is prevented
	 * from evicting anything newer than this because we track the oldest
	 * transaction ID in the system that is not visible to all readers.
	 */
	if (F_ISSET(txn, TXN_RUNNING))
		WT_ERR_MSG(session, EINVAL,
		    "Checkpoint not permitted in a transaction");

	/*
	 * Reset open cursors.
	 *
	 * We do this here explicitly even though it will happen implicitly in
	 * the call to begin_transaction for the checkpoint, in case some
	 * implementation of WT_CURSOR::reset needs the schema lock.
	 */
	WT_ERR(__session_reset_cursors(session));

	/* The checkpoint's result lands in ret for the exit macro below. */
	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_txn_checkpoint(session, cfg));

err:	API_END_NOTFOUND_MAP(session, ret);
}
Exemplo n.º 22
0
/*
 * __backup_uri --
 *	Add the objects named in the "target" configuration to the backup.
 *
 *	*foundp is set to 1 if the target list names at least one object and
 *	to 0 otherwise.  Every target is handed to the schema worker with the
 *	list-append callback.  Returns 0 on success (including an empty
 *	list), EINVAL for a target that carries a value, or the error from a
 *	failing helper.
 */
static int
__backup_uri(WT_SESSION_IMPL *session,
    WT_CURSOR_BACKUP *cb, const char *cfg[], int *foundp)
{
	WT_CONFIG targetconf;
	WT_CONFIG_ITEM cval, k, v;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	int have_target;
	const char *uri;

	have_target = 0;
	*foundp = 0;

	/*
	 * A non-empty target configuration string means there's work to do,
	 * anything else is somebody else's problem.
	 */
	WT_RET(__wt_config_gets(session, cfg, "target", &cval));
	WT_RET(__wt_config_subinit(session, &targetconf, &cval));

	cb->list_next = 0;
	while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) {
		/* The scratch buffer is only needed once a target shows up. */
		if (have_target == 0) {
			*foundp = 1;
			have_target = 1;

			WT_ERR(__wt_scr_alloc(session, 512, &tmp));
		}

		/* Copy the key out, it isn't NUL-terminated in place. */
		WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
		uri = tmp->data;

		/* A target with a value attached is malformed. */
		if (v.len != 0)
			WT_ERR_MSG(session, EINVAL,
			    "%s: invalid backup target: URIs may need quoting",
			    uri);

		WT_ERR(__wt_schema_worker(
		    session, uri, NULL, __wt_backup_list_uri_append, cfg, 0));
	}
	/* Running off the end of the list is the normal way out. */
	WT_ERR_NOTFOUND_OK(ret);

err:	__wt_scr_free(&tmp);
	return (ret);
}
Exemplo n.º 23
0
/*
 * __wt_dlopen --
 *	Load a dynamic library and wrap it in a handle structure.
 *
 *	On success the new handle, carrying a copy of the path and the
 *	dlopen() result, is stored in *dlhp.  On failure the partially built
 *	handle is freed, *dlhp is not written and the error is returned.
 */
int
__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
{
	WT_DECL_RET;
	WT_DLH *dlh;

	WT_RET(__wt_calloc_one(session, &dlh));
	WT_ERR(__wt_strdup(session, path, &dlh->name));

	/* Symbols are resolved lazily. */
	dlh->handle = dlopen(path, RTLD_LAZY);
	if (dlh->handle == NULL)
		WT_ERR_MSG(
		    session, __wt_errno(), "dlopen(%s): %s", path, dlerror());

	*dlhp = dlh;
	return (ret);

	/* Failure: give back the name copy, then the handle itself. */
err:	__wt_free(session, dlh->name);
	__wt_free(session, dlh);
	return (ret);
}
Exemplo n.º 24
0
/*
 * __curds_compare --
 *	WT_CURSOR.compare method for the data-source cursor type.
 *
 *	Fails with EINVAL when the cursors have different internal URIs.
 *	Record-number cursors are ordered by record number, giving -1, 0 or 1
 *	in *cmpp; all other cursors have their keys compared with the first
 *	cursor's collator.
 */
static int
__curds_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	CURSOR_API_CALL(a, session, compare, NULL);

	/* Both cursors have to be open on the same source... */
	if (strcmp(a->internal_uri, b->internal_uri) != 0)
		WT_ERR_MSG(session, EINVAL,
		    "Cursors must reference the same object");

	/* ...and both need a key before they can be compared. */
	WT_ERR(__cursor_needkey(a));
	WT_ERR(__cursor_needkey(b));

	if (WT_CURSOR_RECNO(a))
		*cmpp = a->recno < b->recno ? -1 :
		    (a->recno == b->recno ? 0 : 1);
	else
		/*
		 * Data-sources are assumed not to supply a WT_CURSOR.compare
		 * method of their own: the key/value is copied out of the
		 * underlying data-source cursor, so the comparison can be
		 * done at this level.
		 */
		WT_ERR(__wt_compare(session,
		    ((WT_CURSOR_DATA_SOURCE *)a)->collator,
		    &a->key, &b->key, cmpp));

err:	API_END_RET(session, ret);
}
Exemplo n.º 25
0
/*
 * __curjoin_get_key --
 *	WT_CURSOR->get_key for join cursors.
 *
 *	Fails with EINVAL if the join cursor is not initialized or its
 *	iterator is not positioned.  Otherwise the variable arguments are
 *	forwarded to __wt_cursor_get_keyv together with the cursor's flags.
 */
static int
__curjoin_get_key(WT_CURSOR *cursor, ...)
{
	WT_CURSOR_JOIN *cjoin;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	va_list ap;

	cjoin = (WT_CURSOR_JOIN *)cursor;

	/*
	 * The argument list is started before the API entry macro, so the
	 * va_end at the err label always has a matching va_start.
	 */
	va_start(ap, cursor);
	JOINABLE_CURSOR_API_CALL(cursor, session, get_key, NULL);

	/* The initialized flag is tested before cjoin->iter is touched. */
	if (!F_ISSET(cjoin, WT_CURJOIN_INITIALIZED) ||
	    !cjoin->iter->positioned)
		WT_ERR_MSG(session, EINVAL,
		    "join cursor must be advanced with next()");
	WT_ERR(__wt_cursor_get_keyv(cursor, cursor->flags, ap));

err:	va_end(ap);
	API_END_RET(session, ret);
}
Exemplo n.º 26
0
/*
 * __curmetadata_compare --
 *	WT_CURSOR->compare method for the metadata cursor type.
 *
 *	Fails with EINVAL unless b is also a metadata cursor.  A cursor
 *	flagged WT_MDC_ONMETADATA compares greater than one that is not and
 *	equal to another that is; when neither is flagged the comparison is
 *	handed to the underlying file cursors.
 */
static int
__curmetadata_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
	WT_CURSOR *file_a, *file_b;
	WT_CURSOR_METADATA *mdc_a, *mdc_b;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	mdc_a = (WT_CURSOR_METADATA *)a;
	file_a = mdc_a->file_cursor;
	mdc_b = (WT_CURSOR_METADATA *)b;
	file_b = mdc_b->file_cursor;

	CURSOR_API_CALL(a, session,
	    compare, ((WT_CURSOR_BTREE *)file_a)->btree);

	/* The method pointer identifies the second cursor's type. */
	if (b->compare != __curmetadata_compare)
		WT_ERR_MSG(session, EINVAL,
		    "Can only compare cursors of the same type");

	WT_MD_CURSOR_NEEDKEY(a);
	WT_MD_CURSOR_NEEDKEY(b);

	if (F_ISSET(mdc_a, WT_MDC_ONMETADATA) &&
	    F_ISSET(mdc_b, WT_MDC_ONMETADATA))
		*cmpp = 0;
	else if (F_ISSET(mdc_a, WT_MDC_ONMETADATA))
		*cmpp = 1;
	else if (F_ISSET(mdc_b, WT_MDC_ONMETADATA))
		*cmpp = -1;
	else
		ret = file_a->compare(file_a, file_b, cmpp);

err:	API_END(session, ret);
	return (ret);
}
Exemplo n.º 27
0
/*
 * __session_open_cursor --
 *	WT_SESSION->open_cursor method.
 *
 *	Exactly one of uri and to_dup must be supplied: passing both, or
 *	neither, fails with EINVAL.  A cursor to duplicate is handed to
 *	__wt_cursor_dup; a URI is dispatched on its prefix to the matching
 *	cursor-open routine, and an unrecognized prefix fails with EINVAL.
 */
static int
__session_open_cursor(WT_SESSION *wt_session,
                      const char *uri, WT_CURSOR *to_dup, const char *config, WT_CURSOR **cursorp)
{
    WT_SESSION_IMPL *session;
    int ret;

    session = (WT_SESSION_IMPL *)wt_session;
    SESSION_API_CALL(session, open_cursor, config, cfg);

    /*
     * Reject "both" and "neither" alike: with neither argument, a NULL
     * uri would otherwise reach the prefix matching below.
     */
    if ((uri == NULL) == (to_dup == NULL))
        WT_ERR_MSG(session, EINVAL,
                   "should be passed either a URI or a cursor, but not both");

    /* From here on uri is non-NULL whenever to_dup is NULL. */
    if (to_dup != NULL)
        ret = __wt_cursor_dup(session, to_dup, config, cursorp);
    else if (WT_PREFIX_MATCH(uri, "colgroup:"))
        ret = __wt_curfile_open(session, uri, cfg, cursorp);
    else if (WT_PREFIX_MATCH(uri, "config:"))
        ret = __wt_curconfig_open(session, uri, cfg, cursorp);
    else if (WT_PREFIX_MATCH(uri, "file:"))
        ret = __wt_curfile_open(session, uri, cfg, cursorp);
    else if (WT_PREFIX_MATCH(uri, "index:"))
        ret = __wt_curindex_open(session, uri, cfg, cursorp);
    else if (WT_PREFIX_MATCH(uri, "statistics:"))
        ret = __wt_curstat_open(session, uri, cfg, cursorp);
    else if (WT_PREFIX_MATCH(uri, "table:"))
        ret = __wt_curtable_open(session, uri, cfg, cursorp);
    else {
        __wt_err(session, EINVAL, "Unknown cursor type '%s'", uri);
        ret = EINVAL;
    }

err:
    API_END_NOTFOUND_MAP(session, ret);
}
Exemplo n.º 28
0
Arquivo: os_fs.c Projeto: GYGit/mongo
/*
 * __posix_directory_sync --
 *	Flush a directory to ensure file creation, remove or rename is durable.
 *
 *	The directory is taken from path by cutting it after its last '/'.
 *	It is opened read-only, synced through __posix_sync and closed again.
 *	Returns 0 on success, EINVAL if path contains no '/', or the first
 *	error from open, sync or close (a close failure is logged, but only
 *	returned when the sync itself succeeded).
 */
static int
__posix_directory_sync(WT_SESSION_IMPL *session, const char *path)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	int fd, tret;
	char *dir, *sep;

	/* Work on a private copy, the path is truncated in place. */
	WT_RET(__wt_scr_alloc(session, 0, &tmp));
	WT_ERR(__wt_buf_setstr(session, tmp, path));

	/*
	 * This layer should never see a path that doesn't include a path
	 * separator.  Fail cleanly if it does, rather than writing through
	 * the NULL pointer strrchr returns in that case.
	 */
	dir = tmp->mem;
	if ((sep = strrchr(dir, '/')) == NULL)
		WT_ERR_MSG(session, EINVAL,
		    "%s: directory-sync: path has no directory component",
		    path);
	sep[1] = '\0';

	fd = -1;			/* -Wconditional-uninitialized */
	WT_SYSCALL_RETRY((
	    (fd = open(dir, O_RDONLY, 0444)) == -1 ? -1 : 0), ret);
	if (ret != 0)
		WT_ERR_MSG(session, ret, "%s: directory-sync: open", dir);

	ret = __posix_sync(session, fd, dir, "directory-sync");

	/* Always close; keep the sync error if there was one. */
	WT_SYSCALL(close(fd), tret);
	if (tret != 0) {
		__wt_err(session, tret, "%s: directory-sync: close", dir);
		if (ret == 0)
			ret = tret;
	}

err:	__wt_scr_free(session, &tmp);
	return (ret);
}
Exemplo n.º 29
0
/*
 * __curindex_compare --
 *	WT_CURSOR->compare method for the index cursor type.
 *
 *	Fails with EINVAL unless the first cursor's URI starts with "index:"
 *	and both cursors have identical URIs.  Otherwise the keys are
 *	compared with the first cursor's index collator, the result going to
 *	cmpp.
 */
static int
__curindex_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
	WT_CURSOR_INDEX *cindex;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cindex = (WT_CURSOR_INDEX *)a;
	JOINABLE_CURSOR_API_CALL(a, session, compare, NULL);

	/*
	 * Check both cursors are "index:" type.  Only a's prefix is tested
	 * explicitly, b is covered by requiring the URIs to be identical.
	 */
	if (!WT_PREFIX_MATCH(a->uri, "index:") ||
	    strcmp(a->uri, b->uri) != 0)
		WT_ERR_MSG(session, EINVAL,
		    "Cursors must reference the same object");

	/*
	 * Presumably these leave through the err label when a cursor has no
	 * key -- confirm against the macro definition.
	 */
	WT_CURSOR_CHECKKEY(a);
	WT_CURSOR_CHECKKEY(b);

	ret = __wt_compare(
	    session, cindex->index->collator, &a->key, &b->key, cmpp);

err:	API_END_RET(session, ret);
}
Exemplo n.º 30
0
/*
 * __ckpt_process --
 *	Process the list of checkpoints.
 *
 *	session: the calling session, handed to every helper called here.
 *	block: the block-manager handle; its "live" checkpoint, live_lock,
 *	    ckpt_inprogress flag, name and size are read and/or updated.
 *	ckptbase: the checkpoint array walked with WT_CKPT_FOREACH; entries
 *	    are selected by their WT_CKPT_FAKE, WT_CKPT_DELETE, WT_CKPT_ADD
 *	    and WT_CKPT_UPDATE flags.
 *
 *	Returns 0 on success.  Once the local "fatal" flag has been set, any
 *	failure is passed through __wt_block_panic before being returned.
 */
static int
__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
	WT_BLOCK_CKPT *a, *b, *ci;
	WT_CKPT *ckpt, *next_ckpt;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint64_t ckpt_size;
	bool deleting, fatal, locked;

	/*
	 * ci is the live checkpoint; fatal and locked steer the cleanup done
	 * at the err label.
	 */
	ci = &block->live;
	fatal = locked = false;

#ifdef HAVE_DIAGNOSTIC
	WT_RET(__ckpt_verify(session, ckptbase));
#endif

	/*
	 * Checkpoints are a two-step process: first, write a new checkpoint to
	 * disk (including all the new extent lists for modified checkpoints
	 * and the live system).  As part of this, create a list of file blocks
	 * newly available for reallocation, based on checkpoints being deleted.
	 * We then return the locations of the new checkpoint information to our
	 * caller.  Our caller has to write that information into some kind of
	 * stable storage, and once that's done, we can actually allocate from
	 * that list of newly available file blocks.  (We can't allocate from
	 * that list immediately because the allocation might happen before our
	 * caller saves the new checkpoint information, and if we crashed before
	 * the new checkpoint location was saved, we'd have overwritten blocks
	 * still referenced by checkpoints in the system.)  In summary, there is
	 * a second step: after our caller saves the checkpoint information, we
	 * are called to add the newly available blocks into the live system's
	 * available list.
	 *
	 * This function is the first step, the second step is in the resolve
	 * function.
	 *
	 * If we're called to checkpoint the same file twice (without the second
	 * resolution step), or re-entered for any reason, it's an error in our
	 * caller, and our choices are all bad: leak blocks or potentially crash
	 * with our caller not yet having saved previous checkpoint information
	 * to stable storage.
	 */
	__wt_spin_lock(session, &block->live_lock);
	if (block->ckpt_inprogress)
		ret = __wt_block_panic(session, EINVAL,
		    "%s: unexpected checkpoint ordering", block->name);
	else
		block->ckpt_inprogress = true;
	__wt_spin_unlock(session, &block->live_lock);
	/*
	 * NOTE(review): ret is only assigned on the panic path above, so this
	 * relies on WT_DECL_RET initializing ret to 0 -- confirm.
	 */
	WT_RET(ret);

	/*
	 * Extents newly available as a result of deleting previous checkpoints
	 * are added to a list of extents.  The list should be empty, but as
	 * described above, there is no "free the checkpoint information" call
	 * into the block manager; if there was an error in an upper level that
	 * resulted in some previous checkpoint never being resolved, the list
	 * may not be empty.  We should have caught that with the "checkpoint
	 * in progress" test, but it doesn't cost us anything to be cautious.
	 *
	 * We free the checkpoint's allocation and discard extent lists as part
	 * of the resolution step, not because they're needed at that time, but
	 * because it's potentially a lot of work, and waiting allows the btree
	 * layer to continue eviction sooner.  As for the checkpoint-available
	 * list, make sure they get cleaned out.
	 */
	__wt_block_extlist_free(session, &ci->ckpt_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &ci->ckpt_avail, "live", "ckpt_avail", true));
	__wt_block_extlist_free(session, &ci->ckpt_alloc);
	__wt_block_extlist_free(session, &ci->ckpt_discard);

	/*
	 * To delete a checkpoint, we'll need checkpoint information for it and
	 * the subsequent checkpoint into which it gets rolled; read them from
	 * disk before we lock things down.
	 */
	deleting = false;
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;
		deleting = true;

		/*
		 * Read the checkpoint and next checkpoint extent lists if we
		 * haven't already read them (we may have already read these
		 * extent blocks if there is more than one deleted checkpoint).
		 */
		if (ckpt->bpriv == NULL)
			WT_ERR(__ckpt_extlist_read(session, block, ckpt));

		/*
		 * NOTE(review): this scan has no explicit bound; it presumably
		 * relies on a non-fake entry (such as the WT_CKPT_ADD slot)
		 * always following a deleted checkpoint -- confirm.
		 */
		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * The "next" checkpoint may be the live tree which has no
		 * extent blocks to read.
		 */
		if (next_ckpt->bpriv == NULL &&
		    !F_ISSET(next_ckpt, WT_CKPT_ADD))
			WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
	}

	/*
	 * Failures are now fatal: we can't currently back out the merge of any
	 * deleted checkpoint extent lists into the live system's extent lists,
	 * so continuing after error would leave the live system's extent lists
	 * corrupted for any subsequent checkpoint (and potentially, should a
	 * subsequent checkpoint succeed, for recovery).
	 */
	fatal = true;

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if checkpoints take too
	 * much time away from real work: we read the historic checkpoint
	 * information without a lock, but we could also merge and re-write the
	 * deleted and merged checkpoint information without a lock, except for
	 * the final merge of ranges into the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = true;

	/*
	 * We've allocated our last page, update the checkpoint size.  We need
	 * to calculate the live system's checkpoint size before merging
	 * checkpoint allocation and discard information from the checkpoints
	 * we're deleting, those operations change the underlying byte counts.
	 */
	ckpt_size = ci->ckpt_size;
	ckpt_size += ci->alloc.bytes;
	ckpt_size -= ci->discard.bytes;

	/* Skip the additional processing if we aren't deleting checkpoints. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed checkpoints: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;

#ifdef HAVE_VERBOSE
		/*
		 * The scratch buffer is allocated on first use only and is
		 * reused for every deleted checkpoint that gets logged.
		 */
		if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__ckpt_string(
			    session, block, ckpt->raw.data, tmp));
			__wt_verbose(session, WT_VERB_CHECKPOINT,
			    "%s: delete-checkpoint: %s: %s",
			    block->name, ckpt->name, (const char *)tmp->data);
		}
#endif
		/*
		 * Find the checkpoint into which we'll roll this checkpoint's
		 * blocks: it's the next real checkpoint in the list, and it
		 * better have been read in (if it's not the add slot).
		 */
		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * Set the from/to checkpoint structures, where the "to" value
		 * may be the live tree.
		 */
		a = ckpt->bpriv;
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			b = &block->live;
		else
			b = next_ckpt->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the checkpoint's discard list, however, not the live system's
		 * list because it appears on the checkpoint's alloc list and so
		 * must be paired in the checkpoint.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session, block,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" checkpoint's extent
		 * lists, including the avail list.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * checkpoint's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, block, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, block, &a->discard, &b->discard));

		/*
		 * If the "to" checkpoint is also being deleted, we're done with
		 * it, it's merged into some other checkpoint in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * checkpoints, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" checkpoint's
		 * allocate and discard lists overlap, move the range to
		 * the live system's checkpoint available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			continue;

		/*
		 * We have to write the "to" checkpoint's extent lists out in
		 * new blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" checkpoint's extent
		 * lists; don't include the avail list, it's not changing.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));

		/* Flag it so the update pass after this loop rewrites it. */
		F_SET(next_ckpt, WT_CKPT_UPDATE);
	}

	/* Update checkpoints marked for update. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_UPDATE))
			WT_ERR(__ckpt_update(
			    session, block, ckpt, ckpt->bpriv, false));

live_update:
	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));

	/* Update the final, added checkpoint based on the live system. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_ADD)) {
			/*
			 * !!!
			 * Our caller wants the final checkpoint size.  Setting
			 * the size here violates layering, but the alternative
			 * is a call for the btree layer to crack the checkpoint
			 * cookie into its components, and that's a fair amount
			 * of work.
			 */
			ckpt->ckpt_size = ckpt_size;

			/*
			 * Set the rolling checkpoint size for the live system.
			 * The current size includes the current checkpoint's
			 * root page size (root pages are on the checkpoint's
			 * block allocation list as root pages are allocated
			 * with the usual block allocation functions). That's
			 * correct, but we don't want to include it in the size
			 * for the next checkpoint.
			 */
			ckpt_size -= ci->root_size;

			/*
			 * Additionally, we had a bug for awhile where the live
			 * checkpoint size grew without bound. We can't sanity
			 * check the value, that would require walking the tree
			 * as part of the checkpoint. Bound any bug at the size
			 * of the file.
			 * It isn't practical to assert that the value is within
			 * bounds since databases created with older versions
			 * of WiredTiger (2.8.0) would likely see an error.
			 */
			ci->ckpt_size =
			    WT_MIN(ckpt_size, (uint64_t)block->size);

			WT_ERR(__ckpt_update(session, block, ckpt, ci, true));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.  This includes freeing a lot of extents, so do it
	 * outside of the system's lock by copying and resetting the original,
	 * then doing the work later.
	 */
	ci->ckpt_alloc = ci->alloc;
	WT_ERR(__wt_block_extlist_init(
	    session, &ci->alloc, "live", "alloc", false));
	ci->ckpt_discard = ci->discard;
	WT_ERR(__wt_block_extlist_init(
	    session, &ci->discard, "live", "discard", false));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first checkpoint in the system should always have an empty
	 * discard list.  If we've read that checkpoint and/or created it,
	 * check.
	 *
	 * The loop below stops at the first entry not flagged for deletion
	 * (or wherever WT_CKPT_FOREACH ends); if that entry has no loaded
	 * checkpoint information, the live system is checked instead.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (!F_ISSET(ckpt, WT_CKPT_DELETE))
			break;
	if ((a = ckpt->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0)
		WT_ERR_MSG(session, WT_ERROR,
		    "first checkpoint incorrectly has blocks on the discard "
		    "list");
#endif

	/*
	 * Common exit path for success and failure: escalate errors to a
	 * panic once past the point of no return, then drop the lock if it
	 * was taken.
	 */
err:	if (ret != 0 && fatal)
		ret = __wt_block_panic(session, ret,
		    "%s: fatal checkpoint failure", block->name);

	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/*
	 * Discard any checkpoint information we loaded.  Note ci is reused
	 * as a scratch pointer here and no longer refers to the live
	 * checkpoint once this loop has run.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if ((ci = ckpt->bpriv) != NULL)
			__wt_block_ckpt_destroy(session, ci);

	/*
	 * NOTE(review): tmp is only allocated on the verbose path, so this
	 * presumably tolerates a buffer that was never allocated -- confirm.
	 */
	__wt_scr_free(session, &tmp);
	return (ret);
}