Example #1
/*
 * __wt_curfile_create --
 *	Open a cursor for a given btree handle.
 */
int
__wt_curfile_create(WT_SESSION_IMPL *session,
    WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	static WT_CURSOR iface = {
		NULL,
		NULL,
		NULL,
		NULL,
		NULL,
		NULL,
		NULL,
		NULL,
		__curfile_compare,
		__curfile_next,
		__curfile_prev,
		__curfile_reset,
		__curfile_search,
		__curfile_search_near,
		__curfile_insert,
		__curfile_update,
		__curfile_remove,
		__curfile_close,
		{ NULL, NULL },		/* TAILQ_ENTRY q */
		0,			/* recno key */
		{ 0 },			/* recno raw buffer */
		{ NULL, 0, 0, NULL, 0 },/* WT_ITEM key */
		{ NULL, 0, 0, NULL, 0 },/* WT_ITEM value */
		0,			/* int saved_err */
		0			/* uint32_t flags */
	};
	WT_BTREE *btree;
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	size_t csize;
	int bitmap, bulk;

	cbt = NULL;

	btree = session->btree;
	WT_ASSERT(session, btree != NULL);

	WT_RET(__wt_config_gets_defno(session, cfg, "bulk", &cval));
	if ((cval.type == ITEM_ID || cval.type == ITEM_STRING) &&
	    WT_STRING_MATCH("bitmap", cval.str, cval.len))
		bitmap = bulk = 1;
	else {
		bitmap = 0;
		bulk = (cval.val != 0);
	}

	csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
	WT_RET(__wt_calloc(session, 1, csize, &cbt));

	cursor = &cbt->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->uri = btree->name;
	cursor->key_format = btree->key_format;
	cursor->value_format = btree->value_format;

	cbt->btree = session->btree;
	if (bulk)
		WT_ERR(__wt_curbulk_init((WT_CURSOR_BULK *)cbt, bitmap));

	/*
	 * no_cache
	 * No cache cursors are read-only.
	 */
	WT_ERR(__wt_config_gets_defno(session, cfg, "no_cache", &cval));
	if (cval.val != 0) {
		cursor->insert = __wt_cursor_notsup;
		cursor->update = __wt_cursor_notsup;
		cursor->remove = __wt_cursor_notsup;
	}

	/*
	 * random_retrieval
	 * Random retrieval cursors only support next, reset and close.
	 */
	WT_ERR(__wt_config_gets_defno(session, cfg, "next_random", &cval));
	if (cval.val != 0) {
		__wt_cursor_set_notsup(cursor);
		cursor->next = __curfile_next_random;
		cursor->reset = __curfile_reset;
	}

	/* __wt_cursor_init is last so we don't have to clean up on error. */
	STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);
	WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp));

	if (0) {
err:		__wt_free(session, cbt);
	}

	return (ret);
}
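
A minimal application-level sketch (public WiredTiger API, not part of the function above): the internal cursor constructor is reached through WT_SESSION::open_cursor, and the "bulk" and "next_random" options it parses map to these calls. The file name and the already-open session are hypothetical; error handling is omitted.

WT_CURSOR *bulk_cursor, *rand_cursor;

/* Bulk-load cursor: rows must be inserted in sorted key order. */
session->open_cursor(session, "file:example.wt", NULL, "bulk", &bulk_cursor);

/* Random-retrieval cursor: only next, reset and close are supported. */
session->open_cursor(
    session, "file:example.wt", NULL, "next_random=true", &rand_cursor);
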
Example #2
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet been merged.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM key;
	uint64_t insert_count;

	WT_RET(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

	bloom = NULL;
	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;
	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	/*
	 * Setup so that we don't hold pages we read into cache, and so
	 * that we don't get stuck if the cache is full. If we allow
	 * ourselves to get stuck creating bloom filters, the entire tree
	 * can stall since there may be no worker threads available to flush.
	 */
	F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		__wt_bloom_insert(bloom, &key);
	}
	WT_ERR_NOTFOUND_OK(ret);
	WT_TRET(src->close(src));

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	__wt_verbose(session, WT_VERB_LSM,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count);

	/* Ensure the bloom filter is in the metadata. */
	__wt_lsm_tree_writelock(session, lsm_tree);
	F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	__wt_lsm_tree_writeunlock(session, lsm_tree);

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

err:	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	return (ret);
}
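
For context, a hedged sketch of where the Bloom parameters come from: they are set (or defaulted) in the LSM tree's creation configuration via the public API, which is where lsm_tree->bloom_bit_count and lsm_tree->bloom_hash_count above originate. The URI and values are hypothetical.

session->create(session, "lsm:example",
    "key_format=S,value_format=S,"
    "lsm=(bloom=true,bloom_bit_count=16,bloom_hash_count=8)");
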
Example #3
/*
 * __wt_lsm_get_chunk_to_flush --
 *	Find and pin a chunk in the LSM tree that is likely to need flushing.
 */
int
__wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, bool force, WT_LSM_CHUNK **chunkp)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk, *evict_chunk, *flush_chunk;
	uint32_t i;

	*chunkp = NULL;
	chunk = evict_chunk = flush_chunk = NULL;

	WT_ASSERT(session, lsm_tree->queue_ref > 0);
	__wt_lsm_tree_readlock(session, lsm_tree);
	if (!lsm_tree->active || lsm_tree->nchunks == 0) {
		__wt_lsm_tree_readunlock(session, lsm_tree);
		return (0);
	}

	/* Search for a chunk to evict and/or a chunk to flush. */
	for (i = 0; i < lsm_tree->nchunks; i++) {
		chunk = lsm_tree->chunk[i];
		if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
			/*
			 * Normally we don't want to force out the last chunk.
			 * But if we're doing a forced flush on behalf of a
			 * compact, then we want to include the final chunk.
			 */
			if (evict_chunk == NULL &&
			    !chunk->evicted &&
			    !F_ISSET(chunk, WT_LSM_CHUNK_STABLE))
				evict_chunk = chunk;
		} else if (flush_chunk == NULL &&
		    chunk->switch_txn != 0 &&
		    (force || i < lsm_tree->nchunks - 1))
			flush_chunk = chunk;
	}

	/*
	 * Don't be overly zealous about pushing old chunks from cache.
	 * Attempting too many drops can interfere with checkpoints.
	 *
	 * If retrying a discard push an additional work unit so there are
	 * enough to trigger checkpoints.
	 */
	if (evict_chunk != NULL && flush_chunk != NULL) {
		chunk = (__wt_random(&session->rnd) & 1) ?
		    evict_chunk : flush_chunk;
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
	} else
		chunk = (evict_chunk != NULL) ? evict_chunk : flush_chunk;

	if (chunk != NULL) {
		__wt_verbose(session, WT_VERB_LSM,
		    "Flush%s: return chunk %" PRIu32 " of %" PRIu32 ": %s",
		    force ? " w/ force" : "",
		    i, lsm_tree->nchunks, chunk->uri);

		(void)__wt_atomic_add32(&chunk->refcnt, 1);
	}

err:	__wt_lsm_tree_readunlock(session, lsm_tree);

	*chunkp = chunk;
	return (ret);
}
Example #4
/*
 * __create_file --
 *	Create a new 'file:' object.
 */
static int
__create_file(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_DECL_ITEM(val);
	WT_DECL_RET;
	uint32_t allocsize;
	int is_metadata;
	const char *filename, **p, *filecfg[] =
	    { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL };
	char *fileconf;

	fileconf = NULL;

	is_metadata = strcmp(uri, WT_METAFILE_URI) == 0;

	filename = uri;
	if (!WT_PREFIX_SKIP(filename, "file:"))
		WT_RET_MSG(session, EINVAL, "Expected a 'file:' URI: %s", uri);

	/* Check if the file already exists. */
	if (!is_metadata && (ret =
	    __wt_metadata_search(session, uri, &fileconf)) != WT_NOTFOUND) {
		if (exclusive)
			WT_TRET(EEXIST);
		goto err;
	}

	/* Sanity check the allocation size. */
	WT_RET(__wt_direct_io_size_check(
	    session, filecfg, "allocation_size", &allocsize));

	/* Create the file. */
	WT_ERR(__wt_block_manager_create(session, filename, allocsize));
	if (WT_META_TRACKING(session))
		WT_ERR(__wt_meta_track_fileop(session, NULL, uri));

	/*
	 * If creating an ordinary file, append the file ID and current version
	 * numbers to the passed-in configuration and insert the resulting
	 * configuration into the metadata.
	 */
	if (!is_metadata) {
		WT_ERR(__wt_scr_alloc(session, 0, &val));
		WT_ERR(__wt_buf_fmt(session, val,
		    "id=%" PRIu32 ",version=(major=%d,minor=%d)",
		    ++S2C(session)->next_file_id,
		    WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
		for (p = filecfg; *p != NULL; ++p)
			;
		*p = val->data;
		WT_ERR(__wt_config_collapse(session, filecfg, &fileconf));
		WT_ERR(__wt_metadata_insert(session, uri, fileconf));
	}

	/*
	 * Open the file to check that it was setup correctly. We don't need to
	 * pass the configuration, we just wrote the collapsed configuration
	 * into the metadata file, and it's going to be read/used by underlying
	 * functions.
	 *
	 * Keep the handle exclusive until it is released at the end of the
	 * call, otherwise we could race with a drop.
	 */
	WT_ERR(__wt_session_get_btree(
	    session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
	if (WT_META_TRACKING(session))
		WT_ERR(__wt_meta_track_handle_lock(session, 1));
	else
		WT_ERR(__wt_session_release_btree(session));

err:	__wt_scr_free(session, &val);
	__wt_free(session, fileconf);
	return (ret);
}
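
A minimal public-API sketch, assuming an open session: a 'file:' object is normally created through WT_SESSION::create, and the allocation_size option corresponds to the value sanity-checked by __wt_direct_io_size_check above. The URI and sizes are hypothetical.

session->create(session, "file:example.wt",
    "key_format=S,value_format=S,allocation_size=4KB");
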
Example #5
/*
 * __wt_lsm_work_bloom --
 *	Try to create a Bloom filter for the newest on-disk chunk that doesn't
 *	have one.
 */
int
__wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_WORKER_COOKIE cookie;
	u_int i, merge;

	WT_CLEAR(cookie);

	WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, false));

	/* Create bloom filters in all checkpointed chunks. */
	merge = 0;
	for (i = 0; i < cookie.nchunks; i++) {
		chunk = cookie.chunk_array[i];

		/*
		 * Skip if a thread is still active in the chunk or it
		 * isn't suitable.
		 */
		if (!F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ||
		    F_ISSET(chunk, WT_LSM_CHUNK_BLOOM | WT_LSM_CHUNK_MERGING) ||
		    chunk->generation > 0 ||
		    chunk->count == 0)
			continue;

		/* Never create a bloom filter on the oldest chunk */
		if (chunk == lsm_tree->chunk[0] &&
		    !FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))
			continue;
		/*
		 * See if we win the race to switch on the "busy" flag and
		 * recheck that the chunk still needs a Bloom filter.
		 */
		if (__wt_atomic_cas32(&chunk->bloom_busy, 0, 1)) {
			if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
				ret = __lsm_bloom_create(
				    session, lsm_tree, chunk, (u_int)i);
				/*
				 * Record if we were successful so that we can
				 * later push a merge work unit.
				 */
				if (ret == 0)
					merge = 1;
			}
			chunk->bloom_busy = 0;
			break;
		}
	}
	/*
	 * If we created any bloom filters, we push a merge work unit now.
	 */
	if (merge)
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_MERGE, 0, lsm_tree));

err:
	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);
	return (ret);
}
Example #6
/*
 * __open_verbose --
 *	Optionally output a verbose message on handle open.
 */
static inline int
__open_verbose(WT_SESSION_IMPL *session,
               const char *name, uint32_t file_type, uint32_t flags)
{
#ifdef HAVE_VERBOSE
    WT_DECL_RET;
    WT_DECL_ITEM(tmp);
    const char *file_type_tag, *sep;

    if (!WT_VERBOSE_ISSET(session, WT_VERB_FILEOPS))
        return (0);

    /*
     * It's useful to track file opens when debugging platforms, so take
     * some effort to output good tracking information.
     */

    switch (file_type) {
    case WT_FILE_TYPE_CHECKPOINT:
        file_type_tag = "checkpoint";
        break;
    case WT_FILE_TYPE_DATA:
        file_type_tag = "data";
        break;
    case WT_FILE_TYPE_DIRECTORY:
        file_type_tag = "directory";
        break;
    case WT_FILE_TYPE_LOG:
        file_type_tag = "log";
        break;
    case WT_FILE_TYPE_REGULAR:
        file_type_tag = "regular";
        break;
    default:
        file_type_tag = "unknown open type";
        break;
    }

    WT_RET(__wt_scr_alloc(session, 0, &tmp));
    sep = " (";
#define	WT_OPEN_VERBOSE_FLAG(f, name)					\
	if (LF_ISSET(f)) {						\
		WT_ERR(__wt_buf_catfmt(					\
		    session, tmp, "%s%s", sep, name));			\
		sep = ", ";						\
	}

    WT_OPEN_VERBOSE_FLAG(WT_OPEN_CREATE, "create");
    WT_OPEN_VERBOSE_FLAG(WT_OPEN_EXCLUSIVE, "exclusive");
    WT_OPEN_VERBOSE_FLAG(WT_OPEN_FIXED, "fixed");
    WT_OPEN_VERBOSE_FLAG(WT_OPEN_READONLY, "readonly");
    WT_OPEN_VERBOSE_FLAG(WT_STREAM_APPEND, "stream-append");
    WT_OPEN_VERBOSE_FLAG(WT_STREAM_READ, "stream-read");
    WT_OPEN_VERBOSE_FLAG(WT_STREAM_WRITE, "stream-write");

    if (tmp->size != 0)
        WT_ERR(__wt_buf_catfmt(session, tmp, ")"));

    ret = __wt_verbose(session, WT_VERB_FILEOPS,
                       "%s: handle-open: type %s%s",
                       name, file_type_tag, tmp->size == 0 ? "" : (char *)tmp->data);

err:
    __wt_scr_free(session, &tmp);
    return (ret);
#else
    WT_UNUSED(session);
    WT_UNUSED(name);
    WT_UNUSED(file_type);
    WT_UNUSED(flags);
    return (0);
#endif
}
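
A hedged usage sketch: the message above is only emitted when the build defines HAVE_VERBOSE and the "fileops" verbose category is enabled at connection open. The home directory string is hypothetical.

WT_CONNECTION *conn;

wiredtiger_open("/path/to/home", NULL, "create,verbose=[fileops]", &conn);
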
Example #7
/*
 * __create_index --
 *	Create an index.
 */
static int
__create_index(WT_SESSION_IMPL *session,
    const char *name, int exclusive, const char *config)
{
	WT_CONFIG kcols, pkcols;
	WT_CONFIG_ITEM ckey, cval, icols, kval;
	WT_DECL_PACK_VALUE(pv);
	WT_DECL_RET;
	WT_ITEM confbuf, extra_cols, fmt, namebuf;
	WT_PACK pack;
	WT_TABLE *table;
	const char *cfg[4] =
	    { WT_CONFIG_BASE(session, index_meta), NULL, NULL, NULL };
	const char *sourcecfg[] = { config, NULL, NULL };
	const char *source, *sourceconf, *idxname, *tablename;
	char *idxconf;
	size_t tlen;
	int have_extractor;
	u_int i, npublic_cols;

	sourceconf = NULL;
	idxconf = NULL;
	WT_CLEAR(confbuf);
	WT_CLEAR(fmt);
	WT_CLEAR(extra_cols);
	WT_CLEAR(namebuf);
	have_extractor = 0;

	tablename = name;
	if (!WT_PREFIX_SKIP(tablename, "index:"))
		return (EINVAL);
	idxname = strchr(tablename, ':');
	if (idxname == NULL)
		WT_RET_MSG(session, EINVAL, "Invalid index name, "
		    "should be <table name>:<index name>: %s", name);

	tlen = (size_t)(idxname++ - tablename);
	if ((ret =
	    __wt_schema_get_table(session, tablename, tlen, 1, &table)) != 0)
		WT_RET_MSG(session, ret,
		    "Can't create an index for a non-existent table: %.*s",
		    (int)tlen, tablename);

	if (table->is_simple)
		WT_RET_MSG(session, EINVAL,
		    "%s requires a table with named columns", name);

	if (__wt_config_getones(session, config, "source", &cval) == 0) {
		WT_ERR(__wt_buf_fmt(session, &namebuf,
		    "%.*s", (int)cval.len, cval.str));
		source = namebuf.data;
	} else {
		WT_ERR(__wt_schema_index_source(
		    session, table, idxname, config, &namebuf));
		source = namebuf.data;

		/* Add the source name to the index config before collapsing. */
		WT_ERR(__wt_buf_catfmt(session, &confbuf,
		    ",source=\"%s\"", source));
	}

	if (__wt_config_getones_none(
	    session, config, "extractor", &cval) == 0 && cval.len != 0) {
		have_extractor = 1;
		/* Custom extractors must supply a key format. */
		if ((ret = __wt_config_getones(
		    session, config, "key_format", &kval)) != 0)
			WT_ERR_MSG(session, EINVAL,
			    "%s: custom extractors require a key_format", name);
	}

	/* Calculate the key/value formats. */
	WT_CLEAR(icols);
	if (__wt_config_getones(session, config, "columns", &icols) != 0 &&
	    !have_extractor)
		WT_ERR_MSG(session, EINVAL,
		    "%s: requires 'columns' configuration", name);

	/*
	 * Count the public columns using the declared columns for normal
	 * indices or the key format for custom extractors.
	 */
	npublic_cols = 0;
	if (!have_extractor) {
		WT_ERR(__wt_config_subinit(session, &kcols, &icols));
		while ((ret = __wt_config_next(&kcols, &ckey, &cval)) == 0)
			++npublic_cols;
		WT_ERR_NOTFOUND_OK(ret);
	} else {
		WT_ERR(__pack_initn(session, &pack, kval.str, kval.len));
		while ((ret = __pack_next(&pack, &pv)) == 0)
			++npublic_cols;
		WT_ERR_NOTFOUND_OK(ret);
	}

	/*
	 * The key format for an index is somewhat subtle: the application
	 * specifies a set of columns that it will use for the key, but the
	 * engine usually adds some hidden columns in order to derive the
	 * primary key.  These hidden columns are part of the source's
	 * key_format, which we are calculating now, but not part of an index
	 * cursor's key_format.
	 */
	WT_ERR(__wt_config_subinit(session, &pkcols, &table->colconf));
	for (i = 0; i < table->nkey_columns &&
	    (ret = __wt_config_next(&pkcols, &ckey, &cval)) == 0;
	    i++) {
		/*
		 * If the primary key column is already in the secondary key,
		 * don't add it again.
		 */
		if (__wt_config_subgetraw(session, &icols, &ckey, &cval) == 0) {
			if (have_extractor)
				WT_ERR_MSG(session, EINVAL,
				    "an index with a custom extractor may not "
				    "include primary key columns");
			continue;
		}
		WT_ERR(__wt_buf_catfmt(
		    session, &extra_cols, "%.*s,", (int)ckey.len, ckey.str));
	}
	if (ret != 0 && ret != WT_NOTFOUND)
		goto err;

	/* Index values are empty: all columns are packed into the index key. */
	WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,key_format="));

	if (have_extractor) {
		WT_ERR(__wt_buf_catfmt(session, &fmt, "%.*s",
		    (int)kval.len, kval.str));
		WT_CLEAR(icols);
	}

	/*
	 * Construct the index key format, or append the primary key columns
	 * for custom extractors.
	 */
	WT_ERR(__wt_struct_reformat(session, table,
	    icols.str, icols.len, (const char *)extra_cols.data, 0, &fmt));

	/* Check for a record number index key, which makes no sense. */
	WT_ERR(__wt_config_getones(session, fmt.data, "key_format", &cval));
	if (cval.len == 1 && cval.str[0] == 'r')
		WT_ERR_MSG(session, EINVAL,
		    "column-store index may not use the record number as its "
		    "index key");

	WT_ERR(__wt_buf_catfmt(
	    session, &fmt, ",index_key_columns=%u", npublic_cols));

	sourcecfg[1] = fmt.data;
	WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf));

	WT_ERR(__wt_schema_create(session, source, sourceconf));

	cfg[1] = sourceconf;
	cfg[2] = confbuf.data;
	WT_ERR(__wt_config_collapse(session, cfg, &idxconf));
	if ((ret = __wt_metadata_insert(session, name, idxconf)) != 0) {
		/*
		 * If the entry already exists in the metadata, we're done.
		 * This is an error for exclusive creates but okay otherwise.
		 */
		if (ret == WT_DUPLICATE_KEY)
			ret = exclusive ? EEXIST : 0;
		goto err;
	}

	/* Make sure that the configuration is valid. */
	WT_ERR(__wt_schema_open_index(
	    session, table, idxname, strlen(idxname), NULL));

err:	__wt_free(session, idxconf);
	__wt_free(session, sourceconf);
	__wt_buf_free(session, &confbuf);
	__wt_buf_free(session, &extra_cols);
	__wt_buf_free(session, &fmt);
	__wt_buf_free(session, &namebuf);

	__wt_schema_release_table(session, table);
	return (ret);
}
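
A minimal public-API sketch of the calls that end up here, assuming an open session; table, column and index names are hypothetical. The "columns" list on the index becomes the icols configuration parsed above.

/* A table with named columns (required for indices). */
session->create(session, "table:books",
    "key_format=S,value_format=Si,columns=(isbn,author,year)");

/* A secondary index on one of the value columns. */
session->create(session, "index:books:author", "columns=(author)");
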
Example #8
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet been merged.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM buf, key;
	WT_SESSION *wt_session;
	uint64_t insert_count;
	int exist;

	/*
	 * Normally, the Bloom URI is populated when the chunk struct is
	 * allocated.  After an open, however, it may not have been.
	 * Deal with that here.
	 */
	if (chunk->bloom_uri == NULL) {
		WT_CLEAR(buf);
		WT_RET(__wt_lsm_tree_bloom_name(
		    session, lsm_tree, chunk->id, &buf));
		chunk->bloom_uri = __wt_buf_steal(session, &buf, NULL);
	}

	/*
	 * Drop the bloom filter first - there may be some content hanging over
	 * from an aborted merge or checkpoint.
	 */
	wt_session = &session->iface;
	WT_RET(__wt_exist(session, chunk->bloom_uri + strlen("file:"), &exist));
	if (exist)
		WT_RET(wt_session->drop(wt_session, chunk->bloom_uri, "force"));

	bloom = NULL;
	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;
	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	F_SET(session, WT_SESSION_NO_CACHE);
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);
	WT_TRET(src->close(src));

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	WT_VERBOSE_ERR(session, lsm,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count);

	/* Ensure the bloom filter is in the metadata. */
	WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
	F_SET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

err:	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	F_CLR(session, WT_SESSION_NO_CACHE);
	return (ret);
}
Example #9
/*
 * __lsm_free_chunks --
 *	Try to drop chunks from the tree that are no longer required.
 */
static int
__lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_WORKER_COOKIE cookie;
	u_int i, skipped;
	int progress;

	/*
	 * Take a copy of the current state of the LSM tree and look for chunks
	 * to drop.  We do it this way to avoid holding the LSM tree lock while
	 * doing I/O or waiting on the schema lock.
	 *
	 * This is safe because only one thread will be in this function at a
	 * time (the first merge thread).  Merges may complete concurrently,
	 * and the old_chunks array may be extended, but we shuffle down the
	 * pointers each time we free one to keep the non-NULL slots at the
	 * beginning of the array.
	 */
	WT_CLEAR(cookie);
	WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 1));
	for (i = skipped = 0, progress = 0; i < cookie.nchunks; i++) {
		chunk = cookie.chunk_array[i];
		WT_ASSERT(session, chunk != NULL);
		/* Skip the chunk if another worker is using it. */
		if (chunk->refcnt > 1) {
			++skipped;
			continue;
		}

		if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM)) {
			/*
			 * An EBUSY return is acceptable - a cursor may still
			 * be positioned on this old chunk.
			 */
			if ((ret = __lsm_drop_file(
			    session, chunk->bloom_uri)) == EBUSY) {
				WT_VERBOSE_ERR(session, lsm,
				    "LSM worker bloom drop busy: %s.",
				    chunk->bloom_uri);
				++skipped;
				continue;
			} else
				WT_ERR(ret);

			F_CLR_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
		}
		if (chunk->uri != NULL) {
			/*
			 * An EBUSY return is acceptable - a cursor may still
			 * be positioned on this old chunk.
			 */
			if ((ret = __lsm_drop_file(
			    session, chunk->uri)) == EBUSY) {
				WT_VERBOSE_ERR(session, lsm,
				    "LSM worker drop busy: %s.",
				    chunk->uri);
				++skipped;
				continue;
			} else
				WT_ERR(ret);
		}

		progress = 1;

		/* Lock the tree to clear out the old chunk information. */
		WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));

		/*
		 * The chunk we are looking at should be the first one in the
		 * tree that we haven't already skipped over.
		 */
		WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk);
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, lsm_tree->old_chunks[skipped]);

		/* Shuffle down to keep all occupied slots at the beginning. */
		if (--lsm_tree->nold_chunks > skipped) {
			memmove(lsm_tree->old_chunks + skipped,
			    lsm_tree->old_chunks + skipped + 1,
			    (lsm_tree->nold_chunks - skipped) *
			    sizeof(WT_LSM_CHUNK *));
			lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL;
		}
		/*
		 * Clear the chunk in the cookie so we don't attempt to
		 * decrement the reference count.
		 */
		cookie.chunk_array[i] = NULL;

		/*
		 * Update the metadata.  We used to try to optimize by only
		 * updating the metadata once at the end, but the error
		 * handling is not straightforward.
		 */
		WT_TRET(__wt_lsm_meta_write(session, lsm_tree));
		WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
	}

err:	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);

	/* Returning non-zero means there is no work to do. */
	if (!progress)
		WT_TRET(WT_NOTFOUND);

	return (ret);
}
Example #10
/*
 * __wt_btcur_prev --
 *	Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	uint32_t flags;
	int skipped, newpage;

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	WT_STAT_FAST_CONN_INCR(session, cursor_prev);
	WT_STAT_FAST_DATA_INCR(session, cursor_prev);

	flags = WT_READ_PREV | WT_READ_SKIP_INTL;	/* Tree walk flags. */
	if (truncating)
		LF_SET(WT_READ_TRUNCATE);

	WT_RET(__cursor_func_init(cbt, 0));

	/*
	 * If we aren't already iterating in the right direction, there's
	 * some setup to do.
	 */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
		__wt_btcur_iterate_setup(cbt, 0);

	/*
	 * Walk any page we're holding until the underlying call returns not-
	 * found.  Then, move to the previous page, until we reach the start
	 * of the file.
	 */
	for (skipped = newpage = 0;; skipped = 0, newpage = 1) {
		page = cbt->ref == NULL ? NULL : cbt->ref->page;
		WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page));

		/*
		 * The last page in a column-store has appended entries.
		 * We handle it separately from the usual cursor code:
		 * it's only that one page and it's in a simple format.
		 */
		if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF &&
		    (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
			F_SET(cbt, WT_CBT_ITERATE_APPEND);

		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_prev(
				    cbt, newpage, &skipped);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret == 0)
				break;
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;
			newpage = 1;
		}
		if (page != NULL) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_prev(cbt, newpage, &skipped);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_prev(cbt, newpage, &skipped);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret != WT_NOTFOUND)
				break;
		}

		if (newpage && skipped)
			page->read_gen = WT_READGEN_OLDEST;

		WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
		WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
	}

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
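
From the application, this function is reached through WT_CURSOR::prev. A minimal reverse-scan sketch, assuming an open session and a hypothetical object with key_format=S and value_format=S; error handling is abbreviated.

WT_CURSOR *c;
const char *key, *value;
int ret;

session->open_cursor(session, "table:example", NULL, NULL, &c);
while ((ret = c->prev(c)) == 0) {
    c->get_key(c, &key);
    c->get_value(c, &value);
}
/* ret == WT_NOTFOUND marks the start of the object; anything else is an error. */
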
Example #11
/*
 * __wt_lsm_checkpoint_worker --
 *	A worker thread for an LSM tree, responsible for flushing new chunks to
 *	disk.
 */
void *
__wt_lsm_checkpoint_worker(void *arg)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_LSM_WORKER_COOKIE cookie;
	WT_SESSION_IMPL *session;
	WT_TXN_ISOLATION saved_isolation;
	u_int i, j;
	int locked;

	lsm_tree = arg;
	session = lsm_tree->ckpt_session;

	WT_CLEAR(cookie);

	while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
		if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
			WT_WITH_SCHEMA_LOCK(session, ret =
			    __wt_lsm_tree_switch(session, lsm_tree));
			WT_ERR(ret);
		}

		WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));

		/* Write checkpoints in all completed files. */
		for (i = 0, j = 0; i < cookie.nchunks - 1; i++) {
			if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING))
				goto err;

			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				break;

			chunk = cookie.chunk_array[i];

			/* Stop if a running transaction needs the chunk. */
			__wt_txn_update_oldest(session);
			if (!__wt_txn_visible_all(session, chunk->txnid_max))
				break;

			/*
			 * If the chunk is already checkpointed, make sure it
			 * is also evicted.  Either way, there is no point
			 * trying to checkpoint it again.
			 */
			if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK)) {
				if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_EVICTED))
					continue;

				if ((ret = __lsm_discard_handle(
				    session, chunk->uri, NULL)) == 0)
					F_SET_ATOMIC(
					    chunk, WT_LSM_CHUNK_EVICTED);
				else if (ret == EBUSY)
					ret = 0;
				else
					WT_ERR_MSG(session, ret,
					    "discard handle");
				continue;
			}

			WT_VERBOSE_ERR(session, lsm,
			     "LSM worker flushing %u", i);

			/*
			 * Flush the file before checkpointing: this is the
			 * expensive part in terms of I/O: do it without
			 * holding the schema lock.
			 *
			 * Use the special eviction isolation level to avoid
			 * interfering with an application checkpoint: we have
			 * already checked that all of the updates in this
			 * chunk are globally visible.
			 *
			 * !!! We can wait here for checkpoints and fsyncs to
			 * complete, which can be a long time.
			 *
			 * Don't keep waiting for the lock if application
			 * threads are waiting for a switch.  Don't skip
			 * flushing the leaves either: that just means we'll
			 * hold the schema lock for (much) longer, which blocks
			 * the world.
			 */
			WT_ERR(__wt_session_get_btree(
			    session, chunk->uri, NULL, NULL, 0));
			for (locked = 0;
			    !locked && ret == 0 &&
			    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);) {
				if ((ret = __wt_spin_trylock(session,
				    &S2C(session)->checkpoint_lock)) == 0)
					locked = 1;
				else if (ret == EBUSY) {
					__wt_yield();
					ret = 0;
				}
			}
			if (locked) {
				saved_isolation = session->txn.isolation;
				session->txn.isolation = TXN_ISO_EVICTION;
				ret = __wt_bt_cache_op(
				    session, NULL, WT_SYNC_WRITE_LEAVES);
				session->txn.isolation = saved_isolation;
				__wt_spin_unlock(
				    session, &S2C(session)->checkpoint_lock);
			}
			WT_TRET(__wt_session_release_btree(session));
			WT_ERR(ret);

			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				break;

			WT_VERBOSE_ERR(session, lsm,
			     "LSM worker checkpointing %u", i);

			WT_WITH_SCHEMA_LOCK(session,
			    ret = __wt_schema_worker(session, chunk->uri,
			    __wt_checkpoint, NULL, NULL, 0));

			if (ret != 0) {
				__wt_err(session, ret, "LSM checkpoint");
				break;
			}

			WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
			/*
			 * Clear the "cache resident" flag so the primary can
			 * be evicted and eventually closed.  Only do this once
			 * the checkpoint has succeeded: otherwise, accessing
			 * the leaf page during the checkpoint can trigger
			 * forced eviction.
			 */
			WT_ERR(__wt_session_get_btree(
			    session, chunk->uri, NULL, NULL, 0));
			__wt_btree_evictable(session, 1);
			WT_ERR(__wt_session_release_btree(session));

			++j;
			WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
			F_SET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK);
			ret = __wt_lsm_meta_write(session, lsm_tree);
			++lsm_tree->dsk_gen;

			/* Update the throttle time. */
			__wt_lsm_tree_throttle(session, lsm_tree);
			WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

			/* Make sure we aren't pinning a transaction ID. */
			__wt_txn_release_snapshot(session);

			if (ret != 0) {
				__wt_err(session, ret,
				    "LSM checkpoint metadata write");
				break;
			}

			WT_VERBOSE_ERR(session, lsm,
			     "LSM worker checkpointed %u", i);
		}
		__lsm_unpin_chunks(session, &cookie);
		if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
		    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
			WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
			    session, lsm_tree->work_cond, 100000));
	}
err:	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);
	/*
	 * The thread will only exit with failure if we run out of memory or
	 * there is some other system driven failure. We can't keep going
	 * after such a failure - ensure WiredTiger shuts down.
	 */
	if (ret != 0 && ret != WT_NOTFOUND)
		WT_PANIC_ERR(session, ret,
		    "Shutting down LSM checkpoint utility thread");
	return (NULL);
}
Example #12
/*
 * __wt_txn_recover --
 *	Run recovery.
 */
int
__wt_txn_recover(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *metac;
	WT_DECL_RET;
	WT_RECOVERY r;
	struct WT_RECOVERY_FILE *metafile;
	char *config;
	int needs_rec, was_backup;

	conn = S2C(session);
	WT_CLEAR(r);
	WT_INIT_LSN(&r.ckpt_lsn);
	was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0;

	/* We need a real session for recovery. */
	WT_RET(__wt_open_session(conn, NULL, NULL, &session));
	F_SET(session, WT_SESSION_NO_LOGGING);
	r.session = session;

	WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
	WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
	WT_ERR(__wt_metadata_cursor(session, NULL, &metac));
	metafile = &r.files[WT_METAFILE_ID];
	metafile->c = metac;

	/*
	 * If no log was found (including if logging is disabled), or if the
	 * last checkpoint was done with logging disabled, recovery should not
	 * run.  Scan the metadata to figure out the largest file ID.
	 */
	if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) ||
	    WT_IS_MAX_LSN(&metafile->ckpt_lsn)) {
		WT_ERR(__recovery_file_scan(&r));
		conn->next_file_id = r.max_fileid;
		goto done;
	}

	/*
	 * First, do a pass through the log to recover the metadata, and
	 * establish the last checkpoint LSN.  Skip this when opening a hot
	 * backup: we already have the correct metadata in that case.
	 */
	if (!was_backup) {
		r.metadata_only = 1;
		if (WT_IS_INIT_LSN(&metafile->ckpt_lsn))
			WT_ERR(__wt_log_scan(session,
			    NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r));
		else {
			/*
			 * Start at the last checkpoint LSN referenced in the
			 * metadata.  If we see the end of a checkpoint while
			 * scanning, we will change the full scan to start from
			 * there.
			 */
			r.ckpt_lsn = metafile->ckpt_lsn;
			WT_ERR(__wt_log_scan(session,
			    &metafile->ckpt_lsn, 0, __txn_log_recover, &r));
		}
	}

	/* Scan the metadata to find the live files and their IDs. */
	WT_ERR(__recovery_file_scan(&r));

	/*
	 * We no longer need the metadata cursor: close it to avoid pinning any
	 * resources that could block eviction during recovery.
	 */
	r.files[0].c = NULL;
	WT_ERR(metac->close(metac));

	/*
	 * Now, recover all the files apart from the metadata.
	 * Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
	 */
	r.metadata_only = 0;
	WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY,
	    "Main recovery loop: starting at %u/%" PRIuMAX,
	    r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset));
	WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec));
	/*
	 * Check if the database was shut down cleanly.  If not
	 * return an error if the user does not want automatic
	 * recovery.
	 */
	if (needs_rec && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR))
		WT_ERR(WT_RUN_RECOVERY);
	/*
	 * Always run recovery even if it was a clean shutdown.
	 * We can consider skipping it in the future.
	 */
	if (WT_IS_INIT_LSN(&r.ckpt_lsn))
		WT_ERR(__wt_log_scan(session, NULL,
		    WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
		    __txn_log_recover, &r));
	else
		WT_ERR(__wt_log_scan(session, &r.ckpt_lsn,
		    WT_LOGSCAN_RECOVER, __txn_log_recover, &r));

	conn->next_file_id = r.max_fileid;

	/*
	 * If recovery ran successfully forcibly log a checkpoint so the next
	 * open is fast and keep the metadata up to date with the checkpoint
	 * LSN and archiving.
	 */
	WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));

done:
err:	WT_TRET(__recovery_free(&r));
	__wt_free(session, config);
	WT_TRET(session->iface.close(&session->iface, NULL));

	return (ret);
}
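
A hedged sketch of how recovery is triggered from the application side: it runs inside wiredtiger_open when logging is enabled, and log=(recover=error) turns the needs_rec check above into a WT_RUN_RECOVERY error instead of recovering automatically. The home path is hypothetical.

WT_CONNECTION *conn;
int ret;

ret = wiredtiger_open("/path/to/home",
    NULL, "create,log=(enabled=true,recover=error)", &conn);
if (ret == WT_RUN_RECOVERY)
    /* Re-open with the default recover=on to run recovery. */
    ret = wiredtiger_open("/path/to/home",
        NULL, "create,log=(enabled=true,recover=on)", &conn);
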
Example #13
/*
 * __txn_op_apply --
 *	Apply a transactional operation during recovery.
 */
static int
__txn_op_apply(
    WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
{
	WT_CURSOR *cursor, *start, *stop;
	WT_DECL_RET;
	WT_ITEM key, start_key, stop_key, value;
	WT_SESSION_IMPL *session;
	uint64_t recno, start_recno, stop_recno;
	uint32_t fileid, mode, optype, opsize;

	session = r->session;
	cursor = NULL;

	/* Peek at the size and the type. */
	WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize));
	end = *pp + opsize;

	switch (optype) {
	case WT_LOGOP_COL_PUT:
		WT_ERR(__wt_logop_col_put_unpack(session, pp, end,
		    &fileid, &recno, &value));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		cursor->set_key(cursor, recno);
		__wt_cursor_set_raw_value(cursor, &value);
		WT_ERR(cursor->insert(cursor));
		break;

	case WT_LOGOP_COL_REMOVE:
		WT_ERR(__wt_logop_col_remove_unpack(session, pp, end,
		    &fileid, &recno));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		cursor->set_key(cursor, recno);
		WT_ERR(cursor->remove(cursor));
		break;

	case WT_LOGOP_COL_TRUNCATE:
		WT_ERR(__wt_logop_col_truncate_unpack(session, pp, end,
		    &fileid, &start_recno, &stop_recno));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);

		/* Set up the cursors. */
		if (start_recno == 0) {
			start = NULL;
			stop = cursor;
		} else if (stop_recno == 0) {
			start = cursor;
			stop = NULL;
		} else {
			start = cursor;
			WT_ERR(__recovery_cursor(
			    session, r, lsnp, fileid, 1, &stop));
		}

		/* Set the keys. */
		if (start != NULL)
			start->set_key(start, start_recno);
		if (stop != NULL)
			stop->set_key(stop, stop_recno);

		WT_TRET(session->iface.truncate(&session->iface, NULL,
		    start, stop, NULL));
		/* If we opened a duplicate cursor, close it now. */
		if (stop != NULL && stop != cursor)
			WT_TRET(stop->close(stop));
		WT_ERR(ret);
		break;

	case WT_LOGOP_ROW_PUT:
		WT_ERR(__wt_logop_row_put_unpack(session, pp, end,
		    &fileid, &key, &value));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		__wt_cursor_set_raw_key(cursor, &key);
		__wt_cursor_set_raw_value(cursor, &value);
		WT_ERR(cursor->insert(cursor));
		break;

	case WT_LOGOP_ROW_REMOVE:
		WT_ERR(__wt_logop_row_remove_unpack(session, pp, end,
		    &fileid, &key));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		__wt_cursor_set_raw_key(cursor, &key);
		WT_ERR(cursor->remove(cursor));
		break;

	case WT_LOGOP_ROW_TRUNCATE:
		WT_ERR(__wt_logop_row_truncate_unpack(session, pp, end,
		    &fileid, &start_key, &stop_key, &mode));
		GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
		/* Set up the cursors. */
		start = stop = NULL;
		switch (mode) {
		case TXN_TRUNC_ALL:
			/* Both cursors stay NULL. */
			break;
		case TXN_TRUNC_BOTH:
			start = cursor;
			WT_ERR(__recovery_cursor(
			    session, r, lsnp, fileid, 1, &stop));
			break;
		case TXN_TRUNC_START:
			start = cursor;
			break;
		case TXN_TRUNC_STOP:
			stop = cursor;
			break;

		WT_ILLEGAL_VALUE_ERR(session);
		}

		/* Set the keys. */
		if (start != NULL)
			__wt_cursor_set_raw_key(start, &start_key);
		if (stop != NULL)
			__wt_cursor_set_raw_key(stop, &stop_key);

		WT_TRET(session->iface.truncate(&session->iface, NULL,
		    start, stop, NULL));
		/* If we opened a duplicate cursor, close it now. */
		if (stop != NULL && stop != cursor)
			WT_TRET(stop->close(stop));
		WT_ERR(ret);
		break;

	WT_ILLEGAL_VALUE_ERR(session);
	}

	/* Reset the cursor so it doesn't block eviction. */
	if (cursor != NULL)
		WT_ERR(cursor->reset(cursor));

	r->modified = 1;

err:	if (ret != 0)
		__wt_err(session, ret, "Operation failed during recovery");
	return (ret);
}
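
The truncate cases above replay WT_SESSION::truncate calls. A minimal public-API sketch of a range truncate with two cursors, assuming an open session; the URI and keys are hypothetical and error handling is omitted.

WT_CURSOR *start, *stop;

session->open_cursor(session, "table:example", NULL, NULL, &start);
start->set_key(start, "apple");
session->open_cursor(session, "table:example", NULL, NULL, &stop);
stop->set_key(stop, "pear");

/* Remove every record with a key in the range [apple, pear]. */
session->truncate(session, NULL, start, stop, NULL);
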
Example #14
/*
 * __drop_table --
 *	WT_SESSION::drop for a table.
 */
static int
__drop_table(
    WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
{
	WT_COLGROUP *colgroup;
	WT_DECL_RET;
	WT_INDEX *idx;
	WT_TABLE *table;
	u_int i;
	const char *name;
	bool tracked;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE));

	name = uri;
	WT_PREFIX_SKIP_REQUIRED(session, name, "table:");

	table = NULL;
	tracked = false;

	/*
	 * Open the table so we can drop its column groups and indexes.
	 *
	 * Ideally we would keep the table locked exclusive across the drop,
	 * but for now we rely on the global table lock to prevent the table
	 * being reopened while it is being dropped.  One issue is that the
	 * WT_WITHOUT_LOCKS macro can drop and reacquire the global table lock,
	 * avoiding deadlocks while waiting for LSM operation to quiesce.
	 *
	 * Temporarily getting the table exclusive here ensures that any
	 * cursors already open on the table are closed before this call
	 * proceeds.
	 */
	WT_ERR(__wt_schema_get_table_uri(session, uri, true,
	    WT_DHANDLE_EXCLUSIVE, &table));
	WT_ERR(__wt_schema_release_table(session, table));
	WT_ERR(__wt_schema_get_table_uri(session, uri, true, 0, &table));

	/* Drop the column groups. */
	for (i = 0; i < WT_COLGROUPS(table); i++) {
		if ((colgroup = table->cgroups[i]) == NULL)
			continue;
		/*
		 * Drop the column group before updating the metadata to avoid
		 * the metadata for the table becoming inconsistent if we can't
		 * get exclusive access.
		 */
		WT_ERR(__wt_schema_drop(session, colgroup->source, cfg));
		WT_ERR(__wt_metadata_remove(session, colgroup->name));
	}

	/* Drop the indices. */
	WT_ERR(__wt_schema_open_indices(session, table));
	for (i = 0; i < table->nindices; i++) {
		if ((idx = table->indices[i]) == NULL)
			continue;
		/*
		 * Drop the index before updating the metadata to avoid
		 * the metadata for the table becoming inconsistent if we can't
		 * get exclusive access.
		 */
		WT_ERR(__wt_schema_drop(session, idx->source, cfg));
		WT_ERR(__wt_metadata_remove(session, idx->name));
	}

	/* Make sure the table data handle is closed. */
	WT_TRET(__wt_schema_release_table(session, table));
	WT_ERR(__wt_schema_get_table_uri(
	    session, uri, true, WT_DHANDLE_EXCLUSIVE, &table));
	F_SET(&table->iface, WT_DHANDLE_DISCARD);
	if (WT_META_TRACKING(session)) {
		WT_WITH_DHANDLE(session, &table->iface,
		    ret = __wt_meta_track_handle_lock(session, false));
		WT_ERR(ret);
		tracked = true;
	}

	/* Remove the metadata entry (ignore missing items). */
	WT_ERR(__wt_metadata_remove(session, uri));

err:	if (table != NULL && !tracked)
		WT_TRET(__wt_schema_release_table(session, table));
	return (ret);
}
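
A minimal public-API sketch, assuming an open session and a hypothetical table name: this function implements WT_SESSION::drop for 'table:' URIs.

session->drop(session, "table:books", NULL);

/* With force=true, the call succeeds even if the object does not exist. */
session->drop(session, "table:books", "force=true");
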
Example #15
/*
 * __page_read --
 *	Read a page from the file.
 */
static int
__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
    const WT_PAGE_HEADER *dsk;
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_ITEM tmp;
    WT_PAGE *page;
    size_t addr_size;
    uint32_t previous_state;
    const uint8_t *addr;

    btree = S2BT(session);
    page = NULL;

    /*
     * Don't pass an allocated buffer to the underlying block read function;
     * force allocation of new memory of the appropriate size.
     */
    WT_CLEAR(tmp);

    /*
     * Attempt to set the state to WT_REF_READING for normal reads, or
     * WT_REF_LOCKED for deleted pages.  If successful, we've won the
     * race: read the page.
     */
    if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING))
        previous_state = WT_REF_DISK;
    else if (__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
        previous_state = WT_REF_DELETED;
    else
        return (0);

    /*
     * Get the address: if there is no address, the page was deleted, but a
     * subsequent search or insert is forcing re-creation of the name space.
     */
    WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
    if (addr == NULL) {
        WT_ASSERT(session, previous_state == WT_REF_DELETED);

        WT_ERR(__wt_btree_new_leaf_page(session, &page));
        ref->page = page;
        goto done;
    }

    /*
     * There's an address, read or map the backing disk page and build an
     * in-memory version of the page.
     */
    WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
    WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
                           WT_DATA_IN_ITEM(&tmp) ?
                           WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

    /*
     * Clear the local reference to an allocated copy of the disk image on
     * return; the page steals it, errors in this code should not free it.
     */
    tmp.mem = NULL;

    /*
     * If reading for a checkpoint, there's no additional work to do, the
     * page on disk is correct as written.
     */
    if (session->dhandle->checkpoint != NULL)
        goto done;

    /* If the page was deleted, instantiate that information. */
    if (previous_state == WT_REF_DELETED)
        WT_ERR(__wt_delete_page_instantiate(session, ref));

    /*
     * Instantiate updates from the database's lookaside table. The page
     * flag was set when the page was written, potentially a long time ago.
     * We only care if the lookaside table is currently active, check that
     * before doing any work.
     */
    dsk = tmp.data;
    if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) {
        WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside);
        WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside);

        WT_ERR(__las_page_instantiate(
                   session, ref, btree->id, addr, addr_size));
    }

done:
    WT_PUBLISH(ref->state, WT_REF_MEM);
    return (0);

err:
    /*
     * If the function building an in-memory version of the page failed,
     * it discarded the page, but not the disk image.  Discard the page
     * and separately discard the disk image in all cases.
     */
    if (ref->page != NULL)
        __wt_ref_out(session, ref);
    WT_PUBLISH(ref->state, previous_state);

    __wt_buf_free(session, &tmp);

    return (ret);
}
Example #16
/*
 * __wt_lsm_merge_worker --
 *	The merge worker thread for an LSM tree, responsible for merging
 *	on-disk trees.
 */
void *
__wt_lsm_merge_worker(void *vargs)
{
	WT_DECL_RET;
	WT_LSM_WORKER_ARGS *args;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	u_int aggressive, chunk_wait, id, old_aggressive, stallms;
	int progress;

	args = vargs;
	lsm_tree = args->lsm_tree;
	id = args->id;
	session = lsm_tree->worker_sessions[id];
	__wt_free(session, args);

	aggressive = stallms = 0;

	while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
		/*
		 * Help out with switching chunks in case the checkpoint worker
		 * is busy.
		 */
		if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
			WT_WITH_SCHEMA_LOCK(session, ret =
			    __wt_lsm_tree_switch(session, lsm_tree));
			WT_ERR(ret);
		}

		progress = 0;

		/* Clear any state from previous worker thread iterations. */
		session->dhandle = NULL;

		/* Try to create a Bloom filter. */
		if (__lsm_bloom_work(session, lsm_tree) == 0)
			progress = 1;

		/* If we didn't create a Bloom filter, try to merge. */
		if (progress == 0 &&
		    __wt_lsm_merge(session, lsm_tree, id, aggressive) == 0)
			progress = 1;

		/* Clear any state from previous worker thread iterations. */
		WT_CLEAR_BTREE_IN_SESSION(session);

		/*
		 * Only have one thread freeing old chunks, and only if there
		 * are chunks to free.
		 */
		if (id == 0 && lsm_tree->nold_chunks > 0 &&
		    __lsm_free_chunks(session, lsm_tree) == 0)
			progress = 1;

		if (progress)
			stallms = 0;
		else if (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
		    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
			/* Poll 10 times per second. */
			WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
			    session, lsm_tree->work_cond, 100000));
			stallms += 100;

			/*
			 * Get aggressive if more than enough chunks for a
			 * merge should have been created while we waited.
			 * Use 10 seconds as a default if we don't have an
			 * estimate.
			 */
			chunk_wait = stallms / (lsm_tree->chunk_fill_ms == 0 ?
			    10000 : lsm_tree->chunk_fill_ms);
			old_aggressive = aggressive;
			aggressive = chunk_wait / lsm_tree->merge_min;

			if (aggressive > old_aggressive)
				WT_VERBOSE_ERR(session, lsm,
				     "LSM merge got aggressive (%u), "
				     "%u / %" PRIu64,
				     aggressive, stallms,
				     lsm_tree->chunk_fill_ms);
		}
	}

	if (0) {
err:		__wt_err(session, ret, "LSM merge worker failed");
	}

	return (NULL);
}
Example #17
/*
 * __wt_open --
 *	Open a file handle.
 */
int
__wt_open(WT_SESSION_IMPL *session,
          const char *name, uint32_t file_type, uint32_t flags, WT_FH **fhp)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *fh;
    bool lock_file, open_called;
    char *path;

    WT_ASSERT(session, file_type != 0);	/* A file type is required. */

    conn = S2C(session);
    fh = NULL;
    open_called = false;
    path = NULL;

    WT_RET(__open_verbose(session, name, file_type, flags));

    /* Check if the handle is already open. */
    if (__wt_handle_search(session, name, true, NULL, &fh)) {
        /*
         * XXX
         * The in-memory implementation has to reset the file offset
         * when a file is re-opened (which obviously also depends on
         * in-memory configurations never opening a file in more than
         * one thread at a time). This needs to be fixed.
         */
        if (F_ISSET(fh, WT_FH_IN_MEMORY) && fh->ref == 1)
            fh->off = 0;
        *fhp = fh;
        return (0);
    }

    /* Allocate a structure and set the name. */
    WT_ERR(__wt_calloc_one(session, &fh));
    WT_ERR(__wt_strdup(session, name, &fh->name));

    /*
     * If this is a read-only connection, open all files read-only except
     * the lock file.
     *
     * The only file created in read-only mode is the lock file.
     */
    if (F_ISSET(conn, WT_CONN_READONLY)) {
        lock_file = strcmp(name, WT_SINGLETHREAD) == 0;
        if (!lock_file)
            LF_SET(WT_OPEN_READONLY);
        WT_ASSERT(session, lock_file || !LF_ISSET(WT_OPEN_CREATE));
    }

    /* Create the path to the file. */
    if (!LF_ISSET(WT_OPEN_FIXED))
        WT_ERR(__wt_filename(session, name, &path));

    /* Call the underlying open function. */
    WT_ERR(conn->handle_open(
               session, fh, path == NULL ? name : path, file_type, flags));
    open_called = true;

    /*
     * Repeat the check for a match: if there's no match, link our newly
     * created handle onto the database's list of files.
     */
    if (__wt_handle_search(session, name, true, fh, fhp)) {
err:
        if (open_called)
            WT_TRET(fh->fh_close(session, fh));
        if (fh != NULL) {
            __wt_free(session, fh->name);
            __wt_free(session, fh);
        }
    }

    __wt_free(session, path);
    return (ret);
}
Example #18
/*
 * __snapshot_process --
 *	Process the list of snapshots.
 */
static int
__snapshot_process(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase)
{
	WT_BLOCK_SNAPSHOT *a, *b, *si;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_SNAPSHOT *snap;
	uint64_t snapshot_size;
	int deleting, locked;

	si = &block->live;
	locked = 0;

	/*
	 * We've allocated our last page, update the snapshot size.  We need to
	 * calculate the live system's snapshot size before reading and merging
	 * snapshot allocation and discard information from the snapshots we're
	 * deleting, those operations will change the underlying byte counts.
	 */
	snapshot_size = si->snapshot_size;
	snapshot_size += si->alloc.bytes;
	snapshot_size -= si->discard.bytes;

	/*
	 * Extents that become newly available as a result of deleting previous
	 * snapshots are added to a list of extents.  The list should be empty,
	 * but there's no explicit "free the snapshot information" call into the
	 * block manager; if there was an error in an upper level resulting in
	 * the snapshot never being "resolved", the list might not be empty.
	 *
	 * XXX
	 * This isn't sufficient, actually: we're going to leak all the blocks
	 * that were written as part of the last snapshot because it was never
	 * resolved.
	 */
	__wt_block_extlist_free(session, &si->snapshot_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &si->snapshot_avail, "live", "snapshot_avail"));

	/*
	 * To delete a snapshot, we'll need snapshot information for it, and we
	 * have to read that from the disk.
	 */
	deleting = 0;
	WT_SNAPSHOT_FOREACH(snapbase, snap) {
		/*
		 * To delete a snapshot, we'll need snapshot information for it
		 * and the subsequent snapshot.  The test is tricky, we have to
		 * load the current snapshot's information if it's marked for
		 * deletion, or if it follows a snapshot marked for deletion,
		 * where the boundary cases are the first snapshot in the list
		 * and the last snapshot in the list: if we're deleting the last
		 * snapshot in the list, there's no next snapshot, the snapshot
		 * will be merged into the live tree.
		 */
		if (!F_ISSET(snap, WT_SNAP_DELETE) &&
		    (snap == snapbase ||
		    F_ISSET(snap, WT_SNAP_ADD) ||
		    !F_ISSET(snap - 1, WT_SNAP_DELETE)))
			continue;
		deleting = 1;

		/*
		 * Allocate a snapshot structure, crack the cookie and read the
		 * snapshot's extent lists.
		 *
		 * Ignore the avail list: snapshot avail lists are only useful
		 * if we are rolling forward from the particular snapshot and
		 * they represent our best understanding of what blocks can be
		 * allocated.  If we are not operating on the live snapshot,
		 * subsequent snapshots might have allocated those blocks, and
		 * the avail list is useless.  We don't discard it, because it
		 * is useful as part of verification, but we don't re-write it
		 * either.
		 */
		WT_ERR(__wt_calloc(
		    session, 1, sizeof(WT_BLOCK_SNAPSHOT), &snap->bpriv));
		si = snap->bpriv;
		WT_ERR(__wt_block_snap_init(session, block, si, snap->name, 0));
		WT_ERR(__wt_block_buffer_to_snapshot(
		    session, block, snap->raw.data, si));
		WT_ERR(__wt_block_extlist_read(session, block, &si->alloc));
		WT_ERR(__wt_block_extlist_read(session, block, &si->discard));
	}

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if snapshots take too
	 * much time away from real work: we read historic snapshot information
	 * without a lock, but we could also merge and re-write the delete
	 * snapshot information without a lock, except for ranges merged into
	 * the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = 1;

	/* Skip the additional processing if we aren't deleting snapshots. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed snapshots: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap) {
		if (!F_ISSET(snap, WT_SNAP_DELETE))
			continue;

		if (WT_VERBOSE_ISSET(session, snapshot)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__snapshot_string(
			    session, block, snap->raw.data, tmp));
			WT_VERBOSE_ERR(session, snapshot,
			    "%s: delete-snapshot: %s: %s",
			    block->name, snap->name, (char *)tmp->data);
		}

		/*
		 * Set the from/to snapshot structures, where the "to" value
		 * may be the live tree.
		 */
		a = snap->bpriv;
		if (F_ISSET(snap + 1, WT_SNAP_ADD))
			b = &block->live;
		else
			b = (snap + 1)->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the snapshot's discard list, however, not the live system's
		 * list because it appears on the snapshot's alloc list and so
		 * must be paired in the snapshot.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" snapshot's extent
		 * lists directly to the live system's avail list, they were
		 * never on any alloc list.   Include the "from" snapshot's
		 * avail list, it's going away.
		 */
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * snapshot's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->discard, &b->discard));

		/*
		 * If the "to" snapshot is also being deleted, we're done with
		 * it, it's merged into some other snapshot in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * snapshots, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(snap + 1, WT_SNAP_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" snapshot's allocate
		 * and discard lists overlap is fair game, move ranges appearing
		 * on both lists to the live snapshot's newly available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(snap + 1, WT_SNAP_ADD))
			continue;

		/*
		 * We have to write the "to" snapshot's extent lists out in new
		 * blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" snapshot's extent lists
		 * directly to the live system's avail list, they were never on
		 * any alloc list.  Do not include the "to" snapshot's avail
		 * list, it's not changing.
		 */
		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->discard));

		F_SET(snap + 1, WT_SNAP_UPDATE);
	}

	/* Update snapshots marked for update. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (F_ISSET(snap, WT_SNAP_UPDATE)) {
			WT_ASSERT(session, !F_ISSET(snap, WT_SNAP_ADD));
			WT_ERR(__snapshot_update(
			    session, block, snap, snap->bpriv, 0, 0));
		}

live_update:
	si = &block->live;

	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &si->avail));

	/* Update the final, added snapshot based on the live system. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (F_ISSET(snap, WT_SNAP_ADD)) {
			WT_ERR(__snapshot_update(
			    session, block, snap, si, snapshot_size, 1));

			/*
			 * XXX
			 * Our caller wants two pieces of information: the time
			 * the snapshot was taken and the final snapshot size.
			 * This violates layering but the alternative is a call
			 * for the btree layer to crack the snapshot cookie into
			 * its components, and that's a fair amount of work.
			 * (We could just read the system time in the session
			 * layer when updating the metadata file, but that won't
			 * work for the snapshot size, and so we do both here.)
			 */
			snap->snapshot_size = si->snapshot_size;
			WT_ERR(__wt_epoch(session, &snap->sec, NULL));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.
	 */
	__wt_block_extlist_free(session, &si->alloc);
	WT_ERR(__wt_block_extlist_init(session, &si->alloc, "live", "alloc"));
	__wt_block_extlist_free(session, &si->discard);
	WT_ERR(
	    __wt_block_extlist_init(session, &si->discard, "live", "discard"));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first snapshot in the system should always have an empty discard
	 * list.  If we've read that snapshot and/or created it, check.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (!F_ISSET(snap, WT_SNAP_DELETE))
			break;
	if ((a = snap->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0) {
		__wt_errx(session,
		    "snapshot incorrectly has blocks on the discard list");
		WT_ERR(WT_ERROR);
	}
#endif

err:	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/* Discard any snapshot information we loaded, we no longer need it. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if ((si = snap->bpriv) != NULL) {
			__wt_block_extlist_free(session, &si->alloc);
			__wt_block_extlist_free(session, &si->avail);
			__wt_block_extlist_free(session, &si->discard);
		}

	__wt_scr_free(&tmp);
	return (ret);
}
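The example above relies throughout on a single-exit error-handling idiom: every call is checked, failure jumps to the err label, and the cleanup path drops the lock and frees any loaded snapshot information whether or not an error occurred. Below is a minimal, self-contained sketch of that pattern; the CHECK() macro and might_fail() helper are invented stand-ins for illustration and are not WiredTiger's WT_ERR()/WT_RET() macros.

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for WT_ERR(): on failure, jump to the cleanup label. */
#define CHECK(call) do {						\
	if ((ret = (call)) != 0)					\
		goto err;						\
} while (0)

static int
might_fail(int fail)
{
	return (fail ? -1 : 0);
}

int
main(void)
{
	char *buf;
	int ret = 0;

	if ((buf = malloc(64)) == NULL)
		return (1);

	CHECK(might_fail(0));		/* Succeeds, execution continues. */
	CHECK(might_fail(1));		/* Fails, jumps to the cleanup label. */

	printf("not reached\n");

err:	/* A single cleanup path runs on both success and failure. */
	free(buf);
	return (ret == 0 ? 0 : 1);
}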
Example #19
/*
 * __create_colgroup --
 *	Create a column group.
 */
static int
__create_colgroup(WT_SESSION_IMPL *session,
    const char *name, int exclusive, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	WT_ITEM confbuf, fmt, namebuf;
	WT_TABLE *table;
	size_t tlen;
	const char **cfgp, *cfg[4] =
	    { WT_CONFIG_BASE(session, colgroup_meta), config, NULL, NULL };
	const char *sourcecfg[] = { config, NULL, NULL };
	const char *cgname, *source, *sourceconf, *tablename;
	char *cgconf, *oldconf;

	sourceconf = NULL;
	cgconf = oldconf = NULL;
	WT_CLEAR(fmt);
	WT_CLEAR(confbuf);
	WT_CLEAR(namebuf);

	tablename = name;
	if (!WT_PREFIX_SKIP(tablename, "colgroup:"))
		return (EINVAL);
	cgname = strchr(tablename, ':');
	if (cgname != NULL) {
		tlen = (size_t)(cgname - tablename);
		++cgname;
	} else
		tlen = strlen(tablename);

	if ((ret =
	    __wt_schema_get_table(session, tablename, tlen, 1, &table)) != 0)
		WT_RET_MSG(session, (ret == WT_NOTFOUND) ? ENOENT : ret,
		    "Can't create '%s' for non-existent table '%.*s'",
		    name, (int)tlen, tablename);

	/* Make sure the column group is referenced from the table. */
	if (cgname != NULL && (ret =
	    __wt_config_subgets(session, &table->cgconf, cgname, &cval)) != 0)
		WT_ERR_MSG(session, EINVAL,
		    "Column group '%s' not found in table '%.*s'",
		    cgname, (int)tlen, tablename);

	/* Find the first NULL entry in the cfg stack. */
	for (cfgp = &cfg[1]; *cfgp; cfgp++)
		;

	/* Add the source to the colgroup config before collapsing. */
	if (__wt_config_getones(
	    session, config, "source", &cval) == 0 && cval.len != 0) {
		WT_ERR(__wt_buf_fmt(
		    session, &namebuf, "%.*s", (int)cval.len, cval.str));
		source = namebuf.data;
	} else {
		WT_ERR(__wt_schema_colgroup_source(
		    session, table, cgname, config, &namebuf));
		source = namebuf.data;
		WT_ERR(__wt_buf_fmt(
		    session, &confbuf, "source=\"%s\"", source));
		*cfgp++ = confbuf.data;
	}

	/* Calculate the key/value formats: these go into the source config. */
	WT_ERR(__wt_buf_fmt(session, &fmt, "key_format=%s", table->key_format));
	if (cgname == NULL)
		WT_ERR(__wt_buf_catfmt
		    (session, &fmt, ",value_format=%s", table->value_format));
	else {
		if (__wt_config_getones(session, config, "columns", &cval) != 0)
			WT_ERR_MSG(session, EINVAL,
			    "No 'columns' configuration for '%s'", name);
		WT_ERR(__wt_buf_catfmt(session, &fmt, ",value_format="));
		WT_ERR(__wt_struct_reformat(session,
		    table, cval.str, cval.len, NULL, 1, &fmt));
	}
	sourcecfg[1] = fmt.data;
	WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf));

	WT_ERR(__wt_schema_create(session, source, sourceconf));

	WT_ERR(__wt_config_collapse(session, cfg, &cgconf));
	if ((ret = __wt_metadata_insert(session, name, cgconf)) != 0) {
		/*
		 * If the entry already exists in the metadata, we're done.
		 * This is an error for exclusive creates but okay otherwise.
		 */
		if (ret == WT_DUPLICATE_KEY)
			ret = exclusive ? EEXIST : 0;
		goto err;
	}

	WT_ERR(__wt_schema_open_colgroups(session, table));

err:	__wt_free(session, cgconf);
	__wt_free(session, sourceconf);
	__wt_free(session, oldconf);
	__wt_buf_free(session, &confbuf);
	__wt_buf_free(session, &fmt);
	__wt_buf_free(session, &namebuf);

	__wt_schema_release_table(session, table);
	return (ret);
}
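__create_colgroup treats its configuration as a NULL-terminated stack of strings: it scans for the first empty slot, appends a derived "source" setting, and later collapses the stack so that later entries override earlier ones. A minimal sketch of that pointer-array pattern, with made-up configuration strings, looks like this:

#include <stdio.h>

int
main(void)
{
	/* NULL-terminated configuration stack with room for one extra entry. */
	const char *cfg[4] = { "base_defaults", "user_config", NULL, NULL };
	const char **cfgp;

	/* Find the first NULL slot, as the colgroup code does. */
	for (cfgp = &cfg[1]; *cfgp != NULL; ++cfgp)
		;

	/* Append a derived setting; entries later in the stack win. */
	*cfgp++ = "source=\"file:example.wt\"";

	for (cfgp = cfg; *cfgp != NULL; ++cfgp)
		printf("%s\n", *cfgp);
	return (0);
}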
Example #20
/*
 * __snapshot_update --
 *	Update a snapshot.
 */
static int
__snapshot_update(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snap,
    WT_BLOCK_SNAPSHOT *si, uint64_t snapshot_size, int is_live)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint8_t *endp;

#ifdef HAVE_DIAGNOSTIC
	/* Check the extent list combinations for overlaps. */
	WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->avail));
	WT_RET(__wt_block_extlist_check(session, &si->discard, &si->avail));
	WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->discard));
#endif
	/*
	 * Write the snapshot's extent lists; we only write an avail list for
	 * the live system; other snapshots' avail lists are static and never
	 * change.  When we do write the avail list for the live system it's
	 * two lists: the current avail list plus the list of blocks that are
	 * being made available as of the new snapshot.  We can't merge that
	 * second list into the real list yet, it's not truly available until
	 * the new snapshot location has been saved to the metadata.
	 */
	WT_RET(__wt_block_extlist_write(session, block, &si->alloc, NULL));
	if (is_live)
		WT_RET(__wt_block_extlist_write(
		    session, block, &si->avail, &si->snapshot_avail));
	WT_RET(__wt_block_extlist_write(session, block, &si->discard, NULL));

	/*
	 * Set the file size for the live system.
	 *
	 * XXX
	 * We do NOT set the file size when re-writing snapshots because we want
	 * to test the snapshot's blocks against a reasonable maximum file size
	 * during verification.  This is not good: imagine a snapshot appearing
	 * early in the file, re-written, and then the snapshot requires blocks
	 * at the end of the file, blocks after the listed file size.  If the
	 * application opens that snapshot for writing (discarding subsequent
	 * snapshots), we would truncate the file to the early chunk, discarding
	 * the re-written snapshot information.  The alternative, updating the
	 * file size, has its own problems: in that case we'd work correctly,
	 * but we'd lose all of the blocks between the original snapshot and
	 * the re-written snapshot.  Currently, there's no API to roll forward
	 * intermediate snapshots; if there ever is, this will need to be fixed.
	 */
	if (is_live)
		WT_RET(__wt_filesize(session, block->fh, &si->file_size));

	/* Set the snapshot size for the live system. */
	if (is_live)
		si->snapshot_size = snapshot_size;

	/*
	 * Copy the snapshot information into the snapshot array's address
	 * cookie.
	 */
	WT_RET(__wt_buf_init(session, &snap->raw, WT_BTREE_MAX_ADDR_COOKIE));
	endp = snap->raw.mem;
	WT_RET(__wt_block_snapshot_to_buffer(session, block, &endp, si));
	snap->raw.size = WT_PTRDIFF32(endp, snap->raw.mem);

	if (WT_VERBOSE_ISSET(session, snapshot)) {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(__snapshot_string(session, block, snap->raw.data, tmp));
		WT_VERBOSE_ERR(session, snapshot,
		    "%s: create-snapshot: %s: %s",
		    block->name, snap->name, (char *)tmp->data);
	}

err:	__wt_scr_free(&tmp);
	return (ret);
}
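The snapshot cookie is built by packing fields into a raw buffer through a moving end pointer and then recording how far the pointer advanced (the WT_PTRDIFF32() step above). A stripped-down sketch of that pattern, using plain memcpy() and invented byte values in place of the block manager's packing routines:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	uint8_t raw[64], *endp;
	uint32_t size;

	/* Pack a few fields, advancing the end pointer as we go. */
	endp = raw;
	memcpy(endp, "\x01\x02", 2);		/* e.g. version, flags */
	endp += 2;
	memcpy(endp, "\xaa\xbb\xcc", 3);	/* e.g. a packed offset */
	endp += 3;

	/* The cookie size is simply how far the end pointer moved. */
	size = (uint32_t)(endp - raw);
	printf("cookie size: %u bytes\n", size);
	return (0);
}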
Example #21
/*
 * __create_table --
 *	Create a table.
 */
static int
__create_table(WT_SESSION_IMPL *session,
    const char *name, int exclusive, const char *config)
{
	WT_CONFIG conf;
	WT_CONFIG_ITEM cgkey, cgval, cval;
	WT_DECL_RET;
	WT_TABLE *table;
	const char *cfg[4] =
	    { WT_CONFIG_BASE(session, table_meta), config, NULL, NULL };
	const char *tablename;
	char *tableconf, *cgname;
	size_t cgsize;
	int ncolgroups;

	cgname = NULL;
	table = NULL;
	tableconf = NULL;

	tablename = name;
	if (!WT_PREFIX_SKIP(tablename, "table:"))
		return (EINVAL);

	if ((ret = __wt_schema_get_table(session,
	    tablename, strlen(tablename), 0, &table)) == 0) {
		__wt_schema_release_table(session, table);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	WT_ERR(__wt_config_gets(session, cfg, "colgroups", &cval));
	WT_ERR(__wt_config_subinit(session, &conf, &cval));
	for (ncolgroups = 0;
	    (ret = __wt_config_next(&conf, &cgkey, &cgval)) == 0;
	    ncolgroups++)
		;
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_config_collapse(session, cfg, &tableconf));
	if ((ret = __wt_metadata_insert(session, name, tableconf)) != 0) {
		/*
		 * If the entry already exists in the metadata, we're done.
		 * This is an error for exclusive creates but okay otherwise.
		 */
		if (ret == WT_DUPLICATE_KEY)
			ret = exclusive ? EEXIST : 0;
		goto err;
	}

	/* Attempt to open the table now to catch any errors. */
	WT_ERR(__wt_schema_get_table(
	    session, tablename, strlen(tablename), 1, &table));

	if (ncolgroups == 0) {
		cgsize = strlen("colgroup:") + strlen(tablename) + 1;
		WT_ERR(__wt_calloc_def(session, cgsize, &cgname));
		snprintf(cgname, cgsize, "colgroup:%s", tablename);
		WT_ERR(__create_colgroup(session, cgname, exclusive, config));
	}

	if (0) {
err:		if (table != NULL) {
			WT_TRET(__wt_schema_remove_table(session, table));
			table = NULL;
		}
	}
	if (table != NULL)
		__wt_schema_release_table(session, table);
	__wt_free(session, cgname);
	__wt_free(session, tableconf);
	return (ret);
}
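Both __create_table and __create_colgroup begin by stripping a URI prefix ("table:", "colgroup:") and rejecting names that don't carry it. A small sketch of what a WT_PREFIX_SKIP()-style check does; prefix_skip() here is a hypothetical stand-in written for illustration, not the real macro:

#include <stdio.h>
#include <string.h>

/* If the string starts with the prefix, advance past it and return non-zero. */
static int
prefix_skip(const char **namep, const char *prefix)
{
	size_t len;

	len = strlen(prefix);
	if (strncmp(*namep, prefix, len) != 0)
		return (0);
	*namep += len;
	return (1);
}

int
main(void)
{
	const char *name = "table:customers";

	if (!prefix_skip(&name, "table:"))
		return (1);			/* Not a table URI: reject. */
	printf("table name: %s\n", name);	/* Prints "customers". */
	return (0);
}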
Example #22
/*
 * __wt_block_snapshot_load --
 *	Load a snapshot.
 */
int
__wt_block_snapshot_load(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size,
    int readonly)
{
	WT_BLOCK_SNAPSHOT *si;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;

	WT_UNUSED(addr_size);

	/*
	 * Sometimes we don't find a root page (we weren't given a snapshot,
	 * or the referenced snapshot was empty).  In that case we return a
	 * root page size of 0.  Set that up now.
	 */
	dsk->size = 0;

	si = &block->live;
	WT_RET(__wt_block_snap_init(session, block, si, "live", 1));

	if (WT_VERBOSE_ISSET(session, snapshot)) {
		if (addr != NULL) {
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__snapshot_string(session, block, addr, tmp));
		}
		WT_VERBOSE_ERR(session, snapshot,
		    "%s: load-snapshot: %s", block->name,
		    addr == NULL ? "[Empty]" : (char *)tmp->data);
	}

	/* If not loading a snapshot from disk, we're done. */
	if (addr == NULL || addr_size == 0)
		return (0);

	/* Crack the snapshot cookie. */
	if (addr != NULL)
		WT_ERR(__wt_block_buffer_to_snapshot(session, block, addr, si));

	/* Verify sets up next. */
	if (block->verify)
		WT_ERR(__wt_verify_snap_load(session, block, si));

	/* Read, and optionally verify, any root page. */
	if (si->root_offset != WT_BLOCK_INVALID_OFFSET) {
		WT_ERR(__wt_block_read_off(session, block,
		    dsk, si->root_offset, si->root_size, si->root_cksum));
		if (block->verify) {
			if (tmp == NULL) {
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
				WT_ERR(__snapshot_string(
				    session, block, addr, tmp));
			}
			WT_ERR(
			    __wt_verify_dsk(session, (char *)tmp->data, dsk));
		}
	}

	/*
	 * Rolling a snapshot forward requires the avail list, the blocks from
	 * which we can allocate.
	 */
	if (!readonly)
		WT_ERR(__wt_block_extlist_read(session, block, &si->avail));

	/*
	 * If the snapshot can be written, that means anything written after
	 * the snapshot is no longer interesting.  Truncate the file.
	 */
	if (!readonly) {
		WT_VERBOSE_ERR(session, snapshot,
		    "truncate file to %" PRIuMAX, (uintmax_t)si->file_size);
		WT_ERR(__wt_ftruncate(session, block->fh, si->file_size));
	}

	if (0) {
err:		(void)__wt_block_snapshot_unload(session, block);
	}

	__wt_scr_free(&tmp);
	return (ret);
}
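When a snapshot is loaded read-write, everything past the recorded file size is garbage and the file is truncated. The POSIX equivalent of that final step is just ftruncate(); the file name and size below are made up for illustration and this is not the WiredTiger file-handle layer:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	off_t recorded_size = 8192;	/* File size stored with the snapshot. */
	int fd;

	if ((fd = open("example.wt", O_WRONLY)) == -1) {
		perror("open");
		return (1);
	}

	/* Anything written after the snapshot is no longer interesting. */
	if (ftruncate(fd, recorded_size) == -1) {
		perror("ftruncate");
		(void)close(fd);
		return (1);
	}
	(void)close(fd);
	return (0);
}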
Example #23
/*
 * __wt_tree_walk --
 *	Move to the next/previous page in the tree.
 */
int
__wt_tree_walk(WT_SESSION_IMPL *session,
    WT_REF **refp, uint64_t *walkcntp, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_INDEX *pindex;
	WT_REF *couple, *couple_orig, *ref;
	bool empty_internal, prev, skip;
	uint32_t slot;

	btree = S2BT(session);
	empty_internal = false;

	/*
	 * Tree walks are special: they look inside page structures that splits
	 * may want to free.  Publish that the tree is active during this
	 * window.
	 */
	WT_ENTER_PAGE_INDEX(session);

	/* Walk should never instantiate deleted pages. */
	LF_SET(WT_READ_NO_EMPTY);

	/*
	 * !!!
	 * Fast-truncate currently only works on row-store trees.
	 */
	if (btree->type != BTREE_ROW)
		LF_CLR(WT_READ_TRUNCATE);

	prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;

	/*
	 * There are multiple reasons and approaches to walking the in-memory
	 * tree:
	 *
	 * (1) finding pages to evict (the eviction server);
	 * (2) writing just dirty leaves or internal nodes (checkpoint);
	 * (3) discarding pages (close);
	 * (4) truncating pages in a range (fast truncate);
	 * (5) skipping pages based on outside information (compaction);
	 * (6) cursor scans (applications).
	 *
	 * Except for cursor scans and compaction, the walk is limited to the
	 * cache, no pages are read.  In all cases, hazard pointers protect the
	 * walked pages from eviction.
	 *
	 * Walks use hazard-pointer coupling through the tree and that's OK
	 * (hazard pointers can't deadlock, so there's none of the usual
	 * problems found when logically locking up a btree).  If the eviction
	 * thread tries to evict the active page, it fails because of our
	 * hazard pointer.  If eviction tries to evict our parent, that fails
	 * because the parent has a child page that can't be discarded.  We do
	 * play one game: don't couple up to our parent and then back down to a
	 * new leaf, couple to the next page to which we're descending, it
	 * saves a hazard-pointer swap for each cursor page movement.
	 *
	 * !!!
	 * NOTE: we depend on the fact it's OK to release a page we don't hold,
	 * that is, it's OK to release couple when couple is set to NULL.
	 *
	 * Take a copy of any held page and clear the return value.  Remember
	 * the hazard pointer we're currently holding.
	 *
	 * We may be passed a pointer to btree->evict_page that we are clearing
	 * here.  We check when discarding pages that we're not discarding that
	 * page, so this clear must be done before the page is released.
	 */
	couple = couple_orig = ref = *refp;
	*refp = NULL;

	/* If no page is active, begin a walk from the start of the tree. */
	if (ref == NULL) {
		ref = &btree->root;
		if (ref->page == NULL)
			goto done;
		goto descend;
	}

ascend:	/*
	 * If the active page was the root, we've reached the walk's end.
	 * Release any hazard-pointer we're holding.
	 */
	if (__wt_ref_is_root(ref)) {
		WT_ERR(__wt_page_release(session, couple, flags));
		goto done;
	}

	/* Figure out the current slot in the WT_REF array. */
	__page_refp(session, ref, &pindex, &slot);

	for (;;) {
		/*
		 * If we're at the last/first slot on the page, return this page
		 * in post-order traversal.  Otherwise we move to the next/prev
		 * slot and left/right-most element in its subtree.
		 */
		if ((prev && slot == 0) ||
		    (!prev && slot == pindex->entries - 1)) {
			ref = ref->home->pg_intl_parent_ref;

			/*
			 * If we got all the way through an internal page and
			 * all of the child pages were deleted, mark it for
			 * eviction.
			 */
			if (empty_internal && pindex->entries > 1) {
				__wt_page_evict_soon(ref->page);
				empty_internal = false;
			}

			/* Optionally skip internal pages. */
			if (LF_ISSET(WT_READ_SKIP_INTL))
				goto ascend;

			/*
			 * We've ascended the tree and are returning an internal
			 * page.  If it's the root, discard our hazard pointer,
			 * otherwise, swap our hazard pointer for the page we'll
			 * return.
			 */
			if (__wt_ref_is_root(ref))
				WT_ERR(__wt_page_release(
				    session, couple, flags));
			else {
				/*
				 * Locate the reference to our parent page then
				 * swap our child hazard pointer for the parent.
				 * We don't handle restart or not-found returns.
				 * It would require additional complexity and is
				 * not a possible return: we're moving to the
				 * parent of the current child page, our parent
				 * reference can't have split or been evicted.
				 */
				__page_refp(session, ref, &pindex, &slot);
				if ((ret = __wt_page_swap(
				    session, couple, ref, flags)) != 0) {
					WT_TRET(__wt_page_release(
					    session, couple, flags));
					WT_ERR(ret);
				}
			}

			*refp = ref;
			goto done;
		}

		if (prev)
			--slot;
		else
			++slot;

		if (walkcntp != NULL)
			++*walkcntp;

		for (;;) {
			/*
			 * Move to the next slot, and set the reference hint if
			 * it's wrong (used when we continue the walk). We don't
			 * update those hints when splitting, so it's common for
			 * them to be incorrect in some workloads.
			 */
			ref = pindex->index[slot];
			if (ref->pindex_hint != slot)
				ref->pindex_hint = slot;

			/*
			 * If we see any child states other than deleted, the
			 * page isn't empty.
			 */
			if (ref->state != WT_REF_DELETED &&
			    !LF_ISSET(WT_READ_TRUNCATE))
				empty_internal = false;

			if (LF_ISSET(WT_READ_CACHE)) {
				/*
				 * Only look at unlocked pages in memory:
				 * fast-path some common cases.
				 */
				if (LF_ISSET(WT_READ_NO_WAIT) &&
				    ref->state != WT_REF_MEM)
					break;
			} else if (LF_ISSET(WT_READ_TRUNCATE)) {
				/*
				 * Avoid pulling a deleted page back in to try
				 * to delete it again.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref, false))
					break;
				/*
				 * If deleting a range, try to delete the page
				 * without instantiating it.
				 */
				WT_ERR(__wt_delete_page(session, ref, &skip));
				if (skip)
					break;
				empty_internal = false;
			} else if (LF_ISSET(WT_READ_COMPACT)) {
				/*
				 * Skip deleted pages, rewriting them doesn't
				 * seem useful.
				 */
				if (ref->state == WT_REF_DELETED)
					break;

				/*
				 * If the page is in-memory, we want to look at
				 * it (it may have been modified and written,
				 * and the current location is the interesting
				 * one in terms of compaction, not the original
				 * location).  If the page isn't in-memory, test
				 * if the page will help with compaction, don't
				 * read it if we don't have to.
				 */
				if (ref->state == WT_REF_DISK) {
					WT_ERR(__wt_compact_page_skip(
					    session, ref, &skip));
					if (skip)
						break;
				}
			} else {
				/*
				 * Try to skip deleted pages visible to us.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref, false))
					break;
			}

			ret = __wt_page_swap(session, couple, ref, flags);

			/*
			 * Not-found is an expected return when only walking
			 * in-cache pages, or if we see a deleted page.
			 */
			if (ret == WT_NOTFOUND) {
				ret = 0;
				break;
			}

			/*
			 * The page we're moving to might have split, in which
			 * case move to the last position we held.
			 */
			if (ret == WT_RESTART) {
				ret = 0;

				/*
				 * If this walk never coupled from the root
				 * to a saved position in the tree, restart
				 * the walk from the root.
				 */
				if (couple == &btree->root) {
					ref = &btree->root;
					if (ref->page == NULL)
						goto done;
					goto descend;
				}

				/*
				 * If restarting from some original position,
				 * repeat the increment or decrement we made at
				 * that time. Otherwise, couple is an internal
				 * page we've acquired after moving from that
				 * starting position and we can treat it as a
				 * new page. This works because we never acquire
				 * a hazard pointer on a leaf page we're not
				 * going to return to our caller, this will quit
				 * working if that ever changes.
				 */
				WT_ASSERT(session,
				    couple == couple_orig ||
				    WT_PAGE_IS_INTERNAL(couple->page));
				ref = couple;
				__page_refp(session, ref, &pindex, &slot);
				if (couple == couple_orig)
					break;
			}
			WT_ERR(ret);

			/*
			 * A new page: configure for traversal of any internal
			 * page's children, else return the leaf page.
			 */
descend:		couple = ref;
			page = ref->page;
			if (WT_PAGE_IS_INTERNAL(page)) {
				WT_INTL_INDEX_GET(session, page, pindex);
				slot = prev ? pindex->entries - 1 : 0;
				empty_internal = true;
			} else {
				*refp = ref;
				goto done;
			}
		}
	}

done:
err:	WT_LEAVE_PAGE_INDEX(session);
	return (ret);
}
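The heart of the traversal is the slot arithmetic: walk an internal page's index forward or backward, and ascend to the parent once the first/last slot has been handled. Setting the hazard-pointer coupling aside, that boundary test can be sketched on its own; walk_slots() below is an illustrative toy, not WiredTiger code:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Visit the slots of one internal page, ascending at the boundary slot. */
static void
walk_slots(uint32_t entries, bool prev)
{
	uint32_t slot;

	slot = prev ? entries - 1 : 0;
	for (;;) {
		printf("visit slot %u\n", slot);

		/* Same test as the tree walk: at the edge, go to the parent. */
		if ((prev && slot == 0) || (!prev && slot == entries - 1)) {
			printf("ascend to parent\n");
			return;
		}
		if (prev)
			--slot;
		else
			++slot;
	}
}

int
main(void)
{
	walk_slots(4, false);	/* Forward walk: slots 0, 1, 2, 3. */
	walk_slots(4, true);	/* Reverse walk: slots 3, 2, 1, 0. */
	return (0);
}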
Example #24
/*
 * __wt_txn_checkpoint_log --
 *	Write a log record for a checkpoint operation.
 */
int
__wt_txn_checkpoint_log(
    WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp)
{
	WT_DECL_ITEM(logrec);
	WT_DECL_RET;
	WT_ITEM *ckpt_snapshot, empty;
	WT_LSN *ckpt_lsn;
	WT_TXN *txn;
	uint8_t *end, *p;
	size_t recsize;
	uint32_t i, rectype = WT_LOGREC_CHECKPOINT;
	const char *fmt = WT_UNCHECKED_STRING(IIQIU);

	txn = &session->txn;
	ckpt_lsn = &txn->ckpt_lsn;

	/*
	 * If this is a file sync, log it unless there is a full checkpoint in
	 * progress.
	 */
	if (!full) {
		if (txn->full_ckpt) {
			if (lsnp != NULL)
				*lsnp = *ckpt_lsn;
			return (0);
		}
		return (__txn_log_file_sync(session, flags, lsnp));
	}

	switch (flags) {
	case WT_TXN_LOG_CKPT_PREPARE:
		txn->full_ckpt = 1;
		WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
		/*
		 * We need to make sure that the log records in the checkpoint
		 * LSN are on disk.  In particular to make sure that the
		 * current log file exists.
		 */
		WT_ERR(__wt_log_force_sync(session, ckpt_lsn));
		break;
	case WT_TXN_LOG_CKPT_START:
		/* Take a copy of the transaction snapshot. */
		txn->ckpt_nsnapshot = txn->snapshot_count;
		recsize = txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
		WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
		p = txn->ckpt_snapshot->mem;
		end = p + recsize;
		for (i = 0; i < txn->snapshot_count; i++)
			WT_ERR(__wt_vpack_uint(
			    &p, WT_PTRDIFF(end, p), txn->snapshot[i]));
		break;
	case WT_TXN_LOG_CKPT_STOP:
		/*
		 * During a clean connection close, we get here without the
		 * prepare or start steps.  In that case, log the current LSN
		 * as the checkpoint LSN.
		 */
		if (!txn->full_ckpt) {
			txn->ckpt_nsnapshot = 0;
			WT_CLEAR(empty);
			ckpt_snapshot = &empty;
			WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
		} else
			ckpt_snapshot = txn->ckpt_snapshot;

		/* Write the checkpoint log record. */
		WT_ERR(__wt_struct_size(session, &recsize, fmt,
		    rectype, ckpt_lsn->file, ckpt_lsn->offset,
		    txn->ckpt_nsnapshot, ckpt_snapshot));
		WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));

		WT_ERR(__wt_struct_pack(session,
		    (uint8_t *)logrec->data + logrec->size, recsize, fmt,
		    rectype, ckpt_lsn->file, ckpt_lsn->offset,
		    txn->ckpt_nsnapshot, ckpt_snapshot));
		logrec->size += (uint32_t)recsize;
		WT_ERR(__wt_log_write(session, logrec, lsnp,
		    F_ISSET(S2C(session), WT_CONN_CKPT_SYNC) ?
		    WT_LOG_FSYNC : 0));

		/*
		 * If this full checkpoint completed successfully and there is
		 * no hot backup in progress, tell the logging subsystem the
		 * checkpoint LSN so that it can archive.  Do not update the
		 * logging checkpoint LSN if this is during a clean connection
		 * close, only during a full checkpoint.  A clean close may not
		 * update any metadata LSN and we do not want to archive in
		 * that case.
		 */
		if (!S2C(session)->hot_backup && txn->full_ckpt)
			WT_ERR(__wt_log_ckpt(session, ckpt_lsn));

		/* FALLTHROUGH */
	case WT_TXN_LOG_CKPT_CLEANUP:
		/* Cleanup any allocated resources */
		WT_INIT_LSN(ckpt_lsn);
		txn->ckpt_nsnapshot = 0;
		__wt_scr_free(session, &txn->ckpt_snapshot);
		txn->full_ckpt = 0;
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

err:	__wt_logrec_free(session, &logrec);
	return (ret);
}
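The checkpoint record is written with a two-pass pattern: first compute the packed size (__wt_struct_size), then allocate a buffer and pack into it (__wt_struct_pack). The sketch below shows the same shape using snprintf() as a stand-in for the struct packing routines, with made-up field values:

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	char *rec;
	int size;
	uint32_t rectype = 3, file = 7;
	uint64_t offset = 4096;

	/* Pass 1: compute the record size without writing anything. */
	size = snprintf(NULL, 0, "type=%" PRIu32 ",file=%" PRIu32
	    ",offset=%" PRIu64, rectype, file, offset);
	if (size < 0)
		return (1);

	/* Pass 2: allocate exactly enough space and pack the record. */
	if ((rec = malloc((size_t)size + 1)) == NULL)
		return (1);
	(void)snprintf(rec, (size_t)size + 1, "type=%" PRIu32 ",file=%" PRIu32
	    ",offset=%" PRIu64, rectype, file, offset);

	printf("%s\n", rec);
	free(rec);
	return (0);
}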
Example #25
/*
 * __wt_lsm_checkpoint_chunk --
 *	Flush a single LSM chunk to disk.
 */
int
__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
	WT_DECL_RET;
	WT_TXN_ISOLATION saved_isolation;
	bool flush_set;

	flush_set = false;

	/*
	 * If the chunk is already checkpointed, make sure it is also evicted.
	 * Either way, there is no point trying to checkpoint it again.
	 */
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
	    !chunk->evicted) {
		WT_WITH_HANDLE_LIST_LOCK(session,
		    ret = __lsm_discard_handle(session, chunk->uri, NULL));
		if (ret == 0)
			chunk->evicted = 1;
		else if (ret == EBUSY)
			ret = 0;
		else
			WT_RET_MSG(session, ret, "discard handle");
	}
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
		__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s already on disk",
		    chunk->uri);
		return (0);
	}

	/* Stop if a running transaction needs the chunk. */
	WT_RET(__wt_txn_update_oldest(
	    session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
	if (chunk->switch_txn == WT_TXN_NONE ||
	    !__wt_txn_visible_all(session, chunk->switch_txn)) {
		__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s: running transaction, return",
		    chunk->uri);
		return (0);
	}

	if (!__wt_atomic_cas8(&chunk->flushing, 0, 1))
		return (0);
	flush_set = true;

	__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s",
	    chunk->uri);

	/*
	 * Flush the file before checkpointing: this is the expensive part in
	 * terms of I/O.
	 *
	 * !!!
	 * We can wait here for checkpoints and fsyncs to complete, which can
	 * take a long time.
	 */
	if ((ret = __wt_session_get_btree(
	    session, chunk->uri, NULL, NULL, 0)) == 0) {
		/*
		 * Set read-uncommitted: we have already checked that all of the
		 * updates in this chunk are globally visible, use the cheapest
		 * possible check in reconciliation.
		 */
		saved_isolation = session->txn.isolation;
		session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
		ret = __wt_cache_op(session, WT_SYNC_WRITE_LEAVES);
		session->txn.isolation = saved_isolation;
		WT_TRET(__wt_session_release_btree(session));
	}
	WT_ERR(ret);

	__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s",
	    chunk->uri);

	/*
	 * Turn on metadata tracking to ensure the checkpoint gets the
	 * necessary handle locks.
	 *
	 * Ensure that we don't race with a running checkpoint: the checkpoint
	 * lock protects against us racing with an application checkpoint in
	 * this chunk.  Don't wait for it, though: checkpoints can take a long
	 * time, and our checkpoint operation should be very quick.
	 */
	WT_ERR(__wt_meta_track_on(session));
	WT_WITH_CHECKPOINT_LOCK(session, ret,
	    WT_WITH_SCHEMA_LOCK(session, ret,
		ret = __wt_schema_worker(
		session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)));
	WT_TRET(__wt_meta_track_off(session, false, ret != 0));
	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM checkpoint");

	/* Now the file is written, get the chunk size. */
	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));

	/* Update the flush timestamp to help track ongoing progress. */
	__wt_epoch(session, &lsm_tree->last_flush_ts);
	++lsm_tree->chunks_flushed;

	/* Lock the tree, mark the chunk as on disk and update the metadata. */
	__wt_lsm_tree_writelock(session, lsm_tree);
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;

	/* Update the throttle time. */
	__wt_lsm_tree_throttle(session, lsm_tree, true);
	__wt_lsm_tree_writeunlock(session, lsm_tree);
	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM metadata write");

	WT_PUBLISH(chunk->flushing, 0);
	flush_set = false;

	/*
	 * Clear the no-eviction flag so the primary can be evicted and
	 * eventually closed.  Only do this once the checkpoint has succeeded:
	 * otherwise, accessing the leaf page during the checkpoint can trigger
	 * forced eviction.
	 */
	WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
	__wt_btree_evictable(session, true);
	WT_ERR(__wt_session_release_btree(session));

	/* Make sure we aren't pinning a transaction ID. */
	__wt_txn_release_snapshot(session);

	__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s",
	    chunk->uri);

	/* Schedule a bloom filter create for our newly flushed chunk. */
	if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
	else
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_MERGE, 0, lsm_tree));

err:	if (flush_set)
		WT_PUBLISH(chunk->flushing, 0);

	return (ret);
}
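Only one thread may flush a chunk at a time, and the claim is made with a compare-and-swap on chunk->flushing rather than a lock. Here is a self-contained sketch of that claim/publish pattern using C11 atomics; the names are invented and this is not the WiredTiger atomics layer:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* One flag per chunk; only the thread that wins the CAS does the flush. */
static atomic_uchar flushing;

static bool
try_claim_flush(void)
{
	unsigned char expected = 0;

	/* In spirit: __wt_atomic_cas8(&chunk->flushing, 0, 1). */
	return (atomic_compare_exchange_strong(&flushing, &expected, 1));
}

int
main(void)
{
	if (try_claim_flush())
		printf("claimed: this thread flushes the chunk\n");
	if (!try_claim_flush())
		printf("already claimed: second caller returns immediately\n");

	/* Publish completion so other threads see the flag cleared. */
	atomic_store(&flushing, 0);
	return (0);
}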
Example #26
/*
 * __las_page_instantiate --
 *	Instantiate lookaside update records in a recently read page.
 */
static int
__las_page_instantiate(WT_SESSION_IMPL *session,
                       WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size)
{
    WT_CURSOR *cursor;
    WT_CURSOR_BTREE cbt;
    WT_DECL_ITEM(current_key);
    WT_DECL_ITEM(las_addr);
    WT_DECL_ITEM(las_key);
    WT_DECL_ITEM(las_value);
    WT_DECL_RET;
    WT_PAGE *page;
    WT_UPDATE *first_upd, *last_upd, *upd;
    size_t incr, total_incr;
    uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid;
    uint32_t las_id, upd_size, session_flags;
    int exact;
    const uint8_t *p;

    cursor = NULL;
    page = ref->page;
    first_upd = last_upd = upd = NULL;
    total_incr = 0;
    current_recno = recno = WT_RECNO_OOB;
    session_flags = 0;		/* [-Werror=maybe-uninitialized] */

    __wt_btcur_init(session, &cbt);
    __wt_btcur_open(&cbt);

    WT_ERR(__wt_scr_alloc(session, 0, &current_key));
    WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
    WT_ERR(__wt_scr_alloc(session, 0, &las_key));
    WT_ERR(__wt_scr_alloc(session, 0, &las_value));

    /* Open a lookaside table cursor. */
    WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

    /*
     * The lookaside records are in key and update order, that is, there
     * will be a set of in-order updates for a key, then another set of
     * in-order updates for a subsequent key. We process all of the updates
     * for a key and then insert those updates into the page, then all the
     * updates for the next key, and so on.
     *
     * Search for the block's unique prefix, stepping through any matching
     * records.
     */
    las_addr->data = addr;
    las_addr->size = addr_size;
    las_key->size = 0;
    cursor->set_key(
        cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
    if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
        ret = cursor->next(cursor);
    for (; ret == 0; ret = cursor->next(cursor)) {
        WT_ERR(cursor->get_key(cursor,
                               &las_id, las_addr, &las_counter, &las_txnid, las_key));

        /*
         * Confirm the search using the unique prefix; if not a match,
         * we're done searching for records for this page.
         */
        if (las_id != read_id ||
                las_addr->size != addr_size ||
                memcmp(las_addr->data, addr, addr_size) != 0)
            break;

        /*
         * If the on-page value has become globally visible, this record
         * is no longer needed.
         */
        if (__wt_txn_visible_all(session, las_txnid))
            continue;

        /* Allocate the WT_UPDATE structure. */
        WT_ERR(cursor->get_value(
                   cursor, &upd_txnid, &upd_size, las_value));
        WT_ERR(__wt_update_alloc(session,
                                 (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value,
                                 &upd, &incr));
        total_incr += incr;
        upd->txnid = upd_txnid;

        switch (page->type) {
        case WT_PAGE_COL_FIX:
        case WT_PAGE_COL_VAR:
            p = las_key->data;
            WT_ERR(__wt_vunpack_uint(&p, 0, &recno));
            if (current_recno == recno)
                break;
            WT_ASSERT(session, current_recno < recno);

            if (first_upd != NULL) {
                WT_ERR(__col_instantiate(session,
                                         current_recno, ref, &cbt, first_upd));
                first_upd = NULL;
            }
            current_recno = recno;
            break;
        case WT_PAGE_ROW_LEAF:
            if (current_key->size == las_key->size &&
                    memcmp(current_key->data,
                           las_key->data, las_key->size) == 0)
                break;

            if (first_upd != NULL) {
                WT_ERR(__row_instantiate(session,
                                         current_key, ref, &cbt, first_upd));
                first_upd = NULL;
            }
            WT_ERR(__wt_buf_set(session,
                                current_key, las_key->data, las_key->size));
            break;
            WT_ILLEGAL_VALUE_ERR(session);
        }

        /* Append the latest update to the list. */
        if (first_upd == NULL)
            first_upd = last_upd = upd;
        else {
            last_upd->next = upd;
            last_upd = upd;
        }
        upd = NULL;
    }
    WT_ERR_NOTFOUND_OK(ret);

    /* Insert the last set of updates, if any. */
    if (first_upd != NULL)
        switch (page->type) {
        case WT_PAGE_COL_FIX:
        case WT_PAGE_COL_VAR:
            WT_ERR(__col_instantiate(session,
                                     current_recno, ref, &cbt, first_upd));
            first_upd = NULL;
            break;
        case WT_PAGE_ROW_LEAF:
            WT_ERR(__row_instantiate(session,
                                     current_key, ref, &cbt, first_upd));
            first_upd = NULL;
            break;
            WT_ILLEGAL_VALUE_ERR(session);
        }

    /* Discard the cursor. */
    WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags));

    if (total_incr != 0) {
        __wt_cache_page_inmem_incr(session, page, total_incr);

        /*
         * We've modified/dirtied the page, but that's not necessary and
         * if we keep the page clean, it's easier to evict. We leave the
         * lookaside table updates in place, so if we evict this page
         * without dirtying it, any future instantiation of it will find
         * the records it needs. If the page is dirtied before eviction,
         * then we'll write any needed lookaside table records for the
         * new location of the page.
         */
        __wt_page_modify_clear(session, page);
    }

err:
    WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
    WT_TRET(__wt_btcur_close(&cbt, true));

    /*
     * On error, upd points to a single unlinked WT_UPDATE structure,
     * first_upd points to a list.
     */
    if (upd != NULL)
        __wt_free(session, upd);
    if (first_upd != NULL)
        __wt_free_update_list(session, first_upd);

    __wt_scr_free(session, &current_key);
    __wt_scr_free(session, &las_addr);
    __wt_scr_free(session, &las_key);
    __wt_scr_free(session, &las_value);

    return (ret);
}
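The lookaside scan positions a cursor at (or just before) the block's unique prefix with search_near(), steps forward, and stops at the first key that no longer carries the prefix. The same control flow, reduced to a sorted array of strings with hypothetical keys:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *keys[] =
	    { "aaa", "block/1", "block/2", "block/3", "zzz" };
	const char *prefix = "block/";
	size_t i, n, plen;

	n = sizeof(keys) / sizeof(keys[0]);
	plen = strlen(prefix);

	/* "search_near": position at the first key >= the prefix. */
	for (i = 0; i < n && strcmp(keys[i], prefix) < 0; ++i)
		;

	/* Step through matches; the first non-matching key ends the scan. */
	for (; i < n; ++i) {
		if (strncmp(keys[i], prefix, plen) != 0)
			break;
		printf("match: %s\n", keys[i]);
	}
	return (0);
}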
Example #27
/*
 * __wt_lsm_free_chunks --
 *	Try to drop chunks from the tree that are no longer required.
 */
int
__wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_WORKER_COOKIE cookie;
	u_int i, skipped;
	int drop_ret;
	bool flush_metadata;

	flush_metadata = false;

	if (lsm_tree->nold_chunks == 0)
		return (0);

	/*
	 * Make sure only a single thread is freeing the old chunk array
	 * at any time.
	 */
	if (!__wt_atomic_cas32(&lsm_tree->freeing_old_chunks, 0, 1))
		return (0);
	/*
	 * Take a copy of the current state of the LSM tree and look for chunks
	 * to drop.  We do it this way to avoid holding the LSM tree lock while
	 * doing I/O or waiting on the schema lock.
	 *
	 * This is safe because only one thread will be in this function at a
	 * time.  Merges may complete concurrently, and the old_chunks array
	 * may be extended, but we shuffle down the pointers each time we free
	 * one to keep the non-NULL slots at the beginning of the array.
	 */
	WT_CLEAR(cookie);
	WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, true));
	for (i = skipped = 0; i < cookie.nchunks; i++) {
		chunk = cookie.chunk_array[i];
		WT_ASSERT(session, chunk != NULL);
		/* Skip the chunk if another worker is using it. */
		if (chunk->refcnt > 1) {
			++skipped;
			continue;
		}

		/*
		 * Don't remove files if a hot backup is in progress.
		 *
		 * The schema lock protects the set of live files, this check
		 * prevents us from removing a file that hot backup already
		 * knows about.
		 */
		if (S2C(session)->hot_backup)
			break;

		/*
		 * Drop any bloom filters and chunks we can.  Don't try to drop
		 * a chunk if the bloom filter drop fails.
		 *
		 * An EBUSY return indicates that a cursor is still open in the
		 * tree - move to the next chunk in that case.  An ENOENT return
		 * indicates that the LSM tree metadata was out of sync with the
		 * on-disk state; update the metadata to match in that case.
		 */
		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
			drop_ret = __lsm_drop_file(session, chunk->bloom_uri);
			if (drop_ret == EBUSY) {
				++skipped;
				continue;
			} else if (drop_ret != ENOENT)
				WT_ERR(drop_ret);

			flush_metadata = true;
			F_CLR(chunk, WT_LSM_CHUNK_BLOOM);
		}
		if (chunk->uri != NULL) {
			drop_ret = __lsm_drop_file(session, chunk->uri);
			if (drop_ret == EBUSY) {
				++skipped;
				continue;
			} else if (drop_ret != ENOENT)
				WT_ERR(drop_ret);
			flush_metadata = true;
		}

		/* Lock the tree to clear out the old chunk information. */
		__wt_lsm_tree_writelock(session, lsm_tree);

		/*
		 * The chunk we are looking at should be the first one in the
		 * tree that we haven't already skipped over.
		 */
		WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk);
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, lsm_tree->old_chunks[skipped]);

		/* Shuffle down to keep all occupied slots at the beginning. */
		if (--lsm_tree->nold_chunks > skipped) {
			memmove(lsm_tree->old_chunks + skipped,
			    lsm_tree->old_chunks + skipped + 1,
			    (lsm_tree->nold_chunks - skipped) *
			    sizeof(WT_LSM_CHUNK *));
			lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL;
		}

		__wt_lsm_tree_writeunlock(session, lsm_tree);

		/*
		 * Clear the chunk in the cookie so we don't attempt to
		 * decrement the reference count.
		 */
		cookie.chunk_array[i] = NULL;
	}

err:	/* Flush the metadata unless the system is in panic */
	if (flush_metadata && ret != WT_PANIC) {
		__wt_lsm_tree_writelock(session, lsm_tree);
		WT_TRET(__wt_lsm_meta_write(session, lsm_tree));
		__wt_lsm_tree_writeunlock(session, lsm_tree);
	}
	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);
	lsm_tree->freeing_old_chunks = 0;

	/* Returning non-zero means there is no work to do. */
	if (!flush_metadata)
		WT_TRET(WT_NOTFOUND);

	return (ret);
}
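Freed slots in the old_chunks array are closed up by shuffling the remaining pointers down with memmove(), so the occupied slots always sit at the front. A toy version of that step, with hypothetical chunk names:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *chunks[4] = { "chunk-1", "chunk-2", "chunk-3", "chunk-4" };
	size_t i, nchunks, drop;

	nchunks = 4;
	drop = 1;			/* Free the second entry. */

	/* Shuffle later entries down so occupied slots stay at the front. */
	if (--nchunks > drop)
		memmove(&chunks[drop], &chunks[drop + 1],
		    (nchunks - drop) * sizeof(chunks[0]));
	chunks[nchunks] = NULL;

	for (i = 0; i < nchunks; ++i)
		printf("%s\n", chunks[i]);	/* chunk-1, chunk-3, chunk-4 */
	return (0);
}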
Example #28
/*
 * __wt_las_remove_block --
 *	Remove all records matching a key prefix from the lookaside store.
 */
int
__wt_las_remove_block(WT_SESSION_IMPL *session,
                      WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size)
{
    WT_DECL_ITEM(las_addr);
    WT_DECL_ITEM(las_key);
    WT_DECL_RET;
    uint64_t las_counter, las_txnid;
    int64_t remove_cnt;
    uint32_t las_id;
    int exact;

    remove_cnt = 0;

    WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
    WT_ERR(__wt_scr_alloc(session, 0, &las_key));

    /*
     * Search for the block's unique prefix and step through all matching
     * records, removing them.
     */
    las_addr->data = addr;
    las_addr->size = addr_size;
    las_key->size = 0;
    cursor->set_key(
        cursor, btree_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
    if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
        ret = cursor->next(cursor);
    for (; ret == 0; ret = cursor->next(cursor)) {
        WT_ERR(cursor->get_key(cursor,
                               &las_id, las_addr, &las_counter, &las_txnid, las_key));

        /*
         * Confirm the search using the unique prefix; if not a match,
         * we're done searching for records for this page.
         */
        if (las_id != btree_id ||
                las_addr->size != addr_size ||
                memcmp(las_addr->data, addr, addr_size) != 0)
            break;

        /*
         * Cursor opened overwrite=true: won't return WT_NOTFOUND should
         * another thread remove the record before we do, and the cursor
         * remains positioned in that case.
         */
        WT_ERR(cursor->remove(cursor));
        ++remove_cnt;
    }
    WT_ERR_NOTFOUND_OK(ret);

err:
    __wt_scr_free(session, &las_addr);
    __wt_scr_free(session, &las_key);

    /*
     * If there were races to remove records, we can over-count.  All
     * arithmetic is signed, so underflow isn't fatal, but check anyway so
     * we don't skew low over time.
     */
    if (remove_cnt > S2C(session)->las_record_cnt)
        S2C(session)->las_record_cnt = 0;
    else if (remove_cnt > 0)
        (void)__wt_atomic_subi64(
            &S2C(session)->las_record_cnt, remove_cnt);

    return (ret);
}
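Because racing removals can over-count, the shared record counter is clamped at zero rather than allowed to go negative. The guard on its own, with local variables standing in for the connection-wide counter:

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	int64_t record_cnt = 5;		/* Shared counter (stand-in). */
	int64_t remove_cnt = 8;		/* Racy removals can over-count. */

	/* Clamp at zero instead of driving the counter negative. */
	if (remove_cnt > record_cnt)
		record_cnt = 0;
	else if (remove_cnt > 0)
		record_cnt -= remove_cnt;

	printf("records remaining: %" PRId64 "\n", record_cnt);
	return (0);
}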
Example #29
/*
 * __wt_lsm_tree_create --
 *	Create an LSM tree structure for the given name.
 */
int
__wt_lsm_tree_create(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	const char *cfg[] =
	    { WT_CONFIG_BASE(session, session_create), config, NULL };
	const char *tmpconfig;

	/* If the tree is open, it already exists. */
	if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) {
		__wt_lsm_tree_release(session, lsm_tree);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	/*
	 * If the tree has metadata, it already exists.
	 *
	 * !!!
	 * Use a local variable: we don't care what the existing configuration
	 * is, but we don't want to overwrite the real config.
	 */
	if ((ret = __wt_metadata_search(session, uri, &tmpconfig)) == 0) {
		__wt_free(session, tmpconfig);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
	if (WT_STRING_MATCH("r", cval.str, cval.len))
		WT_RET_MSG(session, EINVAL,
		    "LSM trees cannot be configured as column stores");

	WT_RET(__wt_calloc_def(session, 1, &lsm_tree));

	WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));

	WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->key_format));
	WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->value_format));

	WT_ERR(__wt_config_gets(session, cfg, "collator", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->collator_name));

	WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval));
	if (cval.val)
		F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
	else
		F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval));
	FLD_SET(lsm_tree->bloom,
	    (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED));
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval));
	if (cval.val != 0)
		FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
	    FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))
		WT_ERR_MSG(session, EINVAL,
		    "Bloom filters can only be created on newest and oldest "
		    "chunks if bloom filters are enabled");

	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval));
	if (cval.type == WT_CONFIG_ITEM_STRUCT) {
		cval.str++;
		cval.len -= 2;
	}
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->bloom_config));

	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval));
	lsm_tree->bloom_bit_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval));
	lsm_tree->bloom_hash_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval));
	lsm_tree->chunk_max = (uint64_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval));
	lsm_tree->chunk_size = (uint64_t)cval.val;
	if (lsm_tree->chunk_size > lsm_tree->chunk_max)
		WT_ERR_MSG(session, EINVAL,
		    "Chunk size (chunk_size) must be smaller than or equal to "
		    "the maximum chunk size (chunk_max)");
	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval));
	lsm_tree->merge_max = (uint32_t)cval.val;
	lsm_tree->merge_min = lsm_tree->merge_max / 2;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_threads", &cval));
	lsm_tree->merge_threads = (uint32_t)cval.val;
	/* Sanity check that api_data.py is in sync with lsm.h */
	WT_ASSERT(session, lsm_tree->merge_threads <= WT_LSM_MAX_WORKERS);

	/*
	 * Set up the config for each chunk.  If possible, avoid high latencies
	 * from fsync by flushing the cache every 8MB (will be overridden by
	 * any application setting).
	 */
	tmpconfig = "";
#ifdef HAVE_SYNC_FILE_RANGE
	if (!S2C(session)->direct_io)
		tmpconfig = "os_cache_dirty_max=8MB,";
#endif
	WT_ERR(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf,
	    "%s%s,key_format=u,value_format=u", tmpconfig, config));
	lsm_tree->file_config = __wt_buf_steal(session, buf, NULL);

	/* Create the first chunk and flush the metadata. */
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

	/* Discard our partially populated handle. */
	ret = __lsm_tree_discard(session, lsm_tree);
	lsm_tree = NULL;

	/*
	 * Open our new tree and add it to the handle cache. Don't discard on
	 * error: the returned handle is NULL on error, and the metadata
	 * tracking macros handle cleaning up on failure.
	 */
	if (ret == 0)
		ret = __lsm_tree_open(session, uri, &lsm_tree);
	if (ret == 0)
		__wt_lsm_tree_release(session, lsm_tree);

	if (0) {
err:		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	__wt_scr_free(&buf);
	return (ret);
}
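Bloom configuration is stored as bit flags, and the combination "bloom off" plus "bloom on the oldest chunk" is rejected up front. Below is a small sketch of that FLD_SET()/FLD_ISSET()-style validation with made-up flag values; it is not the real lsm.h flag layout:

#include <stdint.h>
#include <stdio.h>

#define BLOOM_OFF	0x01u
#define BLOOM_MERGED	0x02u
#define BLOOM_OLDEST	0x04u

int
main(void)
{
	uint32_t bloom = 0;

	/* Parsed from configuration: bloom disabled, oldest-chunk requested. */
	bloom |= BLOOM_OFF;
	bloom |= BLOOM_OLDEST;

	/* The two settings are incompatible; fail the create. */
	if ((bloom & BLOOM_OFF) != 0 && (bloom & BLOOM_OLDEST) != 0) {
		fprintf(stderr, "bloom filters requested on oldest chunks, "
		    "but bloom filters are disabled\n");
		return (1);
	}
	return (0);
}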
Example #30
/*
 * __wt_txn_commit --
 *	Commit the current transaction.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_OP *op;
	u_int i;
	bool locked, readonly;
#ifdef HAVE_TIMESTAMPS
	wt_timestamp_t prev_commit_timestamp, ts;
	bool update_timestamp;
#endif

	txn = &session->txn;
	conn = S2C(session);
	txn_global = &conn->txn_global;
	locked = false;

	WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
	WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) ||
	    txn->mod_count == 0);

	readonly = txn->mod_count == 0;
	/*
	 * Look for a commit timestamp.
	 */
	WT_ERR(
	    __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval));
	if (cval.len != 0) {
#ifdef HAVE_TIMESTAMPS
		WT_ERR(__wt_txn_parse_timestamp(session, "commit", &ts, &cval));
		WT_ERR(__wt_timestamp_validate(session,
		    "commit", &ts, &cval, true, true, true));
		__wt_timestamp_set(&txn->commit_timestamp, &ts);
		__wt_txn_set_commit_timestamp(session);
#else
		WT_ERR_MSG(session, EINVAL, "commit_timestamp requires a "
		    "version of WiredTiger built with timestamp support");
#endif
	}

#ifdef HAVE_TIMESTAMPS
	/*
	 * Debugging checks on timestamps, if user requested them.
	 */
	if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) &&
	    !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
	    txn->mod_count != 0)
		WT_ERR_MSG(session, EINVAL, "commit_timestamp required and "
		    "none set on this transaction");
	if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) &&
	    F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
	    txn->mod_count != 0)
		WT_ERR_MSG(session, EINVAL, "no commit_timestamp required and "
		    "timestamp set on this transaction");
#endif
	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval));

	/*
	 * If the user chose the default setting, check whether sync is enabled
	 * for this transaction (either inherited or via begin_transaction).
	 * If sync is disabled, clear the field to avoid the log write being
	 * flushed.
	 *
	 * Otherwise check for specific settings.  We don't need to check for
	 * "on" because that is the default inherited from the connection.  If
	 * the user set anything in begin_transaction, we only override with an
	 * explicit setting.
	 */
	if (cval.len == 0) {
		if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) &&
		    !F_ISSET(txn, WT_TXN_SYNC_SET))
			txn->txn_logsync = 0;
	} else {
		/*
		 * If the caller already set sync on begin_transaction then
		 * they should not be using sync on commit_transaction.
		 * Flag that as an error.
		 */
		if (F_ISSET(txn, WT_TXN_SYNC_SET))
			WT_ERR_MSG(session, EINVAL,
			    "Sync already set during begin_transaction");
		if (WT_STRING_MATCH("background", cval.str, cval.len))
			txn->txn_logsync = WT_LOG_BACKGROUND;
		else if (WT_STRING_MATCH("off", cval.str, cval.len))
			txn->txn_logsync = 0;
		/*
		 * We don't need to check for "on" here because that is the
		 * default to inherit from the connection setting.
		 */
	}

	/* Commit notification. */
	if (txn->notify != NULL)
		WT_ERR(txn->notify->notify(txn->notify,
		    (WT_SESSION *)session, txn->id, 1));

	/*
	 * We are about to release the snapshot: copy values into any
	 * positioned cursors so they don't point to updates that could be
	 * freed once we don't have a snapshot.
	 */
	if (session->ncursors > 0) {
		WT_DIAGNOSTIC_YIELD;
		WT_ERR(__wt_session_copy_values(session));
	}

	/* If we are logging, write a commit log record. */
	if (txn->logrec != NULL &&
	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
	    !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
		/*
		 * We are about to block on I/O writing the log.
		 * Release our snapshot in case it is keeping data pinned.
		 * This is particularly important for checkpoints.
		 */
		__wt_txn_release_snapshot(session);
		/*
		 * We hold the visibility lock for reading from the time
		 * we write our log record until the time we release our
		 * transaction so that the LSN any checkpoint gets will
		 * always reflect visible data.
		 */
		__wt_readlock(session, &txn_global->visibility_rwlock);
		locked = true;
		WT_ERR(__wt_txn_log_commit(session, cfg));
	}

	/* Note: we're going to commit: nothing can fail after this point. */

	/* Process and free updates. */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
		switch (op->type) {
		case WT_TXN_OP_BASIC:
		case WT_TXN_OP_BASIC_TS:
		case WT_TXN_OP_INMEM:
			/*
			 * Switch reserved operations to abort to
			 * simplify obsolete update list truncation.
			 */
			if (op->u.upd->type == WT_UPDATE_RESERVED) {
				op->u.upd->txnid = WT_TXN_ABORTED;
				break;
			}

			/*
			 * Writes to the lookaside file can be evicted as soon
			 * as they commit.
			 */
			if (conn->cache->las_fileid != 0 &&
			    op->fileid == conn->cache->las_fileid) {
				op->u.upd->txnid = WT_TXN_NONE;
				break;
			}

#ifdef HAVE_TIMESTAMPS
			if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
			    op->type != WT_TXN_OP_BASIC_TS) {
				WT_ASSERT(session,
				    op->fileid != WT_METAFILE_ID);
				__wt_timestamp_set(&op->u.upd->timestamp,
				    &txn->commit_timestamp);
			}
#endif
			break;

		case WT_TXN_OP_REF:
#ifdef HAVE_TIMESTAMPS
			if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
				__wt_timestamp_set(
				    &op->u.ref->page_del->timestamp,
				    &txn->commit_timestamp);
#endif
			break;

		case WT_TXN_OP_TRUNCATE_COL:
		case WT_TXN_OP_TRUNCATE_ROW:
			/* Other operations don't need timestamps. */
			break;
		}

		__wt_txn_op_free(session, op);
	}
	txn->mod_count = 0;

#ifdef HAVE_TIMESTAMPS
	/*
	 * Track the largest commit timestamp we have seen.
	 *
	 * We don't actually clear the local commit timestamp, just the flag.
	 * That said, we can't update the global commit timestamp until this
	 * transaction is visible, which happens when we release it.
	 */
	update_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
#endif

	__wt_txn_release(session);
	if (locked)
		__wt_readunlock(session, &txn_global->visibility_rwlock);

#ifdef HAVE_TIMESTAMPS
	/* First check if we've already committed something in the future. */
	if (update_timestamp) {
		WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
		    __wt_timestamp_set(
			&prev_commit_timestamp, &txn_global->commit_timestamp));
		update_timestamp = __wt_timestamp_cmp(
		    &txn->commit_timestamp, &prev_commit_timestamp) > 0;
	}

	/*
	 * If it looks like we need to move the global commit timestamp,
	 * write lock and re-check.
	 */
	if (update_timestamp) {
#if WT_TIMESTAMP_SIZE == 8
		while (__wt_timestamp_cmp(
		    &txn->commit_timestamp, &prev_commit_timestamp) > 0) {
			if (__wt_atomic_cas64(
			    &txn_global->commit_timestamp.val,
			    prev_commit_timestamp.val,
			    txn->commit_timestamp.val)) {
				txn_global->has_commit_timestamp = true;
				break;
			}
		    __wt_timestamp_set(
			&prev_commit_timestamp, &txn_global->commit_timestamp);
		}
#else
		__wt_writelock(session, &txn_global->rwlock);
		if (__wt_timestamp_cmp(&txn->commit_timestamp,
		    &txn_global->commit_timestamp) > 0) {
			__wt_timestamp_set(&txn_global->commit_timestamp,
			    &txn->commit_timestamp);
			txn_global->has_commit_timestamp = true;
		}
		__wt_writeunlock(session, &txn_global->rwlock);
#endif
	}
#endif

	/*
	 * We're between transactions, if we need to block for eviction, it's
	 * a good time to do so.  Note that we must ignore any error return
	 * because the user's data is committed.
	 */
	if (!readonly)
		(void)__wt_cache_eviction_check(session, false, false, NULL);
	return (0);

err:	/*
	 * If anything went wrong, roll back.
	 *
	 * !!!
	 * Nothing can fail after this point.
	 */
	if (locked)
		__wt_readunlock(session, &txn_global->visibility_rwlock);
	WT_TRET(__wt_txn_rollback(session, cfg));
	return (ret);
}
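On 8-byte timestamps the global commit timestamp is advanced with a compare-and-swap loop so it only ever moves forward, even when many transactions commit concurrently. A self-contained sketch of that loop using C11 atomics; the variable and function names are invented for illustration:

#include <inttypes.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic uint64_t global_commit_ts;

/* Advance the global commit timestamp only if ours is larger; never go back. */
static void
advance_commit_ts(uint64_t my_ts)
{
	uint64_t prev;

	prev = atomic_load(&global_commit_ts);
	while (my_ts > prev)
		if (atomic_compare_exchange_weak(&global_commit_ts, &prev, my_ts))
			break;
	/* On CAS failure, prev is refreshed with the current value and retested. */
}

int
main(void)
{
	advance_commit_ts(100);
	advance_commit_ts(90);		/* Ignored: smaller than the current value. */
	advance_commit_ts(120);
	printf("global commit timestamp: %" PRIu64 "\n",
	    atomic_load(&global_commit_ts));
	return (0);
}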