示例#1
0
/*
 * __truncate_table --
 *	WT_SESSION::truncate for a table.
 */
static int
__truncate_table(WT_SESSION_IMPL *session, const char *name)
{
	WT_BTREE *btree;
	WT_DECL_ITEM(namebuf);
	WT_DECL_RET;
	WT_TABLE *table;
	int i;

	WT_RET(__wt_schema_get_table(session, name, strlen(name), 0, &table));
	WT_RET(__wt_scr_alloc(session, 0, &namebuf));

	/* Truncate the column groups. */
	for (i = 0; i < WT_COLGROUPS(table); i++) {
		/*
		 * Get an exclusive lock on the handle: it will be released by
		 * __wt_conn_btree_close_all.
		 */
		WT_ERR(__wt_session_get_btree(session,
		    table->cgroups[i]->source, NULL, NULL, WT_BTREE_EXCLUSIVE));
		btree = session->btree;
		WT_ERR(__wt_buf_set(
		    session, namebuf, btree->name, strlen(btree->name) + 1));
		WT_ERR(__truncate_file(session, namebuf->data));
	}

	/* Truncate the indices. */
	WT_ERR(__wt_schema_open_indices(session, table));
	for (i = 0; i < table->nindices; i++) {
		/*
		 * Get an exclusive lock on the handle: it will be released by
		 * __wt_conn_btree_close_all.
		 */
		WT_ERR(__wt_session_get_btree(session,
		    table->indices[i]->source, NULL, NULL, WT_BTREE_EXCLUSIVE));
		btree = session->btree;
		WT_ERR(__wt_buf_set(
		    session, namebuf, btree->name, strlen(btree->name) + 1));
		WT_ERR(__truncate_file(session, namebuf->data));
	}

	table->idx_complete = 0;

	/* Reopen the column groups. */
	ret = __wt_schema_open_colgroups(session, table);

err:	__wt_scr_free(&namebuf);
	return (ret);
}
示例#2
0
/*
 * __wt_metadata_open --
 *	Opens the metadata file, sets session->meta_dhandle.
 */
int
__wt_metadata_open(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;

	if (session->meta_dhandle != NULL)
		return (0);

	WT_RET(__wt_session_get_btree(session, WT_METAFILE_URI, NULL, NULL, 0));

	session->meta_dhandle = session->dhandle;
	WT_ASSERT(session, session->meta_dhandle != NULL);

	/* 
	 * Set special flags for the metadata file: eviction (the metadata file
	 * is in-memory and never evicted), logging (the metadata file is always
	 * logged if possible).
	 *
	 * Test flags before setting them so updates can't race in subsequent
	 * opens (the first update is safe because it's single-threaded from
	 * wiredtiger_open).
	 */
	btree = S2BT(session);
	if (!F_ISSET(btree, WT_BTREE_IN_MEMORY))
		F_SET(btree, WT_BTREE_IN_MEMORY);
	if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
		F_SET(btree, WT_BTREE_NO_EVICTION);
	if (F_ISSET(btree, WT_BTREE_NO_LOGGING))
		F_CLR(btree, WT_BTREE_NO_LOGGING);

	/* The metadata handle doesn't need to stay locked -- release it. */
	return (__wt_session_release_btree(session));
}
示例#3
0
/*
 * __truncate_file --
 *	WT_SESSION::truncate for a file.
 */
static int
__truncate_file(WT_SESSION_IMPL *session, const char *name)
{
	WT_DECL_RET;
	const char *filename;
	uint32_t allocsize;

	filename = name;
	if (!WT_PREFIX_SKIP(filename, "file:"))
		return (EINVAL);

	/* Open and lock the file. */
	WT_RET(__wt_session_get_btree(
	    session, name, NULL, NULL, WT_DHANDLE_EXCLUSIVE));

	/* Get the allocation size. */
	allocsize = S2BT(session)->allocsize;

	WT_RET(__wt_session_release_btree(session));

	/* Close any btree handles in the file. */
	WT_WITH_DHANDLE_LOCK(session,
	    ret = __wt_conn_dhandle_close_all(session, name, 0));
	WT_RET(ret);

	/* Delete the root address and truncate the file. */
	WT_RET(__wt_meta_checkpoint_clear(session, name));
	WT_RET(__wt_block_manager_truncate(session, filename, allocsize));

	return (0);
}
示例#4
0
/*
 * __conn_btree_apply_internal --
 *	Apply a function to the open btree handles.
 */
static int
__conn_btree_apply_internal(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle,
    int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
{
	WT_DECL_RET;

	/*
	 * We need to pull the handle into the session handle
	 * cache and make sure it's referenced to stop other
	 * internal code dropping the handle (e.g in LSM when
	 * cleaning up obsolete chunks). Holding the metadata
	 * lock isn't enough.
	 */
	ret = __wt_session_get_btree(session,
	    dhandle->name, dhandle->checkpoint, NULL, 0);
	if (ret == 0) {
		ret = func(session, cfg);
		if (WT_META_TRACKING(session))
			WT_TRET(__wt_meta_track_handle_lock(session, 0));
		else
			WT_TRET(__wt_session_release_btree(session));
	} else if (ret == EBUSY)
		ret = __wt_conn_btree_apply_single(session, dhandle->name,
		    dhandle->checkpoint, func, cfg);
	return (ret);
}
示例#5
0
/*
 * __lsm_discard_handle --
 *	Try to discard a handle from cache.
 */
static int
__lsm_discard_handle(
    WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
	WT_DECL_RET;
	int locked;

	/* This will fail with EBUSY if the file is still in use. */
	WT_RET(__wt_session_get_btree(session, uri, checkpoint, NULL,
	    WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY));

	WT_ASSERT(session, S2BT(session)->modified == 0);

	/*
	 * We need the checkpoint lock to discard in-memory handles: otherwise,
	 * an application checkpoint could see this file locked and fail with
	 * EBUSY.
	 *
	 * We can't get the checkpoint lock earlier or it will deadlock with
	 * the schema lock.
	 */
	locked = 0;
	if (checkpoint == NULL && (ret =
	    __wt_spin_trylock(session, &S2C(session)->checkpoint_lock)) == 0)
		locked = 1;
	if (ret == 0)
		F_SET(session->dhandle, WT_DHANDLE_DISCARD);
	WT_TRET(__wt_session_release_btree(session));
	if (locked)
		__wt_spin_unlock(session, &S2C(session)->checkpoint_lock);

	return (ret);
}
示例#6
0
/*
 * __lsm_discard_handle --
 *	Try to discard a handle from cache.
 */
static int
__lsm_discard_handle(
    WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
	/* This will fail with EBUSY if the file is still in use. */
	WT_RET(__wt_session_get_btree(session, uri, checkpoint, NULL,
	    WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY));

	F_SET(session->dhandle, WT_DHANDLE_DISCARD_FORCE);
	return (__wt_session_release_btree(session));
}
示例#7
0
/*
 * __backup_list_append --
 *	Append a new file name to the list, allocate space as necessary.
 */
static int
__backup_list_append(
    WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *uri)
{
	WT_CURSOR_BACKUP_ENTRY *p;
	WT_DATA_HANDLE *old_dhandle;
	WT_DECL_RET;
	const char *name;
	int need_handle;

	/* Leave a NULL at the end to mark the end of the list. */
	WT_RET(__wt_realloc_def(session, &cb->list_allocated,
	    cb->list_next + 2, &cb->list));
	p = &cb->list[cb->list_next];
	p[0].name = p[1].name = NULL;
	p[0].handle = p[1].handle = NULL;

	need_handle = 0;
	name = uri;
	if (WT_PREFIX_MATCH(uri, "file:")) {
		need_handle = 1;
		name += strlen("file:");
	}

	/*
	 * !!!
	 * Assumes metadata file entries map one-to-one to physical files.
	 * To support a block manager where that's not the case, we'd need
	 * to call into the block manager and get a list of physical files
	 * that map to this logical "file".  I'm not going to worry about
	 * that for now, that block manager might not even support physical
	 * copying of files by applications.
	 */
	WT_RET(__wt_strdup(session, name, &p->name));

	/*
	 * If it's a file in the database, get a handle for the underlying
	 * object (this handle blocks schema level operations, for example
	 * WT_SESSION.drop or an LSM file discard after level merging).
	 */
	if (need_handle) {
		old_dhandle = session->dhandle;
		if ((ret =
		    __wt_session_get_btree(session, uri, NULL, NULL, 0)) == 0)
			p->handle = session->dhandle;
		session->dhandle = old_dhandle;
		WT_RET(ret);
	}

	++cb->list_next;
	return (0);
}
示例#8
0
/*
 * __wt_metadata_open --
 *	Opens the metadata file, sets session->metafile.
 */
int
__wt_metadata_open(WT_SESSION_IMPL *session)
{
	if (session->metafile != NULL)
		return (0);

	WT_RET(__wt_session_get_btree(session, WT_METADATA_URI, NULL, NULL, 0));

	session->metafile = S2BT(session);
	WT_ASSERT(session, session->metafile != NULL);

	/* The metafile doesn't need to stay locked -- release it. */
	return (__wt_session_release_btree(session));
}
示例#9
0
/*
 * __wt_meta_btree_apply --
 *	Apply a function to all files listed in the metadata, apart from the
 *	metadata file.
 */
int
__wt_meta_btree_apply(WT_SESSION_IMPL *session,
    int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
{
	WT_CURSOR *cursor;
	WT_DATA_HANDLE *saved_dhandle;
	WT_DECL_RET;
	const char *uri;
	int cmp, tret;

	saved_dhandle = session->dhandle;
	WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
	cursor->set_key(cursor, "file:");
	if ((tret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
		tret = cursor->next(cursor);
	for (; tret == 0; tret = cursor->next(cursor)) {
		WT_ERR(cursor->get_key(cursor, &uri));
		if (!WT_PREFIX_MATCH(uri, "file:"))
			break;
		else if (strcmp(uri, WT_METAFILE_URI) == 0)
			continue;

		/*
		 * We need to pull the handle into the session handle cache
		 * and make sure it's referenced to stop other internal code
		 * dropping the handle (e.g in LSM when cleaning up obsolete
		 * chunks).  Holding the metadata lock isn't enough.
		 */
		ret = __wt_session_get_btree(session, uri, NULL, NULL, 0);
		if (ret == 0) {
			WT_SAVE_DHANDLE(session,
			    ret = func(session, cfg));
			if (WT_META_TRACKING(session))
				WT_TRET(
				    __wt_meta_track_handle_lock(session, 0));
			else
				WT_TRET(__wt_session_release_btree(session));
		} else if (ret == EBUSY)
			ret = __wt_conn_btree_apply_single(
			    session, uri, NULL, func, cfg);
		WT_ERR(ret);
	}

	if (tret != WT_NOTFOUND)
		WT_TRET(tret);
err:	WT_TRET(cursor->close(cursor));
	session->dhandle = saved_dhandle;
	return (ret);
}
示例#10
0
/*
 * __drop_file --
 *	Drop a file.
 */
static int
__drop_file(
    WT_SESSION_IMPL *session, const char *uri, int force, const char *cfg[])
{
	WT_DECL_RET;
	int exist;
	const char *filename;

	filename = uri;
	if (!WT_PREFIX_SKIP(filename, "file:"))
		return (EINVAL);

	if (session->btree == NULL &&
	    (ret = __wt_session_get_btree(session, uri, NULL, cfg,
	    WT_BTREE_EXCLUSIVE | WT_BTREE_LOCK_ONLY)) != 0) {
		if (ret == WT_NOTFOUND || ret == ENOENT)
			ret = 0;
		return (ret);
	}

	/* Close all btree handles associated with this file. */
	WT_RET(__wt_conn_btree_close_all(session, uri));

	/* Remove the metadata entry (ignore missing items). */
	WT_TRET(__wt_metadata_remove(session, uri));
	if (force && ret == WT_NOTFOUND)
		ret = 0;

	/* Remove the underlying physical file. */
	exist = 0;
	WT_TRET(__wt_exist(session, filename, &exist));
	if (exist) {
		/*
		 * There is no point tracking this operation: there is no going
		 * back from here.
		 */
		WT_TRET(__wt_remove(session, filename));
	}

	return (ret);
}
示例#11
0
/*
 * __meta_btree_apply --
 *	Apply a function to all files listed in the metadata, apart from the
 *	metadata file.
 */
static inline int
__meta_btree_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor,
    int (*file_func)(WT_SESSION_IMPL *, const char *[]),
    int (*name_func)(WT_SESSION_IMPL *, const char *, bool *),
    const char *cfg[])
{
	WT_DECL_RET;
	const char *uri;
	bool skip;

	while ((ret = cursor->next(cursor)) == 0) {
		WT_RET(cursor->get_key(cursor, &uri));
		if (strcmp(uri, WT_METAFILE_URI) == 0)
			continue;

		skip = false;
		if (name_func != NULL)
			WT_RET(name_func(session, uri, &skip));

		if (file_func == NULL || skip || !WT_PREFIX_MATCH(uri, "file:"))
			continue;

		/*
		 * We need to pull the handle into the session handle cache
		 * and make sure it's referenced to stop other internal code
		 * dropping the handle (e.g in LSM when cleaning up obsolete
		 * chunks).  Holding the metadata lock isn't enough.
		 */
		if ((ret = __wt_session_get_btree(
		    session, uri, NULL, NULL, 0)) != 0)
			return (ret == EBUSY ? 0 : ret);
		WT_SAVE_DHANDLE(session, ret = file_func(session, cfg));
		WT_TRET(__wt_session_release_btree(session));
		WT_RET(ret);
	}
	WT_RET_NOTFOUND_OK(ret);

	return (0);
}
示例#12
0
/*
 * __compact_handle_append --
 *	Gather a file handle to be compacted.
 *	Called via the schema_worker function.
 */
static int
__compact_handle_append(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_DECL_RET;

	WT_UNUSED(cfg);

	WT_RET(__wt_session_get_btree(
	    session, session->dhandle->name, NULL, NULL, 0));

	/* Set compact active on the handle. */
	if ((ret = __compact_start(session)) != 0) {
		WT_TRET(__wt_session_release_btree(session));
		return (ret);
	}

	/* Make sure there is space for the next entry. */
	WT_RET(__wt_realloc_def(session, &session->op_handle_allocated,
	    session->op_handle_next + 1, &session->op_handle));

	session->op_handle[session->op_handle_next++] = session->dhandle;
	return (0);
}
示例#13
0
/*
 * __wt_lsm_checkpoint_chunk --
 *	Flush a single LSM chunk to disk.
 */
int
__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
	WT_DECL_RET;
	WT_TXN_ISOLATION saved_isolation;
	bool flush_set;

	flush_set = false;

	/*
	 * If the chunk is already checkpointed, make sure it is also evicted.
	 * Either way, there is no point trying to checkpoint it again.
	 */
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
	    !chunk->evicted) {
		WT_WITH_HANDLE_LIST_LOCK(session,
		    ret = __lsm_discard_handle(session, chunk->uri, NULL));
		if (ret == 0)
			chunk->evicted = 1;
		else if (ret == EBUSY)
			ret = 0;
		else
			WT_RET_MSG(session, ret, "discard handle");
	}
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
		WT_RET(__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s already on disk",
		    chunk->uri));
		return (0);
	}

	/* Stop if a running transaction needs the chunk. */
	__wt_txn_update_oldest(session, true);
	if (chunk->switch_txn == WT_TXN_NONE ||
	    !__wt_txn_visible_all(session, chunk->switch_txn)) {
		WT_RET(__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s: running transaction, return",
		    chunk->uri));
		return (0);
	}

	if (!__wt_atomic_cas8(&chunk->flushing, 0, 1))
		return (0);
	flush_set = true;

	WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s",
	    chunk->uri));

	/*
	 * Flush the file before checkpointing: this is the expensive part in
	 * terms of I/O.
	 *
	 * !!!
	 * We can wait here for checkpoints and fsyncs to complete, which can
	 * take a long time.
	 */
	if ((ret = __wt_session_get_btree(
	    session, chunk->uri, NULL, NULL, 0)) == 0) {
		/*
		 * Set read-uncommitted: we have already checked that all of the
		 * updates in this chunk are globally visible, use the cheapest
		 * possible check in reconciliation.
		 */
		saved_isolation = session->txn.isolation;
		session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
		ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
		session->txn.isolation = saved_isolation;
		WT_TRET(__wt_session_release_btree(session));
	}
	WT_ERR(ret);

	WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s",
	    chunk->uri));

	/*
	 * Turn on metadata tracking to ensure the checkpoint gets the
	 * necessary handle locks.
	 *
	 * Ensure that we don't race with a running checkpoint: the checkpoint
	 * lock protects against us racing with an application checkpoint in
	 * this chunk.  Don't wait for it, though: checkpoints can take a long
	 * time, and our checkpoint operation should be very quick.
	 */
	WT_ERR(__wt_meta_track_on(session));
	WT_WITH_CHECKPOINT_LOCK(session, ret,
	    WT_WITH_SCHEMA_LOCK(session, ret,
		ret = __wt_schema_worker(
		session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)));
	WT_TRET(__wt_meta_track_off(session, false, ret != 0));
	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM checkpoint");

	/* Now the file is written, get the chunk size. */
	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));

	/* Update the flush timestamp to help track ongoing progress. */
	WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts));
	++lsm_tree->chunks_flushed;

	/* Lock the tree, mark the chunk as on disk and update the metadata. */
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;

	/* Update the throttle time. */
	__wt_lsm_tree_throttle(session, lsm_tree, true);
	WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM metadata write");

	WT_PUBLISH(chunk->flushing, 0);
	flush_set = false;

	/*
	 * Clear the no-eviction flag so the primary can be evicted and
	 * eventually closed.  Only do this once the checkpoint has succeeded:
	 * otherwise, accessing the leaf page during the checkpoint can trigger
	 * forced eviction.
	 */
	WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
	__wt_btree_evictable(session, true);
	WT_ERR(__wt_session_release_btree(session));

	/* Make sure we aren't pinning a transaction ID. */
	__wt_txn_release_snapshot(session);

	WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s",
	    chunk->uri));

	/* Schedule a bloom filter create for our newly flushed chunk. */
	if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
	else
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_MERGE, 0, lsm_tree));

err:	if (flush_set)
		WT_PUBLISH(chunk->flushing, 0);

	return (ret);
}
示例#14
0
/*
 * __create_file --
 *	Create a new 'file:' object.
 */
static int
__create_file(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_DECL_ITEM(val);
	WT_DECL_RET;
	uint32_t allocsize;
	int is_metadata;
	const char *filename, **p, *filecfg[] =
	    { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL };
	char *fileconf;

	fileconf = NULL;

	is_metadata = strcmp(uri, WT_METAFILE_URI) == 0;

	filename = uri;
	if (!WT_PREFIX_SKIP(filename, "file:"))
		WT_RET_MSG(session, EINVAL, "Expected a 'file:' URI: %s", uri);

	/* Check if the file already exists. */
	if (!is_metadata && (ret =
	    __wt_metadata_search(session, uri, &fileconf)) != WT_NOTFOUND) {
		if (exclusive)
			WT_TRET(EEXIST);
		goto err;
	}

	/* Sanity check the allocation size. */
	WT_RET(__wt_direct_io_size_check(
	    session, filecfg, "allocation_size", &allocsize));

	/* Create the file. */
	WT_ERR(__wt_block_manager_create(session, filename, allocsize));
	if (WT_META_TRACKING(session))
		WT_ERR(__wt_meta_track_fileop(session, NULL, uri));

	/*
	 * If creating an ordinary file, append the file ID and current version
	 * numbers to the passed-in configuration and insert the resulting
	 * configuration into the metadata.
	 */
	if (!is_metadata) {
		WT_ERR(__wt_scr_alloc(session, 0, &val));
		WT_ERR(__wt_buf_fmt(session, val,
		    "id=%" PRIu32 ",version=(major=%d,minor=%d)",
		    ++S2C(session)->next_file_id,
		    WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
		for (p = filecfg; *p != NULL; ++p)
			;
		*p = val->data;
		WT_ERR(__wt_config_collapse(session, filecfg, &fileconf));
		WT_ERR(__wt_metadata_insert(session, uri, fileconf));
	}

	/*
	 * Open the file to check that it was setup correctly. We don't need to
	 * pass the configuration, we just wrote the collapsed configuration
	 * into the metadata file, and it's going to be read/used by underlying
	 * functions.
	 *
	 * Keep the handle exclusive until it is released at the end of the
	 * call, otherwise we could race with a drop.
	 */
	WT_ERR(__wt_session_get_btree(
	    session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
	if (WT_META_TRACKING(session))
		WT_ERR(__wt_meta_track_handle_lock(session, 1));
	else
		WT_ERR(__wt_session_release_btree(session));

err:	__wt_scr_free(session, &val);
	__wt_free(session, fileconf);
	return (ret);
}
示例#15
0
/*
 * __wt_lsm_checkpoint_worker --
 *	A worker thread for an LSM tree, responsible for flushing new chunks to
 *	disk.
 */
void *
__wt_lsm_checkpoint_worker(void *arg)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_LSM_WORKER_COOKIE cookie;
	WT_SESSION_IMPL *session;
	WT_TXN_ISOLATION saved_isolation;
	u_int i, j;
	int locked;

	lsm_tree = arg;
	session = lsm_tree->ckpt_session;

	WT_CLEAR(cookie);

	while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
		if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
			WT_WITH_SCHEMA_LOCK(session, ret =
			    __wt_lsm_tree_switch(session, lsm_tree));
			WT_ERR(ret);
		}

		WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));

		/* Write checkpoints in all completed files. */
		for (i = 0, j = 0; i < cookie.nchunks - 1; i++) {
			if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING))
				goto err;

			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				break;

			chunk = cookie.chunk_array[i];

			/* Stop if a running transaction needs the chunk. */
			__wt_txn_update_oldest(session);
			if (!__wt_txn_visible_all(session, chunk->txnid_max))
				break;

			/*
			 * If the chunk is already checkpointed, make sure it
			 * is also evicted.  Either way, there is no point
			 * trying to checkpoint it again.
			 */
			if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK)) {
				if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_EVICTED))
					continue;

				if ((ret = __lsm_discard_handle(
				    session, chunk->uri, NULL)) == 0)
					F_SET_ATOMIC(
					    chunk, WT_LSM_CHUNK_EVICTED);
				else if (ret == EBUSY)
					ret = 0;
				else
					WT_ERR_MSG(session, ret,
					    "discard handle");
				continue;
			}

			WT_VERBOSE_ERR(session, lsm,
			     "LSM worker flushing %u", i);

			/*
			 * Flush the file before checkpointing: this is the
			 * expensive part in terms of I/O: do it without
			 * holding the schema lock.
			 *
			 * Use the special eviction isolation level to avoid
			 * interfering with an application checkpoint: we have
			 * already checked that all of the updates in this
			 * chunk are globally visible.
			 *
			 * !!! We can wait here for checkpoints and fsyncs to
			 * complete, which can be a long time.
			 *
			 * Don't keep waiting for the lock if application
			 * threads are waiting for a switch.  Don't skip
			 * flushing the leaves either: that just means we'll
			 * hold the schema lock for (much) longer, which blocks
			 * the world.
			 */
			WT_ERR(__wt_session_get_btree(
			    session, chunk->uri, NULL, NULL, 0));
			for (locked = 0;
			    !locked && ret == 0 &&
			    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);) {
				if ((ret = __wt_spin_trylock(session,
				    &S2C(session)->checkpoint_lock)) == 0)
					locked = 1;
				else if (ret == EBUSY) {
					__wt_yield();
					ret = 0;
				}
			}
			if (locked) {
				saved_isolation = session->txn.isolation;
				session->txn.isolation = TXN_ISO_EVICTION;
				ret = __wt_bt_cache_op(
				    session, NULL, WT_SYNC_WRITE_LEAVES);
				session->txn.isolation = saved_isolation;
				__wt_spin_unlock(
				    session, &S2C(session)->checkpoint_lock);
			}
			WT_TRET(__wt_session_release_btree(session));
			WT_ERR(ret);

			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				break;

			WT_VERBOSE_ERR(session, lsm,
			     "LSM worker checkpointing %u", i);

			WT_WITH_SCHEMA_LOCK(session,
			    ret = __wt_schema_worker(session, chunk->uri,
			    __wt_checkpoint, NULL, NULL, 0));

			if (ret != 0) {
				__wt_err(session, ret, "LSM checkpoint");
				break;
			}

			WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
			/*
			 * Clear the "cache resident" flag so the primary can
			 * be evicted and eventually closed.  Only do this once
			 * the checkpoint has succeeded: otherwise, accessing
			 * the leaf page during the checkpoint can trigger
			 * forced eviction.
			 */
			WT_ERR(__wt_session_get_btree(
			    session, chunk->uri, NULL, NULL, 0));
			__wt_btree_evictable(session, 1);
			WT_ERR(__wt_session_release_btree(session));

			++j;
			WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
			F_SET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK);
			ret = __wt_lsm_meta_write(session, lsm_tree);
			++lsm_tree->dsk_gen;

			/* Update the throttle time. */
			__wt_lsm_tree_throttle(session, lsm_tree);
			WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

			/* Make sure we aren't pinning a transaction ID. */
			__wt_txn_release_snapshot(session);

			if (ret != 0) {
				__wt_err(session, ret,
				    "LSM checkpoint metadata write");
				break;
			}

			WT_VERBOSE_ERR(session, lsm,
			     "LSM worker checkpointed %u", i);
		}
		__lsm_unpin_chunks(session, &cookie);
		if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
		    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
			WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
			    session, lsm_tree->work_cond, 100000));
	}
err:	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);
	/*
	 * The thread will only exit with failure if we run out of memory or
	 * there is some other system driven failure. We can't keep going
	 * after such a failure - ensure WiredTiger shuts down.
	 */
	if (ret != 0 && ret != WT_NOTFOUND)
		WT_PANIC_ERR(session, ret,
		    "Shutting down LSM checkpoint utility thread");
	return (NULL);
}
示例#16
0
/*
 * __wt_lsm_checkpoint_chunk --
 *	Flush a single LSM chunk to disk.
 */
int
__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
	WT_DECL_RET;
	WT_TXN_ISOLATION saved_isolation;

	/*
	 * If the chunk is already checkpointed, make sure it is also evicted.
	 * Either way, there is no point trying to checkpoint it again.
	 */
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
	    !chunk->evicted) {
		if ((ret = __lsm_discard_handle(
		    session, chunk->uri, NULL)) == 0)
			chunk->evicted = 1;
		else if (ret == EBUSY)
			ret = 0;
		else
			WT_RET_MSG(session, ret, "discard handle");
	}
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
		WT_RET(__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s already on disk",
		    chunk->uri));
		return (0);
	}

	/* Stop if a running transaction needs the chunk. */
	__wt_txn_update_oldest(session);
	if (chunk->switch_txn == WT_TXN_NONE ||
	    !__wt_txn_visible_all(session, chunk->switch_txn)) {
		WT_RET(__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s: running transaction, return",
		    chunk->uri));
		return (0);
	}

	WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s",
	    chunk->uri));

	/*
	 * Flush the file before checkpointing: this is the expensive part in
	 * terms of I/O.
	 *
	 * Use the special eviction isolation level to avoid interfering with
	 * an application checkpoint: we have already checked that all of the
	 * updates in this chunk are globally visible.
	 *
	 * !!! We can wait here for checkpoints and fsyncs to complete, which
	 * can be a long time.
	 */
	if ((ret = __wt_session_get_btree(
	    session, chunk->uri, NULL, NULL, 0)) == 0) {
		saved_isolation = session->txn.isolation;
		session->txn.isolation = TXN_ISO_EVICTION;
		ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
		session->txn.isolation = saved_isolation;
		WT_TRET(__wt_session_release_btree(session));
	}
	WT_RET(ret);

	WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s",
	    chunk->uri));

	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_schema_worker(session, chunk->uri,
	    __wt_checkpoint, NULL, NULL, 0));

	if (ret != 0)
		WT_RET_MSG(session, ret, "LSM checkpoint");

	/* Now the file is written, get the chunk size. */
	WT_RET(__wt_lsm_tree_set_chunk_size(session, chunk));

	/* Update the flush timestamp to help track ongoing progress. */
	WT_RET(__wt_epoch(session, &lsm_tree->last_flush_ts));

	/* Lock the tree, mark the chunk as on disk and update the metadata. */
	WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;

	/* Update the throttle time. */
	__wt_lsm_tree_throttle(session, lsm_tree, 1);
	WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));

	if (ret != 0)
		WT_RET_MSG(session, ret, "LSM metadata write");

	/*
	 * Clear the no-eviction flag so the primary can be evicted and
	 * eventually closed.  Only do this once the checkpoint has succeeded:
	 * otherwise, accessing the leaf page during the checkpoint can trigger
	 * forced eviction.
	 */
	WT_RET(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
	__wt_btree_evictable(session, 1);
	WT_RET(__wt_session_release_btree(session));

	/* Make sure we aren't pinning a transaction ID. */
	__wt_txn_release_snapshot(session);

	WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s",
	    chunk->uri));

	/* Schedule a bloom filter create for our newly flushed chunk. */
	if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
		WT_RET(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
	else
		WT_RET(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_MERGE, 0, lsm_tree));
	return (0);
}