Example #1
File: cur_backup.c Project: DINKIN/mongo
/*
 * __backup_stop --
 *	Stop a backup.
 */
static int
__backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	int i;

	conn = S2C(session);

	/* Release all btree names held by the backup. */
	__wt_writelock(session, &conn->hot_backup_lock);
	conn->hot_backup_list = NULL;
	__wt_writeunlock(session, &conn->hot_backup_lock);
	if (cb->list != NULL) {
		for (i = 0; cb->list[i] != NULL; ++i)
			__wt_free(session, cb->list[i]);
		__wt_free(session, cb->list);
	}

	/* Remove any backup specific file. */
	WT_TRET(__wt_backup_file_remove(session));

	/* Checkpoint deletion can proceed, as can the next hot backup. */
	__wt_writelock(session, &conn->hot_backup_lock);
	conn->hot_backup = false;
	__wt_writeunlock(session, &conn->hot_backup_lock);

	return (ret);
}
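The idiom in Example #1 -- unpublish the shared list under the connection's write lock, then free the memory outside the critical section -- can be sketched standalone. The sketch below uses a POSIX rwlock and hypothetical names in place of WiredTiger's internal __wt_writelock/__wt_writeunlock API; it illustrates the pattern and is not code from the repository.

#include <pthread.h>
#include <stdlib.h>

static pthread_rwlock_t backup_lock = PTHREAD_RWLOCK_INITIALIZER;
static char **backup_list;

/*
 * backup_stop_sketch --
 *	Unpublish the shared list under the write lock, then free it outside
 *	the critical section (hypothetical stand-in for __backup_stop).
 */
static void
backup_stop_sketch(char **list)
{
	size_t i;

	pthread_rwlock_wrlock(&backup_lock);
	backup_list = NULL;		/* readers can no longer see the list */
	pthread_rwlock_unlock(&backup_lock);

	if (list != NULL) {
		for (i = 0; list[i] != NULL; ++i)
			free(list[i]);
		free(list);
	}
}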
Example #2
/*
 * __wt_log_truncate_files --
 *	Truncate log files via archive once. Requires that the server is not
 *	currently running.
 */
int
__wt_log_truncate_files(
    WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[])
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	uint32_t backup_file, locked;

	WT_UNUSED(cfg);
	conn = S2C(session);
	log = conn->log;
	if (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE))
		WT_RET_MSG(session, EINVAL,
		    "Attempt to archive manually while a server is running");

	backup_file = 0;
	if (cursor != NULL)
		backup_file = WT_CURSOR_BACKUP_ID(cursor);
	WT_ASSERT(session, backup_file <= log->alloc_lsn.file);
	WT_RET(__wt_verbose(session, WT_VERB_LOG,
	    "log_truncate_files: Archive once up to %" PRIu32,
	    backup_file));
	WT_RET(__wt_writelock(session, log->log_archive_lock));
	locked = 1;
	WT_ERR(__log_archive_once(session, backup_file));
	WT_ERR(__wt_writeunlock(session, log->log_archive_lock));
	locked = 0;
err:
	if (locked)
		WT_RET(__wt_writeunlock(session, log->log_archive_lock));
	return (ret);
}
Example #3
File: conn_log.c Project: mpobrien/mongo
/*
 * __wt_log_truncate_files --
 *	Truncate log files via archive once. Requires that the server is not
 *	currently running.
 */
int
__wt_log_truncate_files(WT_SESSION_IMPL *session, WT_CURSOR *cursor, bool force)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	uint32_t backup_file;

	conn = S2C(session);
	if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
		return (0);
	if (!force && F_ISSET(conn, WT_CONN_SERVER_LOG) &&
	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE))
		WT_RET_MSG(session, EINVAL,
		    "Attempt to archive manually while a server is running");

	log = conn->log;

	backup_file = 0;
	if (cursor != NULL) {
		WT_ASSERT(session, force == false);
		backup_file = WT_CURSOR_BACKUP_ID(cursor);
	}
	WT_ASSERT(session, backup_file <= log->alloc_lsn.l.file);
	__wt_verbose(session, WT_VERB_LOG,
	    "log_truncate_files: Archive once up to %" PRIu32, backup_file);

	__wt_writelock(session, &log->log_archive_lock);
	ret = __log_archive_once(session, backup_file);
	__wt_writeunlock(session, &log->log_archive_lock);
	return (ret);
}
Example #4
/*
 * __wt_lsm_tree_writelock --
 *	Acquire an exclusive lock on an LSM tree.
 */
int
__wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_RET(__wt_writelock(session, lsm_tree->rwlock));

	/*
	 * Diagnostic: avoid deadlocks with the schema lock: if we need it for
	 * an operation, we should already have it.
	 */
	F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
	return (0);
}
Example #5
/*
 * __wt_lsm_tree_lock --
 *	Lock an LSM tree for reading or writing.
 */
int
__wt_lsm_tree_lock(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int exclusive)
{
	/*
	 * Diagnostic: avoid deadlocks with the schema lock: if we need it for
	 * an operation, we should already have it.
	 */
	F_SET(session, WT_SESSION_NO_SCHEMA_LOCK);

	if (exclusive)
		return (__wt_writelock(session, lsm_tree->rwlock));
	else
		return (__wt_readlock(session, lsm_tree->rwlock));
}
Example #6
File: bt_ovfl.c Project: qihsh/mongo
/*
 * __wt_ovfl_discard --
 *	Discard an on-page overflow value, and reset the page's cell.
 */
int
__wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_DECL_RET;

	btree = S2BT(session);
	bm = btree->bm;
	unpack = &_unpack;

	__wt_cell_unpack(cell, unpack);

	/*
	 * Finally remove overflow key/value objects, called when reconciliation
	 * finishes after successfully writing a page.
	 *
	 * Keys must have already been instantiated and value objects must have
	 * already been cached (if they might potentially still be read by any
	 * running transaction).
	 *
	 * Acquire the overflow lock to avoid racing with a thread reading the
	 * backing overflow blocks.
	 */
	WT_RET(__wt_writelock(session, btree->ovfl_lock));

	switch (unpack->raw) {
	case WT_CELL_KEY_OVFL:
		__wt_cell_type_reset(session,
		    unpack->cell, WT_CELL_KEY_OVFL, WT_CELL_KEY_OVFL_RM);
		break;
	case WT_CELL_VALUE_OVFL:
		__wt_cell_type_reset(session,
		    unpack->cell, WT_CELL_VALUE_OVFL, WT_CELL_VALUE_OVFL_RM);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	WT_TRET(__wt_writeunlock(session, btree->ovfl_lock));

	/* Free the backing disk blocks. */
	WT_TRET(bm->free(bm, session, unpack->data, unpack->size));

	return (ret);
}
Example #7
/*
 * __wt_thread_group_resize --
 *	Resize an array of utility threads taking the lock.
 */
int
__wt_thread_group_resize(
    WT_SESSION_IMPL *session, WT_THREAD_GROUP *group,
    uint32_t new_min, uint32_t new_max, uint32_t flags)
{
	WT_DECL_RET;

	__wt_verbose(session, WT_VERB_THREAD_GROUP,
	    "Resize thread group: %p, from min: %" PRIu32 " -> %" PRIu32
	    " from max: %" PRIu32 " -> %" PRIu32,
	    (void *)group, group->min, new_min, group->max, new_max);

	__wt_writelock(session, group->lock);
	WT_TRET(__thread_group_resize(session, group, new_min, new_max, flags));
	__wt_writeunlock(session, group->lock);
	return (ret);
}
Example #8
/*
 * __backup_stop --
 *	Stop a backup.
 */
static int
__backup_stop(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	conn = S2C(session);

	/* Remove any backup specific file. */
	ret = __wt_backup_file_remove(session);

	/* Checkpoint deletion can proceed, as can the next hot backup. */
	WT_TRET(__wt_writelock(session, conn->hot_backup_lock));
	conn->hot_backup = false;
	WT_TRET(__wt_writeunlock(session, conn->hot_backup_lock));

	return (ret);
}
Example #9
/*
 * __wt_thread_group_start_one --
 *	Start a new thread if possible.
 */
int
__wt_thread_group_start_one(
    WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait)
{
	WT_DECL_RET;

	if (group->current_threads >= group->max)
		return (0);

	if (wait)
		__wt_writelock(session, group->lock);
	else if (__wt_try_writelock(session, group->lock) != 0)
		return (0);

	/* Recheck the bounds now that we hold the lock */
	if (group->current_threads < group->max)
		WT_TRET(__thread_group_grow(
		    session, group, group->current_threads + 1));
	__wt_writeunlock(session, group->lock);

	return (ret);
}
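Example #9 combines an optimistic bounds check, a blocking or non-blocking lock acquisition depending on the caller, and a re-check once the lock is held. A minimal standalone sketch of that shape, using a POSIX rwlock and hypothetical names rather than the WiredTiger API:

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t group_lock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned int current_threads, max_threads = 8;

/*
 * group_start_one_sketch --
 *	Optimistic check, blocking or non-blocking lock, re-check under the
 *	lock (hypothetical stand-in for __wt_thread_group_start_one).
 */
static int
group_start_one_sketch(bool wait)
{
	if (current_threads >= max_threads)	/* cheap check, no lock held */
		return (0);

	if (wait)
		pthread_rwlock_wrlock(&group_lock);
	else if (pthread_rwlock_trywrlock(&group_lock) != 0)
		return (0);			/* lock busy, do nothing */

	/* Re-check the bounds now that the lock is held. */
	if (current_threads < max_threads)
		++current_threads;		/* stand-in for __thread_group_grow */

	pthread_rwlock_unlock(&group_lock);
	return (0);
}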
Example #10
File: rec_track.c Project: Andiry/mongo
/*
 * __wt_ovfl_track_wrapup_err --
 *	Resolve the page's overflow tracking on reconciliation error.
 */
int
__wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_DECL_RET;
	WT_OVFL_TRACK *track;

	if (page->modify == NULL || page->modify->ovfl_track == NULL)
		return (0);

	track = page->modify->ovfl_track;
	if (track->discard != NULL)
		WT_RET(__ovfl_discard_wrapup_err(session, page));

	if (track->ovfl_reuse[0] != NULL)
		WT_RET(__ovfl_reuse_wrapup_err(session, page));

	if (track->ovfl_txnc[0] != NULL) {
		WT_RET(__wt_writelock(session, S2BT(session)->ovfl_lock));
		ret = __ovfl_txnc_wrapup(session, page);
		WT_TRET(__wt_writeunlock(session, S2BT(session)->ovfl_lock));
	}
	return (0);
}
Example #11
/*
 * __wt_thread_group_create --
 *	Create a new thread group, assumes incoming group structure is
 *	zero initialized.
 */
int
__wt_thread_group_create(
    WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, const char *name,
    uint32_t min, uint32_t max, uint32_t flags,
    int (*run_func)(WT_SESSION_IMPL *session, WT_THREAD *context))
{
	WT_DECL_RET;
	bool cond_alloced;

	/* Check that the structure is initialized as expected */
	WT_ASSERT(session, group->alloc == 0);

	cond_alloced = false;

	__wt_verbose(session, WT_VERB_THREAD_GROUP,
	    "Creating thread group: %p", (void *)group);

	WT_RET(__wt_rwlock_alloc(session, &group->lock, "Thread group"));
	WT_ERR(__wt_cond_alloc(
	    session, "Thread group cond", false, &group->wait_cond));
	cond_alloced = true;

	__wt_writelock(session, group->lock);
	group->run_func = run_func;
	group->name = name;

	WT_TRET(__thread_group_resize(session, group, min, max, flags));
	__wt_writeunlock(session, group->lock);

	/* Cleanup on error to avoid leaking resources */
err:	if (ret != 0) {
		if (cond_alloced)
			WT_TRET(__wt_cond_destroy(session, &group->wait_cond));
		__wt_rwlock_destroy(session, &group->lock);
	}
	return (ret);
}
Example #12
/*
 * __snapshot_worker --
 *	Snapshot the tree.
 */
static int
__snapshot_worker(
    WT_SESSION_IMPL *session, const char *name, int discard, snapshot_op op)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_SNAPSHOT *deleted, *snap, *snapbase;
	int force, matched, tracked;

	btree = session->btree;
	matched = tracked = 0;
	snap = snapbase = NULL;

	/* Snapshots are single-threaded. */
	__wt_writelock(session, btree->snaplock);

	/* Set the name to the default, if we aren't provided one. */
	if (op == SNAPSHOT && name == NULL) {
		force = 0;
		name = WT_INTERNAL_SNAPSHOT;
	} else
		force = 1;

	/*
	 * Get the list of snapshots for this file.  If there's no reference,
	 * this file is dead.  Discard it from the cache without bothering to
	 * write any dirty pages.
	 */
	if ((ret =
	    __wt_meta_snaplist_get(session, btree->name, &snapbase)) != 0) {
		if (ret == WT_NOTFOUND)
			ret = __wt_bt_cache_flush(
			    session, NULL, WT_SYNC_DISCARD_NOWRITE, 0);
		goto err;
	}

	switch (op) {
	case SNAPSHOT:
		/*
		 * Create a new, possibly named, snapshot.  Review existing
		 * snapshots, deleting default snapshots and snapshots with
		 * matching names, add the new snapshot entry at the end of
		 * the list.
		 */
		WT_SNAPSHOT_FOREACH(snapbase, snap)
			if (strcmp(snap->name, name) == 0 ||
			    strcmp(snap->name, WT_INTERNAL_SNAPSHOT) == 0)
				F_SET(snap, WT_SNAP_DELETE);

		WT_ERR(__wt_strdup(session, name, &snap->name));
		F_SET(snap, WT_SNAP_ADD);
		break;
	case SNAPSHOT_DROP:
		/*
		 * Drop all snapshots with matching names.
		 * Drop all snapshots with the default name.
		 * Add a new snapshot with the default name.
		 */
		WT_SNAPSHOT_FOREACH(snapbase, snap) {
			/*
			 * There should be only one snapshot with a matching
			 * name, but it doesn't hurt to check the rest.
			 */
			if (strcmp(snap->name, name) == 0)
				matched = 1;
			else if (strcmp(snap->name, WT_INTERNAL_SNAPSHOT) != 0)
				continue;
			F_SET(snap, WT_SNAP_DELETE);
		}
		if (!matched)
			goto nomatch;

		WT_ERR(__wt_strdup(session, WT_INTERNAL_SNAPSHOT, &snap->name));
		F_SET(snap, WT_SNAP_ADD);
		break;
	case SNAPSHOT_DROP_ALL:
		/*
		 * Drop all snapshots.
		 * Add a new snapshot with the default name.
		 */
		WT_SNAPSHOT_FOREACH(snapbase, snap)
			F_SET(snap, WT_SNAP_DELETE);

		WT_ERR(__wt_strdup(session, WT_INTERNAL_SNAPSHOT, &snap->name));
		F_SET(snap, WT_SNAP_ADD);
		break;
	case SNAPSHOT_DROP_FROM:
		/*
		 * Drop all snapshots after, and including, the named snapshot.
		 * Drop all snapshots with the default name.
		 * Add a new snapshot with the default name.
		 */
		WT_SNAPSHOT_FOREACH(snapbase, snap) {
			if (strcmp(snap->name, name) == 0)
				matched = 1;
			if (matched ||
			    strcmp(snap->name, WT_INTERNAL_SNAPSHOT) == 0)
				F_SET(snap, WT_SNAP_DELETE);
		}
		if (!matched)
			goto nomatch;

		WT_ERR(__wt_strdup(session, WT_INTERNAL_SNAPSHOT, &snap->name));
		F_SET(snap, WT_SNAP_ADD);
		break;
	case SNAPSHOT_DROP_TO:
		/*
		 * Drop all snapshots before, and including, the named snapshot.
		 * Drop all snapshots with the default name.
		 * Add a new snapshot with the default name.
		 */
		WT_SNAPSHOT_FOREACH(snapbase, snap) {
			if (!matched ||
			    strcmp(snap->name, WT_INTERNAL_SNAPSHOT) == 0)
				F_SET(snap, WT_SNAP_DELETE);
			if (strcmp(snap->name, name) == 0)
				matched = 1;
		}
		if (!matched)
nomatch:		WT_ERR_MSG(session,
			    EINVAL, "no snapshot named %s was found", name);

		WT_ERR(__wt_strdup(session, WT_INTERNAL_SNAPSHOT, &snap->name));
		F_SET(snap, WT_SNAP_ADD);
		break;
	}

	/*
	 * Lock the snapshots that will be deleted.
	 *
	 * Snapshots are only locked when tracking is enabled, which covers
	 * sync and drop operations, but not close.  The reasoning is that
	 * there should be no access to a snapshot during close, because any
	 * thread accessing a snapshot will also have the current file handle
	 * open.
	 */
	if (WT_META_TRACKING(session))
		WT_SNAPSHOT_FOREACH(snapbase, deleted)
			if (F_ISSET(deleted, WT_SNAP_DELETE))
				WT_ERR(__wt_session_lock_snapshot(session,
				    deleted->name, WT_BTREE_EXCLUSIVE));

	WT_ERR(__wt_bt_cache_flush(
	    session, snapbase, discard ? WT_SYNC_DISCARD : WT_SYNC, force));

	/* If there was a snapshot, update the metadata. */
	if (snap->raw.data == NULL) {
		if (force)
			WT_ERR_MSG(session,
			    EINVAL, "cache flush failed to create a snapshot");
	} else {
		WT_ERR(__wt_meta_snaplist_set(session, btree->name, snapbase));
		/*
		 * If tracking is enabled, defer making pages available until
		 * the end of the transaction.  The exception is if the handle
		 * is being discarded: in that case, it will be gone by the
		 * time we try to apply or unroll the meta tracking event.
		 */
		if (WT_META_TRACKING(session) && !discard) {
			WT_ERR(__wt_meta_track_checkpoint(session));
			tracked = 1;
		} else
			WT_ERR(__wt_bm_snapshot_resolve(session, snapbase));
	}

err:	__wt_meta_snaplist_free(session, snapbase);
	if (!tracked)
		__wt_rwunlock(session, btree->snaplock);

	return (ret);
}
Example #13
File: txn.c Project: judahschvimer/mongo
/*
 * __wt_txn_update_oldest --
 *	Sweep the running transactions to update the oldest ID required.
 */
int
__wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION_IMPL *oldest_session;
	WT_TXN_GLOBAL *txn_global;
	uint64_t current_id, last_running, oldest_id;
	uint64_t prev_last_running, prev_oldest_id;
	bool strict, wait;

	conn = S2C(session);
	txn_global = &conn->txn_global;
	strict = LF_ISSET(WT_TXN_OLDEST_STRICT);
	wait = LF_ISSET(WT_TXN_OLDEST_WAIT);

	current_id = last_running = txn_global->current;
	prev_last_running = txn_global->last_running;
	prev_oldest_id = txn_global->oldest_id;

	/*
	 * For pure read-only workloads, or if the update isn't forced and the
	 * oldest ID isn't too far behind, avoid scanning.
	 */
	if (prev_oldest_id == current_id ||
	    (!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
		return (0);

	/* First do a read-only scan. */
	if (wait)
		__wt_readlock(session, txn_global->scan_rwlock);
	else if ((ret =
	    __wt_try_readlock(session, txn_global->scan_rwlock)) != 0)
		return (ret == EBUSY ? 0 : ret);
	__txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session);
	__wt_readunlock(session, txn_global->scan_rwlock);

	/*
	 * If the state hasn't changed (or hasn't moved far enough for
	 * non-forced updates), give up.
	 */
	if ((oldest_id == prev_oldest_id ||
	    (!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) &&
	    ((last_running == prev_last_running) ||
	    (!strict && WT_TXNID_LT(last_running, prev_last_running + 100))))
		return (0);

	/* It looks like an update is necessary, wait for exclusive access. */
	if (wait)
		__wt_writelock(session, txn_global->scan_rwlock);
	else if ((ret =
	    __wt_try_writelock(session, txn_global->scan_rwlock)) != 0)
		return (ret == EBUSY ? 0 : ret);

	/*
	 * If the oldest ID has been updated while we waited, don't bother
	 * scanning.
	 */
	if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) &&
	    WT_TXNID_LE(last_running, txn_global->last_running))
		goto done;

	/*
	 * Re-scan now that we have exclusive access.  This is necessary because
	 * threads get transaction snapshots with read locks, and we have to be
	 * sure that there isn't a thread that has got a snapshot locally but
	 * not yet published its snap_min.
	 */
	__txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session);

#ifdef HAVE_DIAGNOSTIC
	{
	/*
	 * Make sure the ID doesn't move past any named snapshots.
	 *
	 * Don't include the read/assignment in the assert statement.  Coverity
	 * complains if there are assignments only done in diagnostic builds,
	 * and when the read is from a volatile.
	 */
	uint64_t id = txn_global->nsnap_oldest_id;
	WT_ASSERT(session,
	    id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
	}
#endif
	/* Update the oldest ID. */
	if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
		txn_global->oldest_id = oldest_id;
	if (WT_TXNID_LT(txn_global->last_running, last_running)) {
		txn_global->last_running = last_running;

#ifdef HAVE_VERBOSE
		/* Output a verbose message about long-running transactions,
		 * but only when some progress is being made. */
		if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
		    current_id - oldest_id > 10000 && oldest_session != NULL) {
			__wt_verbose(session, WT_VERB_TRANSACTION,
			    "old snapshot %" PRIu64
			    " pinned in session %" PRIu32 " [%s]"
			    " with snap_min %" PRIu64 "\n",
			    oldest_id, oldest_session->id,
			    oldest_session->lastop,
			    oldest_session->txn.snap_min);
		}
#endif
	}

done:	__wt_writeunlock(session, txn_global->scan_rwlock);
	return (ret);
}
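Example #13 scans under the read lock and only escalates to the write lock, re-scanning once exclusive, when an update appears necessary. The check/lock/re-check idiom can be sketched standalone with a POSIX rwlock and a plain counter in place of the transaction state; all names below are hypothetical.

#include <pthread.h>
#include <stdint.h>

static pthread_rwlock_t scan_lock = PTHREAD_RWLOCK_INITIALIZER;
static uint64_t oldest_id, current_id;

/* Hypothetical stand-in for __txn_oldest_scan: a cheap read-only scan. */
static uint64_t
oldest_scan_sketch(void)
{
	return (current_id);
}

/*
 * update_oldest_sketch --
 *	Scan under the read lock, escalate to the write lock and re-scan only
 *	when an update looks necessary (the shape of __wt_txn_update_oldest).
 */
static void
update_oldest_sketch(void)
{
	uint64_t id;

	/* Read-only pass: many threads may scan concurrently. */
	pthread_rwlock_rdlock(&scan_lock);
	id = oldest_scan_sketch();
	pthread_rwlock_unlock(&scan_lock);

	if (id == oldest_id)			/* nothing moved, give up */
		return;

	/* Exclusive pass: re-scan, another thread may have advanced it. */
	pthread_rwlock_wrlock(&scan_lock);
	id = oldest_scan_sketch();
	if (id > oldest_id)
		oldest_id = id;
	pthread_rwlock_unlock(&scan_lock);
}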
Example #14
File: txn_log.c Project: DINKIN/mongo
/*
 * __wt_txn_checkpoint_log --
 *	Write a log record for a checkpoint operation.
 */
int
__wt_txn_checkpoint_log(
    WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(logrec);
	WT_DECL_RET;
	WT_ITEM *ckpt_snapshot, empty;
	WT_LSN *ckpt_lsn;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	uint8_t *end, *p;
	size_t recsize;
	uint32_t i, rectype;
	const char *fmt;

	conn = S2C(session);
	txn_global = &conn->txn_global;
	txn = &session->txn;
	ckpt_lsn = &txn->ckpt_lsn;

	/*
	 * If this is a file sync, log it unless there is a full checkpoint in
	 * progress.
	 */
	if (!full) {
		if (txn->full_ckpt) {
			if (lsnp != NULL)
				*lsnp = *ckpt_lsn;
			return (0);
		}
		return (__txn_log_file_sync(session, flags, lsnp));
	}

	switch (flags) {
	case WT_TXN_LOG_CKPT_PREPARE:
		txn->full_ckpt = true;

		if (conn->compat_major >= WT_LOG_V2) {
			/*
			 * Write the system log record containing a checkpoint
			 * start operation.
			 */
			rectype = WT_LOGREC_SYSTEM;
			fmt = WT_UNCHECKED_STRING(I);
			WT_ERR(__wt_struct_size(
			    session, &recsize, fmt, rectype));
			WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));

			WT_ERR(__wt_struct_pack(session,
			    (uint8_t *)logrec->data + logrec->size, recsize,
			    fmt, rectype));
			logrec->size += (uint32_t)recsize;
			WT_ERR(__wt_logop_checkpoint_start_pack(
			    session, logrec));
			WT_ERR(__wt_log_write(session, logrec, ckpt_lsn, 0));
		} else {
			WT_ERR(__wt_log_printf(session,
			    "CHECKPOINT: Starting record"));
			WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
		}

		/*
		 * We take and immediately release the visibility lock.
		 * Acquiring the write lock guarantees that any transaction
		 * that has written to the log has also made its transaction
		 * visible at this time.
		 */
		__wt_writelock(session, &txn_global->visibility_rwlock);
		__wt_writeunlock(session, &txn_global->visibility_rwlock);

		/*
		 * We need to make sure that the log records in the checkpoint
		 * LSN are on disk.  In particular to make sure that the
		 * current log file exists.
		 */
		WT_ERR(__wt_log_force_sync(session, ckpt_lsn));
		break;
	case WT_TXN_LOG_CKPT_START:
		/* Take a copy of the transaction snapshot. */
		txn->ckpt_nsnapshot = txn->snapshot_count;
		recsize = (size_t)txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
		WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
		p = txn->ckpt_snapshot->mem;
		end = p + recsize;
		for (i = 0; i < txn->snapshot_count; i++)
			WT_ERR(__wt_vpack_uint(
			    &p, WT_PTRDIFF(end, p), txn->snapshot[i]));
		break;
	case WT_TXN_LOG_CKPT_STOP:
		/*
		 * During a clean connection close, we get here without the
		 * prepare or start steps.  In that case, log the current LSN
		 * as the checkpoint LSN.
		 */
		if (!txn->full_ckpt) {
			txn->ckpt_nsnapshot = 0;
			WT_CLEAR(empty);
			ckpt_snapshot = &empty;
			WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
		} else
			ckpt_snapshot = txn->ckpt_snapshot;

		/* Write the checkpoint log record. */
		rectype = WT_LOGREC_CHECKPOINT;
		fmt = WT_UNCHECKED_STRING(IIIIu);
		WT_ERR(__wt_struct_size(session, &recsize,
		    fmt, rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
		    txn->ckpt_nsnapshot, ckpt_snapshot));
		WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));

		WT_ERR(__wt_struct_pack(session,
		    (uint8_t *)logrec->data + logrec->size, recsize,
		    fmt, rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
		    txn->ckpt_nsnapshot, ckpt_snapshot));
		logrec->size += (uint32_t)recsize;
		WT_ERR(__wt_log_write(session, logrec, lsnp,
		    F_ISSET(conn, WT_CONN_CKPT_SYNC) ?
		    WT_LOG_FSYNC : 0));

		/*
		 * If this full checkpoint completed successfully and there is
		 * no hot backup in progress and this is not an unclean
		 * recovery, tell the logging subsystem the checkpoint LSN so
		 * that it can archive.  Do not update the logging checkpoint
		 * LSN if this is during a clean connection close, only during
		 * a full checkpoint.  A clean close may not update any
		 * metadata LSN and we do not want to archive in that case.
		 */
		if (!conn->hot_backup &&
		    (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY) ||
		    FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) &&
		    txn->full_ckpt)
			__wt_log_ckpt(session, ckpt_lsn);

		/* FALLTHROUGH */
	case WT_TXN_LOG_CKPT_CLEANUP:
		/* Cleanup any allocated resources */
		WT_INIT_LSN(ckpt_lsn);
		txn->ckpt_nsnapshot = 0;
		__wt_scr_free(session, &txn->ckpt_snapshot);
		txn->full_ckpt = false;
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

err:	__wt_logrec_free(session, &logrec);
	return (ret);
}
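The WT_TXN_LOG_CKPT_PREPARE case takes and immediately releases the visibility write lock purely for its ordering effect: once the write lock has been acquired, every committer that took the lock in read mode beforehand has finished publishing. A minimal standalone illustration of that acquire/release barrier, using a POSIX rwlock with hypothetical function names:

#include <pthread.h>

static pthread_rwlock_t visibility_lock = PTHREAD_RWLOCK_INITIALIZER;

/*
 * commit_publish_sketch --
 *	Committers hold the lock in read mode while publishing, as
 *	__wt_txn_commit does with visibility_rwlock.
 */
static void
commit_publish_sketch(void (*publish)(void))
{
	pthread_rwlock_rdlock(&visibility_lock);
	publish();			/* make the transaction visible */
	pthread_rwlock_unlock(&visibility_lock);
}

/*
 * checkpoint_barrier_sketch --
 *	Take and immediately release the write lock: on return, every
 *	committer that entered commit_publish_sketch() earlier has finished.
 */
static void
checkpoint_barrier_sketch(void)
{
	pthread_rwlock_wrlock(&visibility_lock);
	pthread_rwlock_unlock(&visibility_lock);
}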
Example #15
/*
 * __conn_dhandle_get --
 *	Allocate a new data handle, lock it exclusively, and return it linked
 *	into the connection's list.
 */
static int
__conn_dhandle_get(WT_SESSION_IMPL *session,
    const char *name, const char *ckpt, uint32_t flags)
{
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	uint32_t bucket;

	conn = S2C(session);

	/*
	 * We have the handle lock, check whether we can find the handle we
	 * are looking for.  If we do, and we can lock it in the state we
	 * want, this session will take ownership and we are done.
	 */
	ret = __wt_conn_dhandle_find(session, name, ckpt, flags);
	if (ret == 0) {
		dhandle = session->dhandle;
		WT_RET(__conn_dhandle_open_lock(session, dhandle, flags));
		return (0);
	}
	WT_RET_NOTFOUND_OK(ret);

	/*
	 * If no handle was found, allocate the data handle and a btree handle,
	 * then initialize the data handle.  Exclusively lock the data handle
	 * before inserting it in the list.
	 */
	WT_RET(__wt_calloc_one(session, &dhandle));

	WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle"));

	dhandle->name_hash = __wt_hash_city64(name, strlen(name));
	WT_ERR(__wt_strdup(session, name, &dhandle->name));
	if (ckpt != NULL)
		WT_ERR(__wt_strdup(session, ckpt, &dhandle->checkpoint));

	WT_ERR(__wt_calloc_one(session, &btree));
	dhandle->handle = btree;
	btree->dhandle = dhandle;

	WT_ERR(__wt_spin_init(
	    session, &dhandle->close_lock, "data handle close"));

	F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
	WT_ERR(__wt_writelock(session, dhandle->rwlock));

	/*
	 * Prepend the handle to the connection list, assuming we're likely to
	 * need new files again soon, until they are cached by all sessions.
	 * Find the right hash bucket to insert into as well.
	 */
	WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED));
	bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE;
	WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket);

	session->dhandle = dhandle;
	return (0);

err:	WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
	__wt_free(session, dhandle->name);
	__wt_free(session, dhandle->checkpoint);
	__wt_free(session, dhandle->handle);		/* btree free */
	__wt_spin_destroy(session, &dhandle->close_lock);
	__wt_overwrite_and_free(session, dhandle);

	return (ret);
}
Example #16
File: txn.c Project: bsamek/wiredtiger
/*
 * __wt_txn_commit --
 *	Commit the current transaction.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_OP *op;
	u_int i;
	bool locked, readonly;
#ifdef HAVE_TIMESTAMPS
	wt_timestamp_t prev_commit_timestamp, ts;
	bool update_timestamp;
#endif

	txn = &session->txn;
	conn = S2C(session);
	txn_global = &conn->txn_global;
	locked = false;

	WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
	WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) ||
	    txn->mod_count == 0);

	readonly = txn->mod_count == 0;
	/*
	 * Look for a commit timestamp.
	 */
	WT_ERR(
	    __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval));
	if (cval.len != 0) {
#ifdef HAVE_TIMESTAMPS
		WT_ERR(__wt_txn_parse_timestamp(session, "commit", &ts, &cval));
		WT_ERR(__wt_timestamp_validate(session,
		    "commit", &ts, &cval, true, true, true));
		__wt_timestamp_set(&txn->commit_timestamp, &ts);
		__wt_txn_set_commit_timestamp(session);
#else
		WT_ERR_MSG(session, EINVAL, "commit_timestamp requires a "
		    "version of WiredTiger built with timestamp support");
#endif
	}

#ifdef HAVE_TIMESTAMPS
	/*
	 * Debugging checks on timestamps, if user requested them.
	 */
	if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) &&
	    !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
	    txn->mod_count != 0)
		WT_ERR_MSG(session, EINVAL, "commit_timestamp required and "
		    "none set on this transaction");
	if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) &&
	    F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
	    txn->mod_count != 0)
		WT_ERR_MSG(session, EINVAL, "no commit_timestamp required and "
		    "timestamp set on this transaction");
#endif
	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval));

	/*
	 * If the user chose the default setting, check whether sync is enabled
	 * for this transaction (either inherited or via begin_transaction).
	 * If sync is disabled, clear the field to avoid the log write being
	 * flushed.
	 *
	 * Otherwise check for specific settings.  We don't need to check for
	 * "on" because that is the default inherited from the connection.  If
	 * the user set anything in begin_transaction, we only override with an
	 * explicit setting.
	 */
	if (cval.len == 0) {
		if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) &&
		    !F_ISSET(txn, WT_TXN_SYNC_SET))
			txn->txn_logsync = 0;
	} else {
		/*
		 * If the caller already set sync on begin_transaction then
		 * they should not be using sync on commit_transaction.
		 * Flag that as an error.
		 */
		if (F_ISSET(txn, WT_TXN_SYNC_SET))
			WT_ERR_MSG(session, EINVAL,
			    "Sync already set during begin_transaction");
		if (WT_STRING_MATCH("background", cval.str, cval.len))
			txn->txn_logsync = WT_LOG_BACKGROUND;
		else if (WT_STRING_MATCH("off", cval.str, cval.len))
			txn->txn_logsync = 0;
		/*
		 * We don't need to check for "on" here because that is the
		 * default to inherit from the connection setting.
		 */
	}

	/* Commit notification. */
	if (txn->notify != NULL)
		WT_ERR(txn->notify->notify(txn->notify,
		    (WT_SESSION *)session, txn->id, 1));

	/*
	 * We are about to release the snapshot: copy values into any
	 * positioned cursors so they don't point to updates that could be
	 * freed once we don't have a snapshot.
	 */
	if (session->ncursors > 0) {
		WT_DIAGNOSTIC_YIELD;
		WT_ERR(__wt_session_copy_values(session));
	}

	/* If we are logging, write a commit log record. */
	if (txn->logrec != NULL &&
	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
	    !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
		/*
		 * We are about to block on I/O writing the log.
		 * Release our snapshot in case it is keeping data pinned.
		 * This is particularly important for checkpoints.
		 */
		__wt_txn_release_snapshot(session);
		/*
		 * We hold the visibility lock for reading from the time
		 * we write our log record until the time we release our
		 * transaction so that the LSN any checkpoint gets will
		 * always reflect visible data.
		 */
		__wt_readlock(session, &txn_global->visibility_rwlock);
		locked = true;
		WT_ERR(__wt_txn_log_commit(session, cfg));
	}

	/* Note: we're going to commit: nothing can fail after this point. */

	/* Process and free updates. */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
		switch (op->type) {
		case WT_TXN_OP_BASIC:
		case WT_TXN_OP_BASIC_TS:
		case WT_TXN_OP_INMEM:
			/*
			 * Switch reserved operations to abort to
			 * simplify obsolete update list truncation.
			 */
			if (op->u.upd->type == WT_UPDATE_RESERVED) {
				op->u.upd->txnid = WT_TXN_ABORTED;
				break;
			}

			/*
			 * Writes to the lookaside file can be evicted as soon
			 * as they commit.
			 */
			if (conn->cache->las_fileid != 0 &&
			    op->fileid == conn->cache->las_fileid) {
				op->u.upd->txnid = WT_TXN_NONE;
				break;
			}

#ifdef HAVE_TIMESTAMPS
			if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
			    op->type != WT_TXN_OP_BASIC_TS) {
				WT_ASSERT(session,
				    op->fileid != WT_METAFILE_ID);
				__wt_timestamp_set(&op->u.upd->timestamp,
				    &txn->commit_timestamp);
			}
#endif
			break;

		case WT_TXN_OP_REF:
#ifdef HAVE_TIMESTAMPS
			if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
				__wt_timestamp_set(
				    &op->u.ref->page_del->timestamp,
				    &txn->commit_timestamp);
#endif
			break;

		case WT_TXN_OP_TRUNCATE_COL:
		case WT_TXN_OP_TRUNCATE_ROW:
			/* Other operations don't need timestamps. */
			break;
		}

		__wt_txn_op_free(session, op);
	}
	txn->mod_count = 0;

#ifdef HAVE_TIMESTAMPS
	/*
	 * Track the largest commit timestamp we have seen.
	 *
	 * We don't actually clear the local commit timestamp, just the flag.
	 * That said, we can't update the global commit timestamp until this
	 * transaction is visible, which happens when we release it.
	 */
	update_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
#endif

	__wt_txn_release(session);
	if (locked)
		__wt_readunlock(session, &txn_global->visibility_rwlock);

#ifdef HAVE_TIMESTAMPS
	/* First check if we've already committed something in the future. */
	if (update_timestamp) {
		WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
		    __wt_timestamp_set(
			&prev_commit_timestamp, &txn_global->commit_timestamp));
		update_timestamp = __wt_timestamp_cmp(
		    &txn->commit_timestamp, &prev_commit_timestamp) > 0;
	}

	/*
	 * If it looks like we need to move the global commit timestamp,
	 * write lock and re-check.
	 */
	if (update_timestamp) {
#if WT_TIMESTAMP_SIZE == 8
		while (__wt_timestamp_cmp(
		    &txn->commit_timestamp, &prev_commit_timestamp) > 0) {
			if (__wt_atomic_cas64(
			    &txn_global->commit_timestamp.val,
			    prev_commit_timestamp.val,
			    txn->commit_timestamp.val)) {
				txn_global->has_commit_timestamp = true;
				break;
			}
		    __wt_timestamp_set(
			&prev_commit_timestamp, &txn_global->commit_timestamp);
		}
#else
		__wt_writelock(session, &txn_global->rwlock);
		if (__wt_timestamp_cmp(&txn->commit_timestamp,
		    &txn_global->commit_timestamp) > 0) {
			__wt_timestamp_set(&txn_global->commit_timestamp,
			    &txn->commit_timestamp);
			txn_global->has_commit_timestamp = true;
		}
		__wt_writeunlock(session, &txn_global->rwlock);
#endif
	}
#endif

	/*
	 * We're between transactions, if we need to block for eviction, it's
	 * a good time to do so.  Note that we must ignore any error return
	 * because the user's data is committed.
	 */
	if (!readonly)
		(void)__wt_cache_eviction_check(session, false, false, NULL);
	return (0);

err:	/*
	 * If anything went wrong, roll back.
	 *
	 * !!!
	 * Nothing can fail after this point.
	 */
	if (locked)
		__wt_readunlock(session, &txn_global->visibility_rwlock);
	WT_TRET(__wt_txn_rollback(session, cfg));
	return (ret);
}
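When timestamps fit in 64 bits, Example #16 advances the global commit timestamp with a compare-and-swap retry loop rather than taking the write lock. The same "advance a shared maximum" idiom, sketched standalone with C11 atomics standing in for __wt_atomic_cas64 (names are hypothetical):

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t global_commit_ts;

/*
 * advance_commit_ts_sketch --
 *	Advance a shared maximum with a compare-and-swap retry loop, the same
 *	shape as the 8-byte-timestamp branch above.
 */
static void
advance_commit_ts_sketch(uint64_t ts)
{
	uint64_t prev = atomic_load(&global_commit_ts);

	/* Stop as soon as an equal or newer value has been published. */
	while (ts > prev)
		if (atomic_compare_exchange_weak(&global_commit_ts, &prev, ts))
			break;
}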
Example #17
File: cur_backup.c Project: DINKIN/mongo
/*
 * __backup_start --
 *	Start a backup.
 */
static int
__backup_start(
    WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[])
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FSTREAM *srcfs;
	const char *dest;
	bool exist, log_only, target_list;

	conn = S2C(session);
	srcfs = NULL;
	dest = NULL;

	cb->next = 0;
	cb->list = NULL;
	cb->list_next = 0;

	WT_RET(__wt_inmem_unsupported_op(session, "backup cursor"));

	/*
	 * Single thread hot backups: we're holding the schema lock, so we
	 * know we'll serialize with other attempts to start a hot backup.
	 */
	if (conn->hot_backup)
		WT_RET_MSG(
		    session, EINVAL, "there is already a backup cursor open");

	/*
	 * The hot backup copy is done outside of WiredTiger, which means file
	 * blocks can't be freed and re-allocated until the backup completes.
	 * The checkpoint code checks the backup flag, and if a backup cursor
	 * is open checkpoints aren't discarded. We release the lock as soon
	 * as we've set the flag, we don't want to block checkpoints, we just
	 * want to make sure no checkpoints are deleted.  The checkpoint code
	 * holds the lock until it's finished the checkpoint, otherwise we
	 * could start a hot backup that would race with an already-started
	 * checkpoint.
	 *
	 * We are holding the checkpoint and schema locks so schema operations
	 * will not see the backup file list until it is complete and valid.
	 */
	__wt_writelock(session, &conn->hot_backup_lock);
	conn->hot_backup = true;
	conn->hot_backup_list = NULL;
	__wt_writeunlock(session, &conn->hot_backup_lock);

	/* We're the lock holder, we own cleanup. */
	F_SET(cb, WT_CURBACKUP_LOCKER);

	/*
	 * Create a temporary backup file.  This must be opened before
	 * generating the list of targets in backup_uri.  This file will
	 * later be renamed to the correct name depending on whether or not
	 * we're doing an incremental backup.  We need a temp file so that if
	 * we fail or crash while filling it, the existence of a partial file
	 * doesn't confuse restarting in the source database.
	 */
	WT_ERR(__wt_fopen(session, WT_BACKUP_TMP,
	    WT_FS_OPEN_CREATE, WT_STREAM_WRITE, &cb->bfs));
	/*
	 * If a list of targets was specified, work our way through them.
	 * Else, generate a list of all database objects.
	 *
	 * Include log files if doing a full backup, and copy them before
	 * copying data files to avoid rolling the metadata forward across
	 * a checkpoint that completes during the backup.
	 */
	target_list = false;
	WT_ERR(__backup_uri(session, cfg, &target_list, &log_only));

	if (!target_list) {
		WT_ERR(__backup_log_append(session, cb, true));
		WT_ERR(__backup_all(session));
	}

	/* Add the hot backup and standard WiredTiger files to the list. */
	if (log_only) {
		/*
		 * We also open an incremental backup source file so that we
		 * can detect a crash with an incremental backup existing in
		 * the source directory versus an improper destination.
		 */
		dest = WT_INCREMENTAL_BACKUP;
		WT_ERR(__wt_fopen(session, WT_INCREMENTAL_SRC,
		    WT_FS_OPEN_CREATE, WT_STREAM_WRITE, &srcfs));
		WT_ERR(__backup_list_append(
		    session, cb, WT_INCREMENTAL_BACKUP));
	} else {
		dest = WT_METADATA_BACKUP;
		WT_ERR(__backup_list_append(session, cb, WT_METADATA_BACKUP));
		WT_ERR(__wt_fs_exist(session, WT_BASECONFIG, &exist));
		if (exist)
			WT_ERR(__backup_list_append(
			    session, cb, WT_BASECONFIG));
		WT_ERR(__wt_fs_exist(session, WT_USERCONFIG, &exist));
		if (exist)
			WT_ERR(__backup_list_append(
			    session, cb, WT_USERCONFIG));
		WT_ERR(__backup_list_append(session, cb, WT_WIREDTIGER));
	}

err:	/* Close the hot backup file. */
	WT_TRET(__wt_fclose(session, &cb->bfs));
	if (srcfs != NULL)
		WT_TRET(__wt_fclose(session, &srcfs));
	if (ret == 0) {
		WT_ASSERT(session, dest != NULL);
		WT_TRET(__wt_fs_rename(session, WT_BACKUP_TMP, dest, false));
		__wt_writelock(session, &conn->hot_backup_lock);
		conn->hot_backup_list = cb->list;
		__wt_writeunlock(session, &conn->hot_backup_lock);
	}

	return (ret);
}
Example #18
/*
 * __backup_start --
 *	Start a backup.
 */
static int
__backup_start(
    WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[])
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	bool exist, log_only, target_list;

	conn = S2C(session);

	cb->next = 0;
	cb->list = NULL;
	cb->list_next = 0;

	/*
	 * Single thread hot backups: we're holding the schema lock, so we
	 * know we'll serialize with other attempts to start a hot backup.
	 */
	if (conn->hot_backup)
		WT_RET_MSG(
		    session, EINVAL, "there is already a backup cursor open");

	/*
	 * The hot backup copy is done outside of WiredTiger, which means file
	 * blocks can't be freed and re-allocated until the backup completes.
	 * The checkpoint code checks the backup flag, and if a backup cursor
	 * is open checkpoints aren't discarded. We release the lock as soon
	 * as we've set the flag, we don't want to block checkpoints, we just
	 * want to make sure no checkpoints are deleted.  The checkpoint code
	 * holds the lock until it's finished the checkpoint, otherwise we
	 * could start a hot backup that would race with an already-started
	 * checkpoint.
	 */
	WT_RET(__wt_writelock(session, conn->hot_backup_lock));
	conn->hot_backup = true;
	WT_ERR(__wt_writeunlock(session, conn->hot_backup_lock));

	/* Create the hot backup file. */
	WT_ERR(__backup_file_create(session, cb, false));

	/* Add log files if logging is enabled. */

	/*
	 * If a list of targets was specified, work our way through them.
	 * Else, generate a list of all database objects.
	 *
	 * Include log files if doing a full backup, and copy them before
	 * copying data files to avoid rolling the metadata forward across
	 * a checkpoint that completes during the backup.
	 */
	target_list = false;
	WT_ERR(__backup_uri(session, cfg, &target_list, &log_only));

	if (!target_list) {
		WT_ERR(__backup_log_append(session, cb, true));
		WT_ERR(__backup_all(session));
	}

	/* Add the hot backup and standard WiredTiger files to the list. */
	if (log_only) {
		/*
		 * Close any hot backup file.
		 * We're about to open the incremental backup file.
		 */
		WT_TRET(__wt_fclose(&cb->bfp, WT_FHANDLE_WRITE));
		WT_ERR(__backup_file_create(session, cb, log_only));
		WT_ERR(__backup_list_append(
		    session, cb, WT_INCREMENTAL_BACKUP));
	} else {
		WT_ERR(__backup_list_append(session, cb, WT_METADATA_BACKUP));
		WT_ERR(__wt_exist(session, WT_BASECONFIG, &exist));
		if (exist)
			WT_ERR(__backup_list_append(
			    session, cb, WT_BASECONFIG));
		WT_ERR(__wt_exist(session, WT_USERCONFIG, &exist));
		if (exist)
			WT_ERR(__backup_list_append(
			    session, cb, WT_USERCONFIG));
		WT_ERR(__backup_list_append(session, cb, WT_WIREDTIGER));
	}

err:	/* Close the hot backup file. */
	WT_TRET(__wt_fclose(&cb->bfp, WT_FHANDLE_WRITE));
	if (ret != 0) {
		WT_TRET(__backup_cleanup_handles(session, cb));
		WT_TRET(__backup_stop(session));
	}

	return (ret);
}
Example #19
File: bt_compact.c Project: ksuarz/mongo
/*
 * __compact_rewrite --
 *	Return if a page needs to be re-written.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_MULTI *multi;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	size_t addr_size;
	uint32_t i;
	const uint8_t *addr;

	*skipp = true;					/* Default skip. */

	bm = S2BT(session)->bm;
	page = ref->page;
	mod = page->modify;

	/*
	 * Ignore the root: it may not have a replacement address, and besides,
	 * if anything else gets written, so will it.
	 */
	if (__wt_ref_is_root(ref))
		return (0);

	/* Ignore currently dirty pages, they will be written regardless. */
	if (__wt_page_is_modified(page))
		return (0);

	/*
	 * If the page is clean, test the original addresses.
	 * If the page is a replacement, test the replacement addresses.
	 * Ignore empty pages, they get merged into the parent.
	 */
	if (mod == NULL || mod->rec_result == 0) {
		__wt_ref_info(ref, &addr, &addr_size, NULL);
		if (addr == NULL)
			return (0);
		return (
		    bm->compact_page_skip(bm, session, addr, addr_size, skipp));
	}

	/*
	 * The page's modification information can change underfoot if the page
	 * is being reconciled, serialize with reconciliation.
	 */
	if (mod->rec_result == WT_PM_REC_REPLACE ||
	    mod->rec_result == WT_PM_REC_MULTIBLOCK)
		__wt_writelock(session, &page->page_lock);

	if (mod->rec_result == WT_PM_REC_REPLACE)
		ret = bm->compact_page_skip(bm, session,
		    mod->mod_replace.addr, mod->mod_replace.size, skipp);

	if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
		for (multi = mod->mod_multi,
		    i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
			if (multi->disk_image != NULL)
				continue;
			if ((ret = bm->compact_page_skip(bm, session,
			    multi->addr.addr, multi->addr.size, skipp)) != 0)
				break;
			if (!*skipp)
				break;
		}

	if (mod->rec_result == WT_PM_REC_REPLACE ||
	    mod->rec_result == WT_PM_REC_MULTIBLOCK)
		__wt_writeunlock(session, &page->page_lock);

	return (ret);
}