Example #1
0
/*
 * __txn_abort_newer_update --
 *	Abort updates in an update change with timestamps newer than the
 *	rollback timestamp.
 */
static void
__txn_abort_newer_update(WT_SESSION_IMPL *session,
    WT_UPDATE *upd, wt_timestamp_t *rollback_timestamp)
{
	WT_UPDATE *next_upd;
	bool aborted_one;

	aborted_one = false;
	for (next_upd = upd; next_upd != NULL; next_upd = next_upd->next) {
		/*
		 * Updates with no timestamp will have a timestamp of zero
		 * which will fail the following check and cause them to never
		 * be aborted.
		 */
		if (__wt_timestamp_cmp(
		    rollback_timestamp, &next_upd->timestamp) < 0) {
			next_upd->txnid = WT_TXN_ABORTED;
			__wt_timestamp_set_zero(&next_upd->timestamp);

			/*
			* If any updates are aborted, all newer updates
			* better be aborted as well.
			*/
			if (!aborted_one)
				WT_ASSERT(session,
				    !aborted_one || upd == next_upd);
			aborted_one = true;
		}
	}
}
Example #2
0
/*
 * __txn_rollback_to_stable_btree_walk --
 *	Called for each open handle - choose to either skip or wipe the commits
 */
static int
__txn_rollback_to_stable_btree_walk(
    WT_SESSION_IMPL *session, wt_timestamp_t *rollback_timestamp)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_REF *ref;

	/* Walk the tree, marking commits aborted where appropriate. */
	ref = NULL;
	while ((ret = __wt_tree_walk_custom_skip(session, &ref,
	    __txn_rollback_to_stable_custom_skip,
	    NULL, WT_READ_NO_EVICT)) == 0 && ref != NULL) {
		page = ref->page;

		/* Review deleted page saved to the ref */
		if (ref->page_del != NULL && __wt_timestamp_cmp(
		    rollback_timestamp, &ref->page_del->timestamp) < 0)
			__wt_delete_page_rollback(session, ref);

		if (!__wt_page_is_modified(page))
			continue;

		WT_RET(__txn_abort_newer_updates(
		    session, ref, rollback_timestamp));
	}
	return (ret);
}
Example #3
0
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *prev, *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
	uint64_t oldest_id, saved_pinned_id, time_start, time_stop;
	uint32_t flags;
	bool timer, tried_eviction;

	conn = S2C(session);
	btree = S2BT(session);
	prev = walk = NULL;
	txn = &session->txn;
	tried_eviction = false;
	time_start = time_stop = 0;

	/* Only visit pages in cache and don't bump page read generations. */
	flags = WT_READ_CACHE | WT_READ_NO_GEN;

	/*
	 * Skip all deleted pages.  For a page to be marked deleted, it must
	 * have been evicted from cache and marked clean.  Checkpoint should
	 * never instantiate deleted pages: if a truncate is not visible to the
	 * checkpoint, the on-disk version is correct.  If the truncate is
	 * visible, we skip over the child page when writing its parent.  We
	 * check whether a truncate is visible in the checkpoint as part of
	 * reconciling internal pages (specifically in __rec_child_modify).
	 */
	LF_SET(WT_READ_DELETED_SKIP);

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
	timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT);
	if (timer)
		time_start = __wt_clock(session);

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		/*
		 * Save the oldest transaction ID we need to keep around.
		 * Otherwise, in a busy system, we could be updating pages so
		 * fast that write leaves never catches up.  We deliberately
		 * have no transaction running at this point that would keep
		 * the oldest ID from moving forwards as we walk the tree.
		 */
		oldest_id = __wt_txn_oldest_id(session);

		LF_SET(WT_READ_NO_WAIT | WT_READ_SKIP_INTL);
		for (;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write hot pages (defined as pages that have
			 * been updated since the write phase leaves started):
			 * checkpoint will have to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session,
				    walk, NULL, WT_REC_CHECKPOINT, NULL));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * If we are flushing a file at read-committed isolation, which
		 * is of particular interest for flushing the metadata to make
		 * a schema-changing operation durable, get a transactional
		 * snapshot now.
		 *
		 * All changes committed up to this point should be included.
		 * We don't update the snapshot in between pages because the
		 * metadata shouldn't have many pages.  Instead, read-committed
		 * isolation ensures that all metadata updates completed before
		 * the checkpoint are included.
		 */
		if (txn->isolation == WT_ISO_READ_COMMITTED)
			__wt_txn_get_snapshot(session);

		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.  We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * In the final checkpoint pass, child pages cannot be evicted
		 * from underneath internal pages nor can underlying blocks be
		 * freed until the checkpoint's block lists are stable. Also,
		 * we cannot split child pages into parents unless we know the
		 * final pass will write a consistent view of that namespace.
		 * Set the checkpointing flag to block such actions and wait for
		 * any problematic eviction or page splits to complete.
		 */
		WT_ASSERT(session, btree->syncing == WT_BTREE_SYNC_OFF &&
		    btree->sync_session == NULL);

		btree->sync_session = session;
		btree->syncing = WT_BTREE_SYNC_WAIT;
		(void)__wt_gen_next_drain(session, WT_GEN_EVICT);
		btree->syncing = WT_BTREE_SYNC_RUNNING;

		/* Write all dirty in-cache pages. */
		LF_SET(WT_READ_NO_EVICT);

		/* Read pages with lookaside entries and evict them asap. */
		LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED);

		for (;;) {
			WT_ERR(__sync_dup_walk(session, walk, flags, &prev));
			WT_ERR(__wt_tree_walk(session, &walk, flags));

			if (walk == NULL)
				break;

			/*
			 * Skip clean pages, but need to make sure maximum
			 * transaction ID is always updated.
			 */
			if (!__wt_page_is_modified(walk->page)) {
				if (((mod = walk->page->modify) != NULL) &&
				    mod->rec_max_txn > btree->rec_max_txn)
					btree->rec_max_txn = mod->rec_max_txn;
#ifdef HAVE_TIMESTAMPS
				if (mod != NULL && __wt_timestamp_cmp(
				    &btree->rec_max_timestamp,
				    &mod->rec_max_timestamp) < 0)
					__wt_timestamp_set(
					    &btree->rec_max_timestamp,
					    &mod->rec_max_timestamp);
#endif
				continue;
			}

			/*
			 * Take a local reference to the page modify structure
			 * now that we know the page is dirty. It needs to be
			 * done in this order otherwise the page modify
			 * structure could have been created between taking the
			 * reference and checking modified.
			 */
			page = walk->page;

			/*
			 * Write dirty pages, if we can't skip them. If we skip
			 * a page, mark the tree dirty. The checkpoint marked it
			 * clean and we can't skip future checkpoints until this
			 * page is written.
			 */
			if (__sync_checkpoint_can_skip(session, page)) {
				__wt_tree_modify_set(session);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}

			/*
			 * If the page was pulled into cache by our read, try
			 * to evict it now.
			 *
			 * For eviction to have a chance, we first need to move
			 * the walk point to the next page checkpoint will
			 * visit.  We want to avoid this code being too special
			 * purpose, so try to reuse the ordinary eviction path.
			 *
			 * Regardless of whether eviction succeeds or fails,
			 * the walk continues from the previous location.  We
			 * remember whether we tried eviction, and don't try
			 * again.  Even if eviction fails (the page may stay in
			 * cache clean but with history that cannot be
			 * discarded), that is not wasted effort because
			 * checkpoint doesn't need to write the page again.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    page->read_gen == WT_READGEN_WONT_NEED &&
			    !tried_eviction) {
				WT_ERR_BUSY_OK(
				    __wt_page_release_evict(session, walk));
				walk = prev;
				prev = NULL;
				tried_eviction = true;
				continue;
			}
			tried_eviction = false;

			WT_ERR(__wt_reconcile(
			    session, walk, NULL, WT_REC_CHECKPOINT, NULL));

			/*
			 * Update checkpoint IO tracking data if configured
			 * to log verbose progress messages.
			 */
			if (conn->ckpt_timer_start.tv_sec > 0) {
				conn->ckpt_write_bytes +=
				    page->memory_footprint;
				++conn->ckpt_write_pages;

				/* Periodically log checkpoint progress. */
				if (conn->ckpt_write_pages % 5000 == 0)
					__wt_checkpoint_progress(
					    session, false);
			}
		}
		break;
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
		WT_ERR(__wt_illegal_value(session, syncop));
		break;
	}

	if (timer) {
		time_stop = __wt_clock(session);
		__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote: %" PRIu64
		    " leaf pages (%" PRIu64 "B), %" PRIu64
		    " internal pages (%" PRIu64 "B), and took %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_pages, leaf_bytes, internal_pages, internal_bytes,
		    WT_CLOCKDIFF_MS(time_stop, time_start));
	}

err:	/* On error, clear any left-over tree walk. */
	WT_TRET(__wt_page_release(session, walk, flags));
	WT_TRET(__wt_page_release(session, prev, flags));

	/*
	 * If we got a snapshot in order to write pages, and there was no
	 * snapshot active when we started, release it.
	 */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    saved_pinned_id == WT_TXN_NONE)
		__wt_txn_release_snapshot(session);

	/* Clear the checkpoint flag. */
	btree->syncing = WT_BTREE_SYNC_OFF;
	btree->sync_session = NULL;

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 &&
	    syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_RET(btree->bm->sync(btree->bm, session, false));

	return (ret);
}
Example #4
0
/*
 * __txn_rollback_to_stable_lookaside_fixup --
 *	Remove any updates that need to be rolled back from the lookaside file.
 */
static int
__txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_DECL_TIMESTAMP(rollback_timestamp)
	WT_ITEM las_addr, las_key, las_timestamp;
	WT_TXN_GLOBAL *txn_global;
	uint64_t las_counter, las_txnid, remove_cnt;
	uint32_t las_id, session_flags;

	conn = S2C(session);
	cursor = NULL;
	remove_cnt = 0;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */
	WT_CLEAR(las_timestamp);

	/*
	 * Copy the stable timestamp, otherwise we'd need to lock it each time
	 * it's accessed. Even though the stable timestamp isn't supposed to be
	 * updated while rolling back, accessing it without a lock would
	 * violate protocol.
	 */
	txn_global = &S2C(session)->txn_global;
	__wt_readlock(session, &txn_global->rwlock);
	__wt_timestamp_set(&rollback_timestamp, &txn_global->stable_timestamp);
	__wt_readunlock(session, &txn_global->rwlock);

	__wt_las_cursor(session, &cursor, &session_flags);

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	/* Walk the file. */
	for (; (ret = cursor->next(cursor)) == 0; ) {
		WT_ERR(cursor->get_key(cursor, &las_id, &las_addr, &las_counter,
		    &las_txnid, &las_timestamp, &las_key));

		/* Check the file ID so we can skip durable tables */
		if (__bit_test(conn->stable_rollback_bitstring, las_id))
			continue;

		/*
		 * Entries with no timestamp will have a timestamp of zero,
		 * which will fail the following check and cause them to never
		 * be removed.
		 */
		if (__wt_timestamp_cmp(
		    &rollback_timestamp, las_timestamp.data) < 0) {
			WT_ERR(cursor->remove(cursor));
			++remove_cnt;
		}
	}
	WT_ERR_NOTFOUND_OK(ret);
err:	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
	/*
	 * If there were races to remove records, we can over-count. Underflow
	 * isn't fatal, but check anyway so we don't skew low over time.
	 */
	if (remove_cnt > conn->las_record_cnt)
		conn->las_record_cnt = 0;
	else if (remove_cnt > 0)
		(void)__wt_atomic_sub64(&conn->las_record_cnt, remove_cnt);

	F_CLR(session, WT_SESSION_NO_CACHE);

	return (ret);
}
Example #5
0
/*
 * __wt_txn_commit --
 *	Commit the current transaction.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_OP *op;
	u_int i;
	bool locked, readonly;
#ifdef HAVE_TIMESTAMPS
	wt_timestamp_t prev_commit_timestamp, ts;
	bool update_timestamp;
#endif

	txn = &session->txn;
	conn = S2C(session);
	txn_global = &conn->txn_global;
	locked = false;

	WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
	WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) ||
	    txn->mod_count == 0);

	readonly = txn->mod_count == 0;
	/*
	 * Look for a commit timestamp.
	 */
	WT_ERR(
	    __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval));
	if (cval.len != 0) {
#ifdef HAVE_TIMESTAMPS
		WT_ERR(__wt_txn_parse_timestamp(session, "commit", &ts, &cval));
		WT_ERR(__wt_timestamp_validate(session,
		    "commit", &ts, &cval, true, true, true));
		__wt_timestamp_set(&txn->commit_timestamp, &ts);
		__wt_txn_set_commit_timestamp(session);
#else
		WT_ERR_MSG(session, EINVAL, "commit_timestamp requires a "
		    "version of WiredTiger built with timestamp support");
#endif
	}

#ifdef HAVE_TIMESTAMPS
	/*
	 * Debugging checks on timestamps, if user requested them.
	 */
	if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) &&
	    !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
	    txn->mod_count != 0)
		WT_ERR_MSG(session, EINVAL, "commit_timestamp required and "
		    "none set on this transaction");
	if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) &&
	    F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
	    txn->mod_count != 0)
		WT_ERR_MSG(session, EINVAL, "no commit_timestamp required and "
		    "timestamp set on this transaction");
#endif
	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval));

	/*
	 * If the user chose the default setting, check whether sync is enabled
	 * for this transaction (either inherited or via begin_transaction).
	 * If sync is disabled, clear the field to avoid the log write being
	 * flushed.
	 *
	 * Otherwise check for specific settings.  We don't need to check for
	 * "on" because that is the default inherited from the connection.  If
	 * the user set anything in begin_transaction, we only override with an
	 * explicit setting.
	 */
	if (cval.len == 0) {
		if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) &&
		    !F_ISSET(txn, WT_TXN_SYNC_SET))
			txn->txn_logsync = 0;
	} else {
		/*
		 * If the caller already set sync on begin_transaction then
		 * they should not be using sync on commit_transaction.
		 * Flag that as an error.
		 */
		if (F_ISSET(txn, WT_TXN_SYNC_SET))
			WT_ERR_MSG(session, EINVAL,
			    "Sync already set during begin_transaction");
		if (WT_STRING_MATCH("background", cval.str, cval.len))
			txn->txn_logsync = WT_LOG_BACKGROUND;
		else if (WT_STRING_MATCH("off", cval.str, cval.len))
			txn->txn_logsync = 0;
		/*
		 * We don't need to check for "on" here because that is the
		 * default to inherit from the connection setting.
		 */
	}

	/* Commit notification. */
	if (txn->notify != NULL)
		WT_ERR(txn->notify->notify(txn->notify,
		    (WT_SESSION *)session, txn->id, 1));

	/*
	 * We are about to release the snapshot: copy values into any
	 * positioned cursors so they don't point to updates that could be
	 * freed once we don't have a snapshot.
	 */
	if (session->ncursors > 0) {
		WT_DIAGNOSTIC_YIELD;
		WT_ERR(__wt_session_copy_values(session));
	}

	/* If we are logging, write a commit log record. */
	if (txn->logrec != NULL &&
	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
	    !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
		/*
		 * We are about to block on I/O writing the log.
		 * Release our snapshot in case it is keeping data pinned.
		 * This is particularly important for checkpoints.
		 */
		__wt_txn_release_snapshot(session);
		/*
		 * We hold the visibility lock for reading from the time
		 * we write our log record until the time we release our
		 * transaction so that the LSN any checkpoint gets will
		 * always reflect visible data.
		 */
		__wt_readlock(session, &txn_global->visibility_rwlock);
		locked = true;
		WT_ERR(__wt_txn_log_commit(session, cfg));
	}

	/* Note: we're going to commit: nothing can fail after this point. */

	/* Process and free updates. */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
		switch (op->type) {
		case WT_TXN_OP_BASIC:
		case WT_TXN_OP_BASIC_TS:
		case WT_TXN_OP_INMEM:
			/*
			 * Switch reserved operations to abort to
			 * simplify obsolete update list truncation.
			 */
			if (op->u.upd->type == WT_UPDATE_RESERVED) {
				op->u.upd->txnid = WT_TXN_ABORTED;
				break;
			}

			/*
			 * Writes to the lookaside file can be evicted as soon
			 * as they commit.
			 */
			if (conn->cache->las_fileid != 0 &&
			    op->fileid == conn->cache->las_fileid) {
				op->u.upd->txnid = WT_TXN_NONE;
				break;
			}

#ifdef HAVE_TIMESTAMPS
			if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
			    op->type != WT_TXN_OP_BASIC_TS) {
				WT_ASSERT(session,
				    op->fileid != WT_METAFILE_ID);
				__wt_timestamp_set(&op->u.upd->timestamp,
				    &txn->commit_timestamp);
			}
#endif
			break;

		case WT_TXN_OP_REF:
#ifdef HAVE_TIMESTAMPS
			if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
				__wt_timestamp_set(
				    &op->u.ref->page_del->timestamp,
				    &txn->commit_timestamp);
#endif
			break;

		case WT_TXN_OP_TRUNCATE_COL:
		case WT_TXN_OP_TRUNCATE_ROW:
			/* Other operations don't need timestamps. */
			break;
		}

		__wt_txn_op_free(session, op);
	}
	txn->mod_count = 0;

#ifdef HAVE_TIMESTAMPS
	/*
	 * Track the largest commit timestamp we have seen.
	 *
	 * We don't actually clear the local commit timestamp, just the flag.
	 * That said, we can't update the global commit timestamp until this
	 * transaction is visible, which happens when we release it.
	 */
	update_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
#endif

	__wt_txn_release(session);
	if (locked)
		__wt_readunlock(session, &txn_global->visibility_rwlock);

#ifdef HAVE_TIMESTAMPS
	/* First check if we've already committed something in the future. */
	if (update_timestamp) {
		WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
		    __wt_timestamp_set(
			&prev_commit_timestamp, &txn_global->commit_timestamp));
		update_timestamp = __wt_timestamp_cmp(
		    &txn->commit_timestamp, &prev_commit_timestamp) > 0;
	}

	/*
	 * If it looks like we need to move the global commit timestamp,
	 * write lock and re-check.
	 */
	if (update_timestamp) {
#if WT_TIMESTAMP_SIZE == 8
		while (__wt_timestamp_cmp(
		    &txn->commit_timestamp, &prev_commit_timestamp) > 0) {
			if (__wt_atomic_cas64(
			    &txn_global->commit_timestamp.val,
			    prev_commit_timestamp.val,
			    txn->commit_timestamp.val)) {
				txn_global->has_commit_timestamp = true;
				break;
			}
		    __wt_timestamp_set(
			&prev_commit_timestamp, &txn_global->commit_timestamp);
		}
#else
		__wt_writelock(session, &txn_global->rwlock);
		if (__wt_timestamp_cmp(&txn->commit_timestamp,
		    &txn_global->commit_timestamp) > 0) {
			__wt_timestamp_set(&txn_global->commit_timestamp,
			    &txn->commit_timestamp);
			txn_global->has_commit_timestamp = true;
		}
		__wt_writeunlock(session, &txn_global->rwlock);
#endif
	}
#endif

	/*
	 * We're between transactions, if we need to block for eviction, it's
	 * a good time to do so.  Note that we must ignore any error return
	 * because the user's data is committed.
	 */
	if (!readonly)
		(void)__wt_cache_eviction_check(session, false, false, NULL);
	return (0);

err:	/*
	 * If anything went wrong, roll back.
	 *
	 * !!!
	 * Nothing can fail after this point.
	 */
	if (locked)
		__wt_readunlock(session, &txn_global->visibility_rwlock);
	WT_TRET(__wt_txn_rollback(session, cfg));
	return (ret);
}
Example #6
0
/*
 * __wt_txn_config --
 *	Configure a transaction.
 */
int
__wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_TXN *txn;

	txn = &session->txn;

	WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
	if (cval.len != 0)
		txn->isolation =
		    WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
		    WT_ISO_SNAPSHOT :
		    WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
		    WT_ISO_READ_COMMITTED : WT_ISO_READ_UNCOMMITTED;

	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 *
	 * We want to distinguish between inheriting implicitly and explicitly.
	 */
	F_CLR(txn, WT_TXN_SYNC_SET);
	WT_RET(__wt_config_gets_def(
	    session, cfg, "sync", (int)UINT_MAX, &cval));
	if (cval.val == 0 || cval.val == 1)
		/*
		 * This is an explicit setting of sync.  Set the flag so
		 * that we know not to overwrite it in commit_transaction.
		 */
		F_SET(txn, WT_TXN_SYNC_SET);

	/*
	 * If sync is turned off explicitly, clear the transaction's sync field.
	 */
	if (cval.val == 0)
		txn->txn_logsync = 0;

	WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval));
	if (cval.len > 0)
		/*
		 * The layering here isn't ideal - the named snapshot get
		 * function does both validation and setup. Otherwise we'd
		 * need to walk the list of named snapshots twice during
		 * transaction open.
		 */
		WT_RET(__wt_txn_named_snapshot_get(session, &cval));

	WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval));
	if (cval.len > 0) {
#ifdef HAVE_TIMESTAMPS
		wt_timestamp_t ts;
		WT_TXN_GLOBAL *txn_global;
		char timestamp_buf[2 * WT_TIMESTAMP_SIZE + 1];
		bool round_to_oldest;

		txn_global = &S2C(session)->txn_global;
		WT_RET(__wt_txn_parse_timestamp(session, "read", &ts, &cval));

		/*
		 * Read the configuration here to reduce the span of the
		 * critical section.
		 */
		WT_RET(__wt_config_gets_def(session,
		    cfg, "round_to_oldest", 0, &cval));
		round_to_oldest = cval.val;
		/*
		 * This code is not using the timestamp validate function to
		 * avoid a race between checking and setting transaction
		 * timestamp.
		 */
		__wt_readlock(session, &txn_global->rwlock);
		if (__wt_timestamp_cmp(&ts, &txn_global->oldest_timestamp) < 0)
		{
			WT_RET(__wt_timestamp_to_hex_string(session,
			    timestamp_buf, &ts));
			/*
			 * If given read timestamp is earlier than oldest
			 * timestamp then round the read timestamp to
			 * oldest timestamp.
			 */
			if (round_to_oldest)
				__wt_timestamp_set(&txn->read_timestamp,
				    &txn_global->oldest_timestamp);
			else {
				__wt_readunlock(session, &txn_global->rwlock);
				WT_RET_MSG(session, EINVAL, "read timestamp "
				    "%s older than oldest timestamp",
				    timestamp_buf);
			}
		} else {
			__wt_timestamp_set(&txn->read_timestamp, &ts);
			/*
			 * Reset to avoid a verbose message as read
			 * timestamp is not rounded to oldest timestamp.
			 */
			round_to_oldest = false;
		}

		__wt_txn_set_read_timestamp(session);
		__wt_readunlock(session, &txn_global->rwlock);
		txn->isolation = WT_ISO_SNAPSHOT;
		if (round_to_oldest) {
			/*
			 * This message is generated here to reduce the span of
			 * critical section.
			 */
			__wt_verbose(session, WT_VERB_TIMESTAMP, "Read "
			    "timestamp %s : Rounded to oldest timestamp",
			    timestamp_buf);
		}
#else
		WT_RET_MSG(session, EINVAL, "read_timestamp requires a "
		    "version of WiredTiger built with timestamp support");
#endif
	}

	return (0);
}