Example 1
/*
 * __wt_txn_begin --
 *	Begin a transaction.
 */
int
__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *txn_state;

	conn = S2C(session);
	txn = &session->txn;
	txn_global = &conn->txn_global;
	txn_state = &txn_global->states[session->id];

	WT_ASSERT(session, txn_state->id == WT_TXN_NONE);

	WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
	if (cval.len == 0)
		txn->isolation = session->isolation;
	else
		txn->isolation =
		    WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
		    TXN_ISO_SNAPSHOT :
		    WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
		    TXN_ISO_READ_COMMITTED : TXN_ISO_READ_UNCOMMITTED;

	/*
	 * Allocate a transaction ID.
	 *
	 * We use an atomic compare and swap to ensure that we get a
	 * unique ID that is published before the global counter is
	 * updated.
	 *
	 * If two threads race to allocate an ID, only the latest ID
	 * will proceed.  The winning thread can be sure its snapshot
	 * contains all of the earlier active IDs.  Threads that race
	 * and get an earlier ID may not appear in the snapshot, but
	 * they will loop and allocate a new ID before proceeding to
	 * make any updates.
	 *
	 * This potentially wastes transaction IDs when threads race to
	 * begin transactions: that is the price we pay to keep this
	 * path latch free.
	 */
	do {
		txn_state->id = txn->id = txn_global->current;
	} while (!WT_ATOMIC_CAS(txn_global->current, txn->id, txn->id + 1));

	/*
	 * If we have used up the 64-bit transaction ID space, there is
	 * nothing more we can do.
	 */
	if (txn->id == WT_TXN_ABORTED)
		WT_RET_MSG(session, ENOMEM, "Out of transaction IDs");

	F_SET(txn, TXN_RUNNING);
	if (txn->isolation == TXN_ISO_SNAPSHOT)
		__wt_txn_refresh(session, WT_TXN_NONE, 1);
	return (0);
}
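
The allocation loop above is latch free: a unique ID is claimed with nothing more than an atomic compare-and-swap on the global counter. A minimal standalone sketch of the same pattern, using C11 atomics in place of WiredTiger's WT_ATOMIC_CAS macro (global_current and allocate_txn_id are illustrative names, not WiredTiger identifiers):

#include <stdatomic.h>
#include <stdint.h>

/* Hypothetical global transaction-ID counter, analogous to txn_global->current. */
static _Atomic uint64_t global_current = 1;

/*
 * allocate_txn_id --
 *	Claim the counter's current value and advance it.  If another thread
 *	wins the race, the compare-and-swap fails, id is refreshed with the
 *	value that thread published, and we retry, so every caller ends up
 *	with a unique ID.
 */
static uint64_t
allocate_txn_id(void)
{
	uint64_t id;

	id = atomic_load(&global_current);
	while (!atomic_compare_exchange_weak(&global_current, &id, id + 1))
		;
	return (id);
}

The weak form of compare-exchange may fail spuriously, which is harmless inside the retry loop; as in the function above, IDs skipped by losing racers are simply wasted, the price paid to keep the path latch free.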
Example 2
/*
 * __wt_txn_commit --
 *	Commit the current transaction.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_OP *op;
	u_int i;

	WT_UNUSED(cfg);

	txn = &session->txn;
	WT_ASSERT(session, !F_ISSET(txn, TXN_ERROR));

	if (!F_ISSET(txn, TXN_RUNNING))
		WT_RET_MSG(session, EINVAL, "No transaction is active");

	/* Commit notification. */
	if (txn->notify != NULL)
		WT_TRET(txn->notify->notify(txn->notify,
		    (WT_SESSION *)session, txn->id, 1));

	/* If we are logging, write a commit log record. */
	if (ret == 0 &&
	    txn->mod_count > 0 && S2C(session)->logging &&
	    !F_ISSET(session, WT_SESSION_LOGGING_DISABLED))
		ret = __wt_txn_log_commit(session, cfg);

	/*
	 * If anything went wrong, roll back.
	 *
	 * !!!
	 * Nothing can fail after this point.
	 */
	if (ret != 0) {
		WT_TRET(__wt_txn_rollback(session, cfg));
		return (ret);
	}

	/* Free memory associated with updates. */
	for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
		__wt_txn_op_free(session, op);

	/*
	 * Auto-commit transactions need a new transaction snapshot so that the
	 * committed changes are visible to subsequent reads.  However, cursor
	 * keys and values will point to the data that was just modified, so
	 * the snapshot cannot be so new that updates could be freed underneath
	 * the cursor.  Get the new snapshot before releasing the ID for the
	 * commit.
	 */
	if (session->ncursors > 0 && txn->isolation != TXN_ISO_READ_UNCOMMITTED)
		__wt_txn_refresh(session, txn->id + 1, 1);
	__wt_txn_release(session);
	return (0);
}
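
The auto-commit comment above relies on snapshot visibility: a committed ID only becomes visible to readers whose snapshot was taken after the commit. The following is a generic sketch of the usual snapshot-isolation visibility rule, assuming a snapshot described by a minimum ID, a maximum ID, and the set of IDs still running when it was taken; it is illustrative and not WiredTiger's __wt_txn_visible implementation:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/*
 * A hypothetical snapshot: IDs below snap_min are visible, IDs at or above
 * snap_max are not, and IDs inside the window are visible unless they were
 * still running (listed in concurrent[]) when the snapshot was taken.
 */
struct snapshot {
	uint64_t snap_min;
	uint64_t snap_max;
	const uint64_t *concurrent;
	size_t concurrent_count;
};

/*
 * txnid_visible --
 *	Return true if an update tagged with the given transaction ID is
 *	visible to the snapshot.
 */
static bool
txnid_visible(const struct snapshot *snap, uint64_t id)
{
	size_t i;

	if (id < snap->snap_min)
		return (true);
	if (id >= snap->snap_max)
		return (false);
	for (i = 0; i < snap->concurrent_count; i++)
		if (snap->concurrent[i] == id)
			return (false);
	return (true);
}

Under a rule of this shape, refreshing with a maximum just past the committing ID (txn->id + 1 in the commit path above) is what makes the auto-committed changes visible to the session's subsequent reads.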
Example 3
/*
 * __wt_txn_update_oldest --
 *	Sweep the running transactions to update the oldest ID required.
 */
void
__wt_txn_update_oldest(WT_SESSION_IMPL *session)
{
	/*
	 * !!!
	 * If a data-source is calling the WT_EXTENSION_API.transaction_oldest
	 * method (for the oldest transaction ID not yet visible to a running
	 * transaction), and then comparing that oldest ID against committed
	 * transactions to see if updates for a committed transaction are still
	 * visible to running transactions, the oldest transaction ID may be
	 * the same as the last committed transaction ID, if the transaction
	 * state wasn't refreshed after the last transaction committed.  Push
	 * past the last committed transaction.
	 */
	__wt_txn_refresh(session, WT_TXN_NONE, 0);
}
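
The refresh call recomputes the oldest ID still required by sweeping the per-session transaction states. A standalone sketch of that sweep over a hypothetical array of per-session states (compute_oldest_id, txn_state, and TXN_ID_NONE are illustrative names, not WiredTiger identifiers):

#include <stddef.h>
#include <stdint.h>

#define TXN_ID_NONE	0	/* hypothetical "no transaction" marker */

/* Hypothetical per-session transaction state. */
struct txn_state {
	uint64_t id;		/* running transaction ID, or TXN_ID_NONE */
	uint64_t snap_min;	/* oldest ID the session's snapshot still needs */
};

/*
 * compute_oldest_id --
 *	Sweep the per-session states and return the oldest transaction ID
 *	some running transaction might still need, starting from the current
 *	global ID.
 */
static uint64_t
compute_oldest_id(const struct txn_state *states, size_t n, uint64_t current)
{
	uint64_t oldest;
	size_t i;

	oldest = current;
	for (i = 0; i < n; i++) {
		if (states[i].id != TXN_ID_NONE && states[i].id < oldest)
			oldest = states[i].id;
		if (states[i].snap_min != TXN_ID_NONE &&
		    states[i].snap_min < oldest)
			oldest = states[i].snap_min;
	}
	return (oldest);
}

In the function above this sweep is folded into __wt_txn_refresh; the comment's caveat is that, when no state has been refreshed since the last commit, the result must be pushed past the last committed transaction.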
Example 4
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, int syncop)
{
	struct timespec end, start;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, leaf_bytes;
	uint64_t internal_pages, leaf_pages;
	uint32_t flags;
	int evict_reset;

	btree = S2BT(session);

	flags = WT_READ_CACHE | WT_READ_NO_GEN;
	walk = NULL;
	txn = &session->txn;

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
		WT_RET(__wt_epoch(session, &start));

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a
		 * high-level lock; serialize here so multiple threads don't
		 * walk the tree at the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write the hottest pages: checkpoint will have
			 * to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    __wt_txn_visible_all(
			    session, page->modify->update_txn)) {
				if (txn->isolation == TXN_ISO_READ_COMMITTED)
					__wt_txn_refresh(
					    session, WT_TXN_NONE, 1);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session, walk, NULL, 0));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint; the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a
		 * high-level lock; serialize here so multiple threads don't
		 * walk the tree at the same time.  We're holding the schema
		 * lock, but need the lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * When internal pages are being reconciled by checkpoint their
		 * child pages cannot disappear from underneath them or be split
		 * into them, nor can underlying blocks be freed until the block
		 * lists for the checkpoint are stable.  Set the checkpointing
		 * flag to block eviction of dirty pages until the checkpoint's
		 * internal page pass is complete, then wait for any existing
		 * eviction to complete.
		 */
		btree->checkpointing = 1;
		WT_FULL_BARRIER();

		WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
		if (evict_reset)
			__wt_evict_file_exclusive_off(session);

		/* Write all dirty in-cache pages. */
		flags |= WT_READ_NO_EVICT;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages, unless we can be sure they only
			 * became dirty after the checkpoint started.
			 *
			 * We can skip dirty pages if:
			 * (1) they are leaf pages;
			 * (2) there is a snapshot transaction active (which
			 *     is the case in ordinary application checkpoints
			 *     but not all internal cases); and
			 * (3) the first dirty update on the page is
			 *     sufficiently recent that the checkpoint
			 *     transaction would skip them.
			 */
			page = walk->page;
			mod = page->modify;
			if (__wt_page_is_modified(page) &&
			    (WT_PAGE_IS_INTERNAL(page) ||
			    !F_ISSET(txn, TXN_HAS_SNAPSHOT) ||
			    TXNID_LE(mod->first_dirty_txn, txn->snap_max))) {
				if (WT_PAGE_IS_INTERNAL(page)) {
					internal_bytes +=
					    page->memory_footprint;
					++internal_pages;
				} else {
					leaf_bytes += page->memory_footprint;
					++leaf_pages;
				}
				WT_ERR(__wt_reconcile(session, walk, NULL, 0));
			}
		}
		break;
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_ERR(__wt_epoch(session, &end));
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of internal\n\t"
		    "Took: %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_bytes, leaf_pages, internal_bytes, internal_pages,
		    WT_TIMEDIFF(end, start) / WT_MILLION));
	}

err:	/* On error, clear any left-over tree walk. */
	if (walk != NULL)
		WT_TRET(__wt_page_release(session, walk, flags));

	if (txn->isolation == TXN_ISO_READ_COMMITTED && session->ncursors == 0)
		__wt_txn_release_snapshot(session);

	if (btree->checkpointing) {
		/*
		 * Clear the checkpoint flag and push the change; not required,
		 * but publishing the change means stalled eviction gets moving
		 * as soon as possible.
		 */
		btree->checkpointing = 0;
		WT_FULL_BARRIER();

		/*
		 * If this tree was being skipped by the eviction server during
		 * the checkpoint, clear the wait.
		 */
		btree->evict_walk_period = 0;

		/*
		 * Wake the eviction server, in case application threads have
		 * stalled while the eviction server decided it couldn't make
		 * progress.  Without this, application threads will be stalled
		 * until the eviction server next wakes.
		 */
		WT_TRET(__wt_evict_server_wake(session));
	}

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES)
		WT_RET(btree->bm->sync(btree->bm, session, 1));

	return (ret);
}
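
The checkpointing flag above is set and cleared together with WT_FULL_BARRIER so that eviction threads observe the change as soon as possible. A minimal sketch of that publish/observe pattern, using C11 sequentially consistent atomics in place of the explicit full barrier (checkpointing here is a hypothetical standalone flag, not btree->checkpointing):

#include <stdatomic.h>
#include <stdbool.h>

/* Hypothetical flag standing in for btree->checkpointing. */
static atomic_bool checkpointing;

/*
 * checkpoint_publish --
 *	Publish a change to the flag.  The default sequentially consistent
 *	ordering of C11 atomics stands in for the explicit WT_FULL_BARRIER
 *	calls in the code above.
 */
static void
checkpoint_publish(bool value)
{
	atomic_store(&checkpointing, value);
}

/*
 * eviction_blocked --
 *	An eviction thread checks the flag before evicting a dirty page.
 */
static bool
eviction_blocked(void)
{
	return (atomic_load(&checkpointing));
}

As the comment in the cleanup path notes, publishing the cleared flag promptly is not required for correctness, but it lets stalled eviction resume as soon as possible.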