/*
 * __wt_async_op_enqueue --
 *     Enqueue an operation onto the work queue.
 */
int
__wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op)
{
    WT_ASYNC *async;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    uint64_t cur_head, cur_tail, my_alloc, my_slot;
#ifdef HAVE_DIAGNOSTIC
    WT_ASYNC_OP_IMPL *my_op;
#endif

    conn = S2C(session);
    async = conn->async;

    /*
     * If an application re-uses a WT_ASYNC_OP, we end up here with an
     * invalid object.
     */
    if (op->state != WT_ASYNCOP_READY)
        WT_RET_MSG(session, EINVAL,
            "application error: WT_ASYNC_OP already in use");

    /*
     * Enqueue op at the tail of the work queue.
     * We get our slot in the ring buffer to use.
     */
    my_alloc = WT_ATOMIC_ADD8(async->alloc_head, 1);
    my_slot = my_alloc % async->async_qsize;

    /*
     * Make sure we haven't wrapped around the queue.
     * If so, wait for the tail to advance off this slot.
     */
    WT_ORDERED_READ(cur_tail, async->tail_slot);
    while (cur_tail == my_slot) {
        __wt_yield();
        WT_ORDERED_READ(cur_tail, async->tail_slot);
    }

#ifdef HAVE_DIAGNOSTIC
    WT_ORDERED_READ(my_op, async->async_queue[my_slot]);
    if (my_op != NULL)
        return (__wt_panic(session));
#endif
    WT_PUBLISH(async->async_queue[my_slot], op);
    op->state = WT_ASYNCOP_ENQUEUED;
    if (WT_ATOMIC_ADD4(async->cur_queue, 1) > async->max_queue)
        WT_PUBLISH(async->max_queue, async->cur_queue);

    /*
     * Multiple threads may be adding ops to the queue.  We need to wait
     * our turn to make our slot visible to workers.
     */
    WT_ORDERED_READ(cur_head, async->head);
    while (cur_head != (my_alloc - 1)) {
        __wt_yield();
        WT_ORDERED_READ(cur_head, async->head);
    }
    WT_PUBLISH(async->head, my_alloc);
    return (ret);
}
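/*
 * Illustrative sketch (not WiredTiger code): the enqueue above is a
 * two-phase, latch-free protocol -- claim a unique slot with an atomic
 * increment, store the payload, then publish the head pointer strictly in
 * allocation order.  A minimal stand-alone version of the same idea, with
 * C11 atomics standing in for the WT_ATOMIC_*/WT_PUBLISH macros; the names
 * and the QSIZE value are invented for the example.
 */
#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>

#define QSIZE 64                            /* example ring size */

static void *_Atomic queue[QSIZE];          /* published slots */
static atomic_uint_fast64_t alloc_head;     /* slot allocator */
static atomic_uint_fast64_t head;           /* last slot made visible */
static atomic_uint_fast64_t tail_slot;      /* slot workers consumed up to */

static void
enqueue_sketch(void *op)
{
    uint64_t my_alloc, my_slot;

    /* Phase 1: claim a unique slot (fetch_add returns the old value). */
    my_alloc = atomic_fetch_add(&alloc_head, 1) + 1;
    my_slot = my_alloc % QSIZE;

    /* If the ring wrapped onto an unconsumed slot, wait for the tail. */
    while (atomic_load(&tail_slot) == my_slot)
        sched_yield();

    /* A release store plays the role of WT_PUBLISH. */
    atomic_store_explicit(&queue[my_slot], op, memory_order_release);

    /* Phase 2: publish the head strictly in allocation order. */
    while (atomic_load(&head) != my_alloc - 1)
        sched_yield();
    atomic_store_explicit(&head, my_alloc, memory_order_release);
}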
/*
 * __wt_txn_get_oldest --
 *     Update the current transaction's cached copy of the oldest snap_min
 *     value.
 */
void
__wt_txn_get_oldest(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *s;
    wt_txnid_t id, oldest_snap_min;
    uint32_t i, session_cnt;

    conn = S2C(session);
    txn = &session->txn;
    txn_global = &conn->txn_global;

    /* If nothing has changed since last time, we're done. */
    if (txn->last_oldest_gen == txn_global->gen)
        return;
    txn->last_oldest_gen = txn_global->gen;

    oldest_snap_min =
        (txn->id != WT_TXN_NONE) ? txn->id : txn_global->current;

    WT_ORDERED_READ(session_cnt, conn->session_cnt);
    for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
        if ((id = s->snap_min) != WT_TXN_NONE &&
            TXNID_LT(id, oldest_snap_min))
            oldest_snap_min = id;
    }

    txn->oldest_snap_min = oldest_snap_min;
}
/*
 * __wt_txn_get_snapshot --
 *     Set up a snapshot in the current transaction, without allocating an ID.
 */
void
__wt_txn_get_snapshot(
    WT_SESSION_IMPL *session, wt_txnid_t my_id, wt_txnid_t max_id, int force)
{
    WT_CONNECTION_IMPL *conn;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *s, *txn_state;
    wt_txnid_t current_id, id, oldest_snap_min;
    uint32_t i, n, session_cnt;

    conn = S2C(session);
    txn = &session->txn;
    txn_global = &conn->txn_global;
    txn_state = &txn_global->states[session->id];

    /* If nothing has changed since last time, we're done. */
    if (!force && txn->last_id == txn_global->current &&
        txn->last_gen == txn_global->gen) {
        txn_state->snap_min = txn->snap_min;
        return;
    }

    do {
        /* Take a copy of the current generation and transaction ID. */
        txn->last_gen = txn->last_oldest_gen = txn_global->gen;
        txn->last_id = oldest_snap_min = current_id = txn_global->current;

        /* Copy the array of concurrent transactions. */
        WT_ORDERED_READ(session_cnt, conn->session_cnt);
        for (i = n = 0, s = txn_global->states;
            i < session_cnt; i++, s++) {
            /* Ignore the session's own transaction. */
            if (i == session->id)
                continue;
            if ((id = s->snap_min) != WT_TXN_NONE)
                if (TXNID_LT(id, oldest_snap_min))
                    oldest_snap_min = id;
            if ((id = s->id) == WT_TXN_NONE)
                continue;
            else if (max_id == WT_TXN_NONE || TXNID_LT(id, max_id))
                txn->snapshot[n++] = id;
        }

        /*
         * Ensure the snapshot reads are complete before re-checking
         * the global current ID.
         */
        WT_READ_BARRIER();
    } while (current_id != txn_global->current);

    __txn_sort_snapshot(session, n,
        (max_id != WT_TXN_NONE) ? max_id : current_id, oldest_snap_min);
    txn_state->snap_min =
        (my_id == WT_TXN_NONE || TXNID_LT(txn->snap_min, my_id)) ?
        txn->snap_min : my_id;
}
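/*
 * Illustrative sketch (not WiredTiger code): the copy-then-recheck loop
 * above is an optimistic, lock-free read -- copy the shared state, issue a
 * read barrier, and retry if the global current ID moved during the copy.
 * Reduced to a pair of values guarded by a version counter, the same shape
 * looks like this hypothetical snippet (C11 atomics stand in for
 * WT_READ_BARRIER; a production seqlock would also need torn-read-safe
 * accesses to val_a/val_b and odd/even write marking).
 */
#include <stdatomic.h>
#include <stdint.h>

static atomic_uint_fast64_t version;    /* bumped by the single writer */
static uint64_t val_a, val_b;           /* the guarded shared values */

/* Read a consistent (a, b) pair without taking a lock. */
static void
read_consistent(uint64_t *ap, uint64_t *bp)
{
    uint64_t v;

    do {
        v = atomic_load_explicit(&version, memory_order_acquire);
        *ap = val_a;
        *bp = val_b;
        /* Order the copies above before re-checking the version. */
        atomic_thread_fence(memory_order_acquire);
    } while (v != atomic_load_explicit(&version, memory_order_acquire));
}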
/*
 * __split_oldest_gen --
 *     Calculate the oldest active split generation.
 */
static uint64_t
__split_oldest_gen(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_SESSION_IMPL *s;
    uint64_t gen, oldest;
    u_int i, session_cnt;

    conn = S2C(session);
    WT_ORDERED_READ(session_cnt, conn->session_cnt);
    for (i = 0, s = conn->sessions, oldest = conn->split_gen + 1;
        i < session_cnt; i++, s++)
        if (((gen = s->split_gen) != 0) && gen < oldest)
            oldest = gen;

    return (oldest);
}
/*
 * __wt_txn_update_oldest --
 *     Sweep the running transactions to update the oldest ID required.
 * !!!
 * If a data-source is calling the WT_EXTENSION_API.transaction_oldest
 * method (for the oldest transaction ID not yet visible to a running
 * transaction), and then comparing that oldest ID against committed
 * transactions to see if updates for a committed transaction are still
 * visible to running transactions, the oldest transaction ID may be
 * the same as the last committed transaction ID, if the transaction
 * state wasn't refreshed after the last transaction committed.  Push
 * past the last committed transaction.
 */
void
__wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
{
    WT_CONNECTION_IMPL *conn;
    WT_SESSION_IMPL *oldest_session;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *s;
    uint64_t current_id, id, last_running, oldest_id, prev_oldest_id;
    uint32_t i, session_cnt;
    int32_t count;
    int last_running_moved;

    conn = S2C(session);
    txn_global = &conn->txn_global;

    current_id = last_running = txn_global->current;
    oldest_session = NULL;
    prev_oldest_id = txn_global->oldest_id;

    /*
     * For pure read-only workloads, or if the update isn't forced and the
     * oldest ID isn't too far behind, avoid scanning.
     */
    if (prev_oldest_id == current_id ||
        (!force && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
        return;

    /*
     * We're going to scan.  Increment the count of scanners to prevent
     * the oldest ID from moving forwards.  Spin if the count is negative,
     * which indicates that some thread is moving the oldest ID forwards.
     */
    do {
        if ((count = txn_global->scan_count) < 0)
            WT_PAUSE();
    } while (count < 0 ||
        !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));

    /* The oldest ID cannot change until the scan count goes to zero. */
    prev_oldest_id = txn_global->oldest_id;
    current_id = oldest_id = last_running = txn_global->current;

    /* Walk the array of concurrent transactions. */
    WT_ORDERED_READ(session_cnt, conn->session_cnt);
    for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
        /*
         * Update the oldest ID.
         *
         * Ignore: IDs older than the oldest ID we saw.  This can happen
         * if we race with a thread that is allocating an ID -- the ID
         * will not be used because the thread will keep spinning until
         * it gets a valid one.
         */
        if ((id = s->id) != WT_TXN_NONE &&
            WT_TXNID_LE(prev_oldest_id, id) &&
            WT_TXNID_LT(id, last_running))
            last_running = id;

        /*
         * !!!
         * Note: Don't ignore snap_min values older than the previous
         * oldest ID.  Read-uncommitted operations publish snap_min
         * values without incrementing scan_count to protect the global
         * table.  See the comment in __wt_txn_cursor_op for more
         * details.
         */
        if ((id = s->snap_min) != WT_TXN_NONE &&
            WT_TXNID_LT(id, oldest_id)) {
            oldest_id = id;
            oldest_session = &conn->sessions[i];
        }
    }

    if (WT_TXNID_LT(last_running, oldest_id))
        oldest_id = last_running;

    /* The oldest ID can't move past any named snapshots. */
    if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE &&
        WT_TXNID_LT(id, oldest_id))
        oldest_id = id;

    /* Update the last running ID. */
    last_running_moved =
        WT_TXNID_LT(txn_global->last_running, last_running);

    /* Update the oldest ID. */
    if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) &&
        WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) {
        WT_ORDERED_READ(session_cnt, conn->session_cnt);
        for (i = 0, s = txn_global->states;
            i < session_cnt; i++, s++) {
            if ((id = s->id) != WT_TXN_NONE &&
                WT_TXNID_LT(id, last_running))
                last_running = id;
            if ((id = s->snap_min) != WT_TXN_NONE &&
                WT_TXNID_LT(id, oldest_id))
                oldest_id = id;
        }
        if (WT_TXNID_LT(last_running, oldest_id))
            oldest_id = last_running;
#ifdef HAVE_DIAGNOSTIC
        /*
         * Make sure the ID doesn't move past any named snapshots.
         *
         * Don't include the read/assignment in the assert statement.
         * Coverity complains if there are assignments only done in
         * diagnostic builds, and when the read is from a volatile.
         */
        id = txn_global->nsnap_oldest_id;
        WT_ASSERT(session,
            id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
#endif
        if (WT_TXNID_LT(txn_global->last_running, last_running))
            txn_global->last_running = last_running;
        if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
            txn_global->oldest_id = oldest_id;
        WT_ASSERT(session, txn_global->scan_count == -1);
        txn_global->scan_count = 0;
    } else {
        if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
            current_id - oldest_id > 10000 && last_running_moved &&
            oldest_session != NULL) {
            (void)__wt_verbose(session, WT_VERB_TRANSACTION,
                "old snapshot %" PRIu64
                " pinned in session %" PRIu32 " [%s]"
                " with snap_min %" PRIu64 "\n",
                oldest_id, oldest_session->id,
                oldest_session->lastop,
                oldest_session->txn.snap_min);
        }
        WT_ASSERT(session, txn_global->scan_count > 0);
        (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
    }
}
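/*
 * Illustrative sketch (not WiredTiger code): scan_count acts as a tiny
 * reader/writer latch -- scanners atomically move it from N to N+1 while it
 * is non-negative, and the thread that wants to advance the oldest ID takes
 * it exclusively by swinging it from 1 to -1.  A minimal stand-alone version
 * of that protocol, with C11 atomics in place of WT_ATOMIC_CAS4/WT_PAUSE;
 * all names here are invented for the example.
 */
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int_fast32_t scan_count;  /* >0: readers, -1: exclusive */

/* Enter as a scanner: spin while an exclusive holder (-1) is active. */
static void
scan_enter(void)
{
    int_fast32_t count;

    do {
        while ((count = atomic_load(&scan_count)) < 0)
            sched_yield();
    } while (!atomic_compare_exchange_weak(&scan_count, &count, count + 1));
}

/* Leave as a scanner. */
static void
scan_leave(void)
{
    atomic_fetch_sub(&scan_count, 1);
}

/* Try to upgrade: succeeds only if we are the sole scanner. */
static bool
scan_try_exclusive(void)
{
    int_fast32_t one = 1;

    return (atomic_compare_exchange_strong(&scan_count, &one, -1));
}

/* Drop exclusive access. */
static void
scan_exclusive_leave(void)
{
    atomic_store(&scan_count, 0);
}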
/*
 * __wt_txn_get_snapshot --
 *     Allocate a snapshot.
 */
void
__wt_txn_get_snapshot(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *s, *txn_state;
    uint64_t current_id, id;
    uint64_t prev_oldest_id, snap_min;
    uint32_t i, n, session_cnt;
    int32_t count;

    conn = S2C(session);
    txn = &session->txn;
    txn_global = &conn->txn_global;
    txn_state = WT_SESSION_TXN_STATE(session);

    /*
     * We're going to scan.  Increment the count of scanners to prevent
     * the oldest ID from moving forwards.  Spin if the count is negative,
     * which indicates that some thread is moving the oldest ID forwards.
     */
    do {
        if ((count = txn_global->scan_count) < 0)
            WT_PAUSE();
    } while (count < 0 ||
        !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));

    current_id = snap_min = txn_global->current;
    prev_oldest_id = txn_global->oldest_id;

    /* For pure read-only workloads, avoid scanning. */
    if (prev_oldest_id == current_id) {
        txn_state->snap_min = current_id;
        __txn_sort_snapshot(session, 0, current_id);

        /* Check that the oldest ID has not moved in the meantime. */
        if (prev_oldest_id == txn_global->oldest_id) {
            WT_ASSERT(session, txn_global->scan_count > 0);
            (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
            return;
        }
    }

    /* Walk the array of concurrent transactions. */
    WT_ORDERED_READ(session_cnt, conn->session_cnt);
    for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) {
        /*
         * Build our snapshot of any concurrent transaction IDs.
         *
         * Ignore:
         *  - Our own ID: we always read our own updates.
         *  - The ID if it is older than the oldest ID we saw.  This
         *    can happen if we race with a thread that is allocating
         *    an ID -- the ID will not be used because the thread will
         *    keep spinning until it gets a valid one.
         */
        if (s != txn_state &&
            (id = s->id) != WT_TXN_NONE &&
            WT_TXNID_LE(prev_oldest_id, id)) {
            txn->snapshot[n++] = id;
            if (WT_TXNID_LT(id, snap_min))
                snap_min = id;
        }
    }

    /*
     * If we got a new snapshot, update the published snap_min for this
     * session.
     */
    WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, snap_min));
    WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
    txn_state->snap_min = snap_min;

    WT_ASSERT(session, txn_global->scan_count > 0);
    (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);

    __txn_sort_snapshot(session, n, current_id);
}
/*
 * __wt_btcur_prev --
 *     Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int discard)
{
    WT_DECL_RET;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    uint32_t flags;
    int newpage;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    WT_DSTAT_INCR(session, cursor_prev);

    flags = WT_TREE_SKIP_INTL | WT_TREE_PREV;   /* Tree walk flags. */
    if (discard)
        LF_SET(WT_TREE_DISCARD);

retry:  WT_RET(__cursor_func_init(cbt, 0));
    __cursor_position_clear(cbt);

    /*
     * If we aren't already iterating in the right direction, there's
     * some setup to do.
     */
    if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
        __wt_btcur_iterate_setup(cbt, 0);

    /*
     * If this is a modification, we're about to read information from
     * the page: save the write generation.
     */
    page = cbt->page;
    if (discard && page != NULL) {
        WT_ERR(__wt_page_modify_init(session, page));
        WT_ORDERED_READ(cbt->write_gen, page->modify->write_gen);
    }

    /*
     * Walk any page we're holding until the underlying call returns
     * not-found.  Then, move to the previous page, until we reach the
     * start of the file.
     */
    for (newpage = 0;; newpage = 1) {
        if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_append_prev(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_append_prev(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret == 0)
                break;
            F_CLR(cbt, WT_CBT_ITERATE_APPEND);
            if (ret != WT_NOTFOUND)
                break;
            newpage = 1;
        }
        if (page != NULL) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_prev(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_prev(cbt, newpage);
                break;
            case WT_PAGE_ROW_LEAF:
                ret = __cursor_row_prev(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret != WT_NOTFOUND)
                break;
        }

        cbt->page = NULL;
        WT_ERR(__wt_tree_walk(session, &page, flags));
        WT_ERR_TEST(page == NULL, WT_NOTFOUND);
        WT_ASSERT(session,
            page->type != WT_PAGE_COL_INT &&
            page->type != WT_PAGE_ROW_INT);
        cbt->page = page;

        /* Initialize the page's modification information. */
        if (discard) {
            WT_ERR(__wt_page_modify_init(session, page));
            WT_ORDERED_READ(
                cbt->write_gen, page->modify->write_gen);
        }

        /*
         * The last page in a column-store has appended entries.
         * We handle it separately from the usual cursor code:
         * it's only that one page and it's in a simple format.
         */
        if (page->type != WT_PAGE_ROW_LEAF &&
            (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
            F_SET(cbt, WT_CBT_ITERATE_APPEND);
    }

err:    if (ret == WT_RESTART)
        goto retry;
    WT_TRET(__cursor_func_resolve(cbt, ret));

    return (ret);
}
/*
 * __txn_oldest_scan --
 *     Sweep the running transactions to calculate the oldest ID required.
 */
static void
__txn_oldest_scan(WT_SESSION_IMPL *session,
    uint64_t *oldest_idp, uint64_t *last_runningp,
    WT_SESSION_IMPL **oldest_sessionp)
{
    WT_CONNECTION_IMPL *conn;
    WT_SESSION_IMPL *oldest_session;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *s;
    uint64_t id, last_running, oldest_id, prev_oldest_id;
    uint32_t i, session_cnt;

    conn = S2C(session);
    txn_global = &conn->txn_global;
    oldest_session = NULL;

    /* The oldest ID cannot change while we are holding the scan lock. */
    prev_oldest_id = txn_global->oldest_id;
    oldest_id = last_running = txn_global->current;

    /* Walk the array of concurrent transactions. */
    WT_ORDERED_READ(session_cnt, conn->session_cnt);
    for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
        /*
         * Update the oldest ID.
         *
         * Ignore: IDs older than the oldest ID we saw.  This can happen
         * if we race with a thread that is allocating an ID -- the ID
         * will not be used because the thread will keep spinning until
         * it gets a valid one.
         */
        if ((id = s->id) != WT_TXN_NONE &&
            WT_TXNID_LE(prev_oldest_id, id) &&
            WT_TXNID_LT(id, last_running))
            last_running = id;

        /*
         * !!!
         * Note: Don't ignore snap_min values older than the previous
         * oldest ID.  Read-uncommitted operations publish snap_min
         * values without acquiring the scan lock to protect the global
         * table.  See the comment in __wt_txn_cursor_op for more
         * details.
         */
        if ((id = s->snap_min) != WT_TXN_NONE &&
            WT_TXNID_LT(id, oldest_id)) {
            oldest_id = id;
            oldest_session = &conn->sessions[i];
        }
    }

    if (WT_TXNID_LT(last_running, oldest_id))
        oldest_id = last_running;

    /* The oldest ID can't move past any named snapshots. */
    if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE &&
        WT_TXNID_LT(id, oldest_id))
        oldest_id = id;

    *oldest_idp = oldest_id;
    *oldest_sessionp = oldest_session;
    *last_runningp = last_running;
}
/*
 * __wt_txn_get_snapshot --
 *     Allocate a snapshot.
 */
int
__wt_txn_get_snapshot(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *s, *txn_state;
    uint64_t current_id, id;
    uint64_t prev_oldest_id, snap_min;
    uint32_t i, n, session_cnt;

    conn = S2C(session);
    txn = &session->txn;
    txn_global = &conn->txn_global;
    txn_state = WT_SESSION_TXN_STATE(session);
    n = 0;

    /*
     * Spin waiting for the lock: the sleeps in our blocking readlock
     * implementation are too slow for scanning the transaction table.
     */
    while ((ret =
        __wt_try_readlock(session, txn_global->scan_rwlock)) == EBUSY)
        WT_PAUSE();
    WT_RET(ret);

    current_id = snap_min = txn_global->current;
    prev_oldest_id = txn_global->oldest_id;

    /*
     * Include the checkpoint transaction, if one is running: we should
     * ignore any uncommitted changes the checkpoint has written to the
     * metadata.  We don't have to keep the checkpoint's changes pinned,
     * so we don't include it in the published snap_min.
     */
    if ((id = txn_global->checkpoint_txnid) != WT_TXN_NONE)
        txn->snapshot[n++] = id;

    /* For pure read-only workloads, avoid scanning. */
    if (prev_oldest_id == current_id) {
        txn_state->snap_min = current_id;
        /* Check that the oldest ID has not moved in the meantime. */
        WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
        goto done;
    }

    /* Walk the array of concurrent transactions. */
    WT_ORDERED_READ(session_cnt, conn->session_cnt);
    for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
        /*
         * Build our snapshot of any concurrent transaction IDs.
         *
         * Ignore:
         *  - Our own ID: we always read our own updates.
         *  - The ID if it is older than the oldest ID we saw.  This
         *    can happen if we race with a thread that is allocating
         *    an ID -- the ID will not be used because the thread will
         *    keep spinning until it gets a valid one.
         */
        if (s != txn_state &&
            (id = s->id) != WT_TXN_NONE &&
            WT_TXNID_LE(prev_oldest_id, id)) {
            txn->snapshot[n++] = id;
            if (WT_TXNID_LT(id, snap_min))
                snap_min = id;
        }
    }

    /*
     * If we got a new snapshot, update the published snap_min for this
     * session.
     */
    WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, snap_min));
    WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
    txn_state->snap_min = snap_min;

done:   __wt_readunlock(session, txn_global->scan_rwlock);
    __txn_sort_snapshot(session, n, current_id);
    return (0);
}
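/*
 * Illustrative sketch (not WiredTiger code): the acquisition above spins on
 * a try-lock rather than blocking, on the theory that the transaction-table
 * scan is short and a sleeping readlock costs more than it saves.  With a
 * POSIX rwlock, the same pattern might be written as below; EBUSY from the
 * try call means "spin and retry".  The lock name is invented.
 */
#include <errno.h>
#include <pthread.h>
#include <sched.h>

static pthread_rwlock_t scan_rwlock = PTHREAD_RWLOCK_INITIALIZER;

/* Acquire the scan lock in read mode, spinning instead of blocking. */
static int
scan_readlock_spin(void)
{
    int ret;

    while ((ret = pthread_rwlock_tryrdlock(&scan_rwlock)) == EBUSY)
        sched_yield();
    return (ret);   /* 0 on success, another errno value on error */
}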
/*
 * __wt_txn_refresh --
 *     Allocate a transaction ID and/or a snapshot.
 */
void
__wt_txn_refresh(WT_SESSION_IMPL *session, uint64_t max_id, int get_snapshot)
{
    WT_CONNECTION_IMPL *conn;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *s, *txn_state;
    uint64_t current_id, id, snap_min, oldest_id, prev_oldest_id;
    uint32_t i, n, session_cnt;
    int32_t count;

    conn = S2C(session);
    txn = &session->txn;
    txn_global = &conn->txn_global;
    txn_state = &txn_global->states[session->id];

    prev_oldest_id = txn_global->oldest_id;
    current_id = snap_min = txn_global->current;

    /* For pure read-only workloads, avoid updates to shared state. */
    if (!get_snapshot) {
        /*
         * If we are trying to update the oldest ID and it is already
         * equal to the current ID, there is no point scanning.
         */
        if (prev_oldest_id == current_id)
            return;
    } else if (txn->id == max_id &&
        txn->snapshot_count == 0 &&
        txn->snap_min == snap_min &&
        TXNID_LE(prev_oldest_id, snap_min)) {
        txn_state->snap_min = txn->snap_min;
        /* If nothing has changed in the meantime, we're done. */
        if (txn_global->scan_count == 0 &&
            txn_global->oldest_id == prev_oldest_id)
            return;
    }

    /*
     * We're going to scan.  Increment the count of scanners to prevent
     * the oldest ID from moving forwards.  Spin if the count is negative,
     * which indicates that some thread is moving the oldest ID forwards.
     */
    do {
        if ((count = txn_global->scan_count) < 0)
            WT_PAUSE();
    } while (count < 0 ||
        !WT_ATOMIC_CAS(txn_global->scan_count, count, count + 1));

    /* The oldest ID cannot change until the scan count goes to zero. */
    prev_oldest_id = txn_global->oldest_id;
    current_id = snap_min = txn_global->current;

    /* If the maximum ID is constrained, so is the oldest. */
    oldest_id = (max_id != WT_TXN_NONE) ? max_id : snap_min;

    /* Walk the array of concurrent transactions. */
    WT_ORDERED_READ(session_cnt, conn->session_cnt);
    for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) {
        /*
         * Ignore the ID if we are committing (indicated by max_id
         * being set): it is about to be released.
         *
         * Also ignore the ID if it is older than the oldest ID we saw.
         * This can happen if we race with a thread that is allocating
         * an ID -- the ID will not be used because the thread will
         * keep spinning until it gets a valid one.
         */
        if ((id = s->id) != WT_TXN_NONE && id + 1 != max_id &&
            TXNID_LE(prev_oldest_id, id)) {
            if (get_snapshot)
                txn->snapshot[n++] = id;
            if (TXNID_LT(id, snap_min))
                snap_min = id;
        }

        /*
         * Ignore the session's own snap_min if we are in the process
         * of updating it.
         */
        if (get_snapshot && s == txn_state)
            continue;

        /*
         * !!!
         * Note: Don't ignore snap_min values older than the previous
         * oldest ID.  Read-uncommitted operations publish snap_min
         * values without incrementing scan_count to protect the global
         * table.  See the comment in __wt_txn_cursor_op for more
         * details.
         */
        if ((id = s->snap_min) != WT_TXN_NONE &&
            TXNID_LT(id, oldest_id))
            oldest_id = id;
    }

    if (TXNID_LT(snap_min, oldest_id))
        oldest_id = snap_min;

    if (get_snapshot) {
        WT_ASSERT(session, TXNID_LE(prev_oldest_id, snap_min));
        WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
        txn_state->snap_min = snap_min;
    }

    /*
     * Update the last running ID if we have a much newer value or we are
     * forcing an update.
     */
    if (!get_snapshot || snap_min > txn_global->last_running + 100)
        txn_global->last_running = snap_min;

    /*
     * Update the oldest ID if we have a newer ID and we can get exclusive
     * access.  During normal snapshot refresh, only do this if we have a
     * much newer value.  Once we get exclusive access, do another pass to
     * make sure nobody else is using an earlier ID.
     */
    if (max_id == WT_TXN_NONE &&
        TXNID_LT(prev_oldest_id, oldest_id) &&
        (!get_snapshot || oldest_id - prev_oldest_id > 100) &&
        WT_ATOMIC_CAS(txn_global->scan_count, 1, -1)) {
        WT_ORDERED_READ(session_cnt, conn->session_cnt);
        for (i = 0, s = txn_global->states;
            i < session_cnt; i++, s++) {
            if ((id = s->id) != WT_TXN_NONE &&
                TXNID_LT(id, oldest_id))
                oldest_id = id;
            if ((id = s->snap_min) != WT_TXN_NONE &&
                TXNID_LT(id, oldest_id))
                oldest_id = id;
        }
        if (TXNID_LT(txn_global->oldest_id, oldest_id))
            txn_global->oldest_id = oldest_id;
        txn_global->scan_count = 0;
    } else {
        WT_ASSERT(session, txn_global->scan_count > 0);
        (void)WT_ATOMIC_SUB(txn_global->scan_count, 1);
    }

    if (get_snapshot)
        __txn_sort_snapshot(session, n, current_id);
}
/*
 * __txn_oldest_scan --
 *     Sweep the running transactions to calculate the oldest ID required.
 */
static void
__txn_oldest_scan(WT_SESSION_IMPL *session,
    uint64_t *oldest_idp, uint64_t *last_runningp,
    uint64_t *metadata_pinnedp, WT_SESSION_IMPL **oldest_sessionp)
{
    WT_CONNECTION_IMPL *conn;
    WT_SESSION_IMPL *oldest_session;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *s;
    uint64_t id, last_running, metadata_pinned, oldest_id, prev_oldest_id;
    uint32_t i, session_cnt;

    conn = S2C(session);
    txn_global = &conn->txn_global;
    oldest_session = NULL;

    /* The oldest ID cannot change while we are holding the scan lock. */
    prev_oldest_id = txn_global->oldest_id;
    last_running = oldest_id = txn_global->current;
    if ((metadata_pinned = txn_global->checkpoint_state.id) == WT_TXN_NONE)
        metadata_pinned = oldest_id;

    /* Walk the array of concurrent transactions. */
    WT_ORDERED_READ(session_cnt, conn->session_cnt);
    for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
        /* Update the last running transaction ID. */
        if ((id = s->id) != WT_TXN_NONE &&
            WT_TXNID_LE(prev_oldest_id, id) &&
            WT_TXNID_LT(id, last_running))
            last_running = id;

        /* Update the metadata pinned ID. */
        if ((id = s->metadata_pinned) != WT_TXN_NONE &&
            WT_TXNID_LT(id, metadata_pinned))
            metadata_pinned = id;

        /*
         * !!!
         * Note: Don't ignore pinned ID values older than the previous
         * oldest ID.  Read-uncommitted operations publish pinned ID
         * values without acquiring the scan lock to protect the global
         * table.  See the comment in __wt_txn_cursor_op for more
         * details.
         */
        if ((id = s->pinned_id) != WT_TXN_NONE &&
            WT_TXNID_LT(id, oldest_id)) {
            oldest_id = id;
            oldest_session = &conn->sessions[i];
        }
    }

    if (WT_TXNID_LT(last_running, oldest_id))
        oldest_id = last_running;

    /* The oldest ID can't move past any named snapshots. */
    if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE &&
        WT_TXNID_LT(id, oldest_id))
        oldest_id = id;

    /* The metadata pinned ID can't move past the oldest ID. */
    if (WT_TXNID_LT(oldest_id, metadata_pinned))
        metadata_pinned = oldest_id;

    *last_runningp = last_running;
    *metadata_pinnedp = metadata_pinned;
    *oldest_idp = oldest_id;
    *oldest_sessionp = oldest_session;
}
/*
 * __wt_verbose_dump_txn --
 *     Output diagnostic information about the global transaction state.
 */
int
__wt_verbose_dump_txn(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_SESSION_IMPL *sess;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *s;
    uint64_t id;
    uint32_t i, session_cnt;
#ifdef HAVE_TIMESTAMPS
    char hex_timestamp[3][2 * WT_TIMESTAMP_SIZE + 1];
#endif

    conn = S2C(session);
    txn_global = &conn->txn_global;

    WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
    WT_RET(__wt_msg(session, "transaction state dump"));

    WT_RET(__wt_msg(session, "current ID: %" PRIu64, txn_global->current));
    WT_RET(__wt_msg(session,
        "last running ID: %" PRIu64, txn_global->last_running));
    WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id));

#ifdef HAVE_TIMESTAMPS
    WT_RET(__wt_timestamp_to_hex_string(
        session, hex_timestamp[0], &txn_global->commit_timestamp));
    WT_RET(__wt_msg(session, "commit timestamp: %s", hex_timestamp[0]));
    WT_RET(__wt_timestamp_to_hex_string(
        session, hex_timestamp[0], &txn_global->oldest_timestamp));
    WT_RET(__wt_msg(session, "oldest timestamp: %s", hex_timestamp[0]));
    WT_RET(__wt_timestamp_to_hex_string(
        session, hex_timestamp[0], &txn_global->pinned_timestamp));
    WT_RET(__wt_msg(session, "pinned timestamp: %s", hex_timestamp[0]));
    WT_RET(__wt_timestamp_to_hex_string(
        session, hex_timestamp[0], &txn_global->stable_timestamp));
    WT_RET(__wt_msg(session, "stable timestamp: %s", hex_timestamp[0]));
    WT_RET(__wt_msg(session, "has_commit_timestamp: %s",
        txn_global->has_commit_timestamp ? "yes" : "no"));
    WT_RET(__wt_msg(session, "has_oldest_timestamp: %s",
        txn_global->has_oldest_timestamp ? "yes" : "no"));
    WT_RET(__wt_msg(session, "has_pinned_timestamp: %s",
        txn_global->has_pinned_timestamp ? "yes" : "no"));
    WT_RET(__wt_msg(session, "has_stable_timestamp: %s",
        txn_global->has_stable_timestamp ? "yes" : "no"));
    WT_RET(__wt_msg(session, "oldest_is_pinned: %s",
        txn_global->oldest_is_pinned ? "yes" : "no"));
    WT_RET(__wt_msg(session, "stable_is_pinned: %s",
        txn_global->stable_is_pinned ? "yes" : "no"));
#endif

    WT_RET(__wt_msg(session, "checkpoint running: %s",
        txn_global->checkpoint_running ? "yes" : "no"));
    WT_RET(__wt_msg(session, "checkpoint generation: %" PRIu64,
        __wt_gen(session, WT_GEN_CHECKPOINT)));
    WT_RET(__wt_msg(session, "checkpoint pinned ID: %" PRIu64,
        txn_global->checkpoint_state.pinned_id));
    WT_RET(__wt_msg(session, "checkpoint txn ID: %" PRIu64,
        txn_global->checkpoint_state.id));
    WT_RET(__wt_msg(session, "oldest named snapshot ID: %" PRIu64,
        txn_global->nsnap_oldest_id));

    WT_ORDERED_READ(session_cnt, conn->session_cnt);
    WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt));
    WT_RET(__wt_msg(session, "Transaction state of active sessions:"));

    /*
     * Walk each session's transaction state and dump information.
     * Accessing the content of session handles is not thread safe, so
     * some information may change while traversing if other threads are
     * active at the same time, which is OK since this is diagnostic code.
     */
    for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
        /* Skip sessions with no active transaction. */
        if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE)
            continue;

        sess = &conn->sessions[i];
        WT_RET(__wt_msg(session,
            "ID: %" PRIu64 ", pinned ID: %" PRIu64
            ", metadata pinned ID: %" PRIu64 ", name: %s",
            id, s->pinned_id, s->metadata_pinned,
            sess->name == NULL ? "EMPTY" : sess->name));
        WT_RET(__wt_verbose_dump_txn_one(sess, &sess->txn));
    }

    return (0);
}
/*
 * __wt_txn_begin --
 *     Begin a transaction.
 */
int
__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *s, *txn_state;
    wt_txnid_t id, oldest_snap_min;
    uint32_t i, n, session_cnt;

    conn = S2C(session);
    txn = &session->txn;
    txn_global = &conn->txn_global;
    txn_state = &txn_global->states[session->id];

    WT_ASSERT(session, txn_state->id == WT_TXN_NONE);

    WT_RET(__wt_config_gets_defno(session, cfg, "isolation", &cval));
    if (cval.len == 0)
        txn->isolation = session->isolation;
    else
        txn->isolation =
            WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
            TXN_ISO_SNAPSHOT :
            WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
            TXN_ISO_READ_COMMITTED : TXN_ISO_READ_UNCOMMITTED;

    F_SET(txn, TXN_RUNNING);

    do {
        /*
         * Allocate a transaction ID.
         *
         * We use an atomic increment to ensure that we get a unique
         * ID, then publish that to the global state table.
         *
         * If two threads race to allocate an ID, only the latest ID
         * will proceed.  The winning thread can be sure its snapshot
         * contains all of the earlier active IDs.  Threads that race
         * and get an earlier ID may not appear in the snapshot, but
         * they will loop and allocate a new ID before proceeding to
         * make any updates.
         *
         * This potentially wastes transaction IDs when threads race to
         * begin transactions, but that is the price we pay to keep
         * this path latch free.
         */
        do {
            txn->id = WT_ATOMIC_ADD(txn_global->current, 1);
        } while (txn->id == WT_TXN_NONE || txn->id == WT_TXN_ABORTED);
        WT_PUBLISH(txn_state->id, txn->id);

        /*
         * If we are starting a snapshot isolation transaction, get
         * a snapshot of the running transactions.
         *
         * If we already have a snapshot (e.g., for an auto-commit
         * operation), update it so that the newly-allocated ID is
         * visible.
         */
        if (txn->isolation == TXN_ISO_SNAPSHOT) {
            txn->last_gen = txn->last_oldest_gen = txn_global->gen;
            oldest_snap_min = txn->id;

            /* Copy the array of concurrent transactions. */
            WT_ORDERED_READ(session_cnt, conn->session_cnt);
            for (i = n = 0, s = txn_global->states;
                i < session_cnt; i++, s++) {
                if ((id = s->snap_min) != WT_TXN_NONE)
                    if (TXNID_LT(id, oldest_snap_min))
                        oldest_snap_min = id;
                if ((id = s->id) == WT_TXN_NONE)
                    continue;
                else
                    txn->snapshot[n++] = id;
            }
            __txn_sort_snapshot(
                session, n, txn->id, oldest_snap_min);
            txn_state->snap_min = txn->snap_min;
        }

        /*
         * Ensure the snapshot reads are complete before re-checking
         * the global current ID.
         */
        WT_READ_BARRIER();
    } while (txn->id != txn_global->current);

    return (0);
}
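/*
 * Illustrative sketch (not WiredTiger code): the inner allocation loop above
 * burns IDs that collide with the reserved WT_TXN_NONE / WT_TXN_ABORTED
 * values rather than serializing allocators behind a lock.  A stand-alone
 * version with C11 atomics; the reserved values here are invented for the
 * example.
 */
#include <stdatomic.h>
#include <stdint.h>

#define ID_NONE     0                   /* hypothetical reserved IDs */
#define ID_ABORTED  UINT64_MAX

static atomic_uint_fast64_t current_id; /* global allocator */

/* Allocate a unique, non-reserved transaction ID without locking. */
static uint64_t
alloc_txn_id(void)
{
    uint64_t id;

    /* fetch_add returns the previous value; +1 mirrors WT_ATOMIC_ADD. */
    do
        id = atomic_fetch_add(&current_id, 1) + 1;
    while (id == ID_NONE || id == ID_ABORTED);
    return (id);
}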