/*
 * __wt_txn_init --
 *	Initialize a session's transaction data.
 */
int
__wt_txn_init(WT_SESSION_IMPL *session)
{
	WT_TXN *txn;

	txn = &session->txn;
	txn->id = WT_TXN_NONE;

	WT_RET(__wt_calloc_def(
	    session, S2C(session)->session_size, &txn->snapshot));

#ifdef HAVE_DIAGNOSTIC
	if (S2C(session)->txn_global.states != NULL) {
		WT_TXN_STATE *txn_state;
		txn_state = WT_SESSION_TXN_STATE(session);
		WT_ASSERT(session, txn_state->snap_min == WT_TXN_NONE);
	}
#endif

	/*
	 * Take care to clean these out in case we are reusing the transaction
	 * for eviction.
	 */
	txn->mod = NULL;

	txn->isolation = session->isolation;
	return (0);
}
/*
 * __wt_txn_release --
 *	Release the resources associated with the current transaction.
 */
void
__wt_txn_release(WT_SESSION_IMPL *session)
{
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *txn_state;

	txn = &session->txn;
	txn_global = &S2C(session)->txn_global;
	txn_state = WT_SESSION_TXN_STATE(session);

	WT_ASSERT(session, txn->mod_count == 0);
	txn->notify = NULL;

	/* Clear the transaction's ID from the global table. */
	if (WT_SESSION_IS_CHECKPOINT(session)) {
		WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
		txn->id = txn_global->checkpoint_state.id =
		    txn_global->checkpoint_state.pinned_id = WT_TXN_NONE;

		/*
		 * Be extra careful to cleanup everything for checkpoints: once
		 * the global checkpoint ID is cleared, we can no longer tell
		 * if this session is doing a checkpoint.
		 */
		txn_global->checkpoint_id = 0;
	} else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
		WT_ASSERT(session,
		    !WT_TXNID_LT(txn->id, txn_global->last_running));

		WT_ASSERT(session,
		    txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE);
		WT_PUBLISH(txn_state->id, WT_TXN_NONE);

		txn->id = WT_TXN_NONE;
	}

	__wt_txn_clear_commit_timestamp(session);
	__wt_txn_clear_read_timestamp(session);

	/* Free the scratch buffer allocated for logging. */
	__wt_logrec_free(session, &txn->logrec);

	/* Discard any memory from the session's stash that we can. */
	WT_ASSERT(session, __wt_session_gen(session, WT_GEN_SPLIT) == 0);
	__wt_stash_discard(session);

	/*
	 * Reset the transaction state to not running and release the snapshot.
	 */
	__wt_txn_release_snapshot(session);
	txn->isolation = session->isolation;

	/* Ensure the transaction flags are cleared on exit */
	txn->flags = 0;
}
/*
 * __wt_txn_release --
 *	Release the resources associated with the current transaction.
 */
void
__wt_txn_release(WT_SESSION_IMPL *session)
{
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *txn_state;

	txn = &session->txn;
	WT_ASSERT(session, txn->mod_count == 0);
	txn->notify = NULL;

	txn_global = &S2C(session)->txn_global;
	txn_state = WT_SESSION_TXN_STATE(session);

	/* Clear the transaction's ID from the global table. */
	if (WT_SESSION_IS_CHECKPOINT(session)) {
		WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
		txn->id = WT_TXN_NONE;

		/* Clear the global checkpoint transaction IDs. */
		txn_global->checkpoint_id = 0;
		txn_global->checkpoint_pinned = WT_TXN_NONE;
	} else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
		WT_ASSERT(session,
		    !WT_TXNID_LT(txn->id, txn_global->last_running));

		WT_ASSERT(session,
		    txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE);
		WT_PUBLISH(txn_state->id, WT_TXN_NONE);

		txn->id = WT_TXN_NONE;
	}

	/* Free the scratch buffer allocated for logging. */
	__wt_logrec_free(session, &txn->logrec);

	/* Discard any memory from the session's split stash that we can. */
	WT_ASSERT(session, session->split_gen == 0);
	if (session->split_stash_cnt > 0)
		__wt_split_stash_discard(session);

	/*
	 * Reset the transaction state to not running and release the snapshot.
	 */
	__wt_txn_release_snapshot(session);
	txn->isolation = session->isolation;

	/* Ensure the transaction flags are cleared on exit */
	txn->flags = 0;
}
/*
 * __wt_txn_release_snapshot --
 *	Release the snapshot in the current transaction.
 */
void
__wt_txn_release_snapshot(WT_SESSION_IMPL *session)
{
	WT_TXN *txn;
	WT_TXN_STATE *txn_state;

	txn = &session->txn;
	txn_state = WT_SESSION_TXN_STATE(session);

	WT_ASSERT(session,
	    txn_state->snap_min == WT_TXN_NONE ||
	    session->txn.isolation == WT_ISO_READ_UNCOMMITTED ||
	    !__wt_txn_visible_all(session, txn_state->snap_min));

	txn_state->snap_min = WT_TXN_NONE;
	F_CLR(txn, WT_TXN_HAS_SNAPSHOT);
}
/*
 * __wt_session_fotxn_add --
 *	Add a new entry into the session's free-on-transaction generation
 *	list.
 */
int
__wt_session_fotxn_add(WT_SESSION_IMPL *session, void *p, size_t len)
{
	WT_FOTXN *fotxn;
	size_t i;

	/*
	 * Make sure the current thread has a transaction pinned so that
	 * we don't immediately free the memory we are stashing.
	 */
	WT_ASSERT(session,
	    WT_SESSION_TXN_STATE(session)->snap_min != WT_TXN_NONE);

	/* Grow the list as necessary. */
	WT_RET(__wt_realloc_def(session, &session->fotxn_size,
	    session->fotxn_cnt + 1, &session->fotxn));

	/* Find an empty slot. */
	for (i = 0, fotxn = session->fotxn;
	    i < session->fotxn_size / sizeof(session->fotxn[0]); ++i, ++fotxn)
		if (fotxn->p == NULL) {
			fotxn->txnid = S2C(session)->txn_global.current + 1;
			WT_ASSERT(session,
			    !__wt_txn_visible_all(session, fotxn->txnid));
			fotxn->p = p;
			fotxn->len = len;
			break;
		}
	++session->fotxn_cnt;

	/* See if we can free any previous entries. */
	if (session->fotxn_cnt > 1)
		__wt_session_fotxn_discard(session, session, 0);

	return (0);
}
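/*
 * __wt_session_fotxn_add relies on __wt_session_fotxn_discard() to reclaim
 * stashed memory once its associated transaction ID becomes visible to all
 * running transactions. That function is not part of this excerpt; the
 * following is a minimal sketch of the reclamation loop it could run, built
 * only from the WT_FOTXN fields used above. It is an illustration, not the
 * actual WiredTiger implementation: the real function takes additional
 * arguments (it is called as __wt_session_fotxn_discard(session, session, 0)
 * above) that this sketch omits.
 */
static void
__session_fotxn_discard_sketch(WT_SESSION_IMPL *session)
{
	WT_FOTXN *fotxn;
	size_t i;

	for (i = 0, fotxn = session->fotxn;
	    session->fotxn_cnt > 0 &&
	    i < session->fotxn_size / sizeof(session->fotxn[0]);
	    ++i, ++fotxn)
		if (fotxn->p != NULL &&
		    __wt_txn_visible_all(session, fotxn->txnid)) {
			/*
			 * No running transaction can see this memory any
			 * longer: free it and make the slot available for
			 * reuse.
			 */
			__wt_free(session, fotxn->p);
			fotxn->p = NULL;
			fotxn->len = 0;
			--session->fotxn_cnt;
		}
}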
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *prev, *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
	uint64_t oldest_id, saved_pinned_id, time_start, time_stop;
	uint32_t flags;
	bool timer, tried_eviction;

	conn = S2C(session);
	btree = S2BT(session);
	prev = walk = NULL;
	txn = &session->txn;
	tried_eviction = false;
	time_start = time_stop = 0;

	/* Only visit pages in cache and don't bump page read generations. */
	flags = WT_READ_CACHE | WT_READ_NO_GEN;

	/*
	 * Skip all deleted pages. For a page to be marked deleted, it must
	 * have been evicted from cache and marked clean. Checkpoint should
	 * never instantiate deleted pages: if a truncate is not visible to the
	 * checkpoint, the on-disk version is correct. If the truncate is
	 * visible, we skip over the child page when writing its parent. We
	 * check whether a truncate is visible in the checkpoint as part of
	 * reconciling internal pages (specifically in __rec_child_modify).
	 */
	LF_SET(WT_READ_DELETED_SKIP);

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
	timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT);
	if (timer)
		time_start = __wt_clock(session);

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a
		 * high-level lock, serialize so multiple threads don't walk
		 * the tree at the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		/*
		 * Save the oldest transaction ID we need to keep around.
		 * Otherwise, in a busy system, we could be updating pages so
		 * fast that write leaves never catches up. We deliberately
		 * have no transaction running at this point that would keep
		 * the oldest ID from moving forwards as we walk the tree.
		 */
		oldest_id = __wt_txn_oldest_id(session);

		LF_SET(WT_READ_NO_WAIT | WT_READ_SKIP_INTL);
		for (;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it. Don't
			 * try to write hot pages (defined as pages that have
			 * been updated since the write phase leaves started):
			 * checkpoint will have to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session,
				    walk, NULL, WT_REC_CHECKPOINT, NULL));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * If we are flushing a file at read-committed isolation, which
		 * is of particular interest for flushing the metadata to make
		 * a schema-changing operation durable, get a transactional
		 * snapshot now.
		 *
		 * All changes committed up to this point should be included.
		 * We don't update the snapshot in between pages because the
		 * metadata shouldn't have many pages. Instead, read-committed
		 * isolation ensures that all metadata updates completed before
		 * the checkpoint are included.
		 */
		if (txn->isolation == WT_ISO_READ_COMMITTED)
			__wt_txn_get_snapshot(session);

		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a
		 * high-level lock, serialize so multiple threads don't walk
		 * the tree at the same time. We're holding the schema lock,
		 * but need the lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * In the final checkpoint pass, child pages cannot be evicted
		 * from underneath internal pages nor can underlying blocks be
		 * freed until the checkpoint's block lists are stable. Also,
		 * we cannot split child pages into parents unless we know the
		 * final pass will write a consistent view of that namespace.
		 * Set the checkpointing flag to block such actions and wait
		 * for any problematic eviction or page splits to complete.
		 */
		WT_ASSERT(session, btree->syncing == WT_BTREE_SYNC_OFF &&
		    btree->sync_session == NULL);

		btree->sync_session = session;
		btree->syncing = WT_BTREE_SYNC_WAIT;
		(void)__wt_gen_next_drain(session, WT_GEN_EVICT);
		btree->syncing = WT_BTREE_SYNC_RUNNING;

		/* Write all dirty in-cache pages. */
		LF_SET(WT_READ_NO_EVICT);

		/* Read pages with lookaside entries and evict them asap. */
		LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED);

		for (;;) {
			WT_ERR(__sync_dup_walk(session, walk, flags, &prev));
			WT_ERR(__wt_tree_walk(session, &walk, flags));

			if (walk == NULL)
				break;

			/*
			 * Skip clean pages, but need to make sure maximum
			 * transaction ID is always updated.
			 */
			if (!__wt_page_is_modified(walk->page)) {
				if (((mod = walk->page->modify) != NULL) &&
				    mod->rec_max_txn > btree->rec_max_txn)
					btree->rec_max_txn = mod->rec_max_txn;
				if (mod != NULL && btree->rec_max_timestamp <
				    mod->rec_max_timestamp)
					btree->rec_max_timestamp =
					    mod->rec_max_timestamp;
				continue;
			}

			/*
			 * Take a local reference to the page modify structure
			 * now that we know the page is dirty. It needs to be
			 * done in this order otherwise the page modify
			 * structure could have been created between taking the
			 * reference and checking modified.
			 */
			page = walk->page;

			/*
			 * Write dirty pages, if we can't skip them. If we skip
			 * a page, mark the tree dirty. The checkpoint marked
			 * it clean and we can't skip future checkpoints until
			 * this page is written.
			 */
			if (__sync_checkpoint_can_skip(session, page)) {
				__wt_tree_modify_set(session);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}

			/*
			 * If the page was pulled into cache by our read, try
			 * to evict it now.
			 *
			 * For eviction to have a chance, we first need to move
			 * the walk point to the next page checkpoint will
			 * visit. We want to avoid this code being too special
			 * purpose, so try to reuse the ordinary eviction path.
			 *
			 * Regardless of whether eviction succeeds or fails,
			 * the walk continues from the previous location. We
			 * remember whether we tried eviction, and don't try
			 * again. Even if eviction fails (the page may stay in
			 * cache clean but with history that cannot be
			 * discarded), that is not wasted effort because
			 * checkpoint doesn't need to write the page again.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    page->read_gen == WT_READGEN_WONT_NEED &&
			    !tried_eviction) {
				WT_ERR_BUSY_OK(
				    __wt_page_release_evict(session, walk));
				walk = prev;
				prev = NULL;
				tried_eviction = true;
				continue;
			}
			tried_eviction = false;

			WT_ERR(__wt_reconcile(
			    session, walk, NULL, WT_REC_CHECKPOINT, NULL));

			/*
			 * Update checkpoint IO tracking data if configured
			 * to log verbose progress messages.
			 */
			if (conn->ckpt_timer_start.tv_sec > 0) {
				conn->ckpt_write_bytes +=
				    page->memory_footprint;
				++conn->ckpt_write_pages;

				/* Periodically log checkpoint progress. */
				if (conn->ckpt_write_pages % 5000 == 0)
					__wt_checkpoint_progress(
					    session, false);
			}
		}
		break;
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
		WT_ERR(__wt_illegal_value(session, syncop));
		break;
	}

	if (timer) {
		time_stop = __wt_clock(session);
		__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote: %" PRIu64
		    " leaf pages (%" PRIu64 "B), %" PRIu64
		    " internal pages (%" PRIu64 "B), and took %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_pages, leaf_bytes, internal_pages, internal_bytes,
		    WT_CLOCKDIFF_MS(time_stop, time_start));
	}

err:	/* On error, clear any left-over tree walk. */
	WT_TRET(__wt_page_release(session, walk, flags));
	WT_TRET(__wt_page_release(session, prev, flags));

	/*
	 * If we got a snapshot in order to write pages, and there was no
	 * snapshot active when we started, release it.
	 */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    saved_pinned_id == WT_TXN_NONE)
		__wt_txn_release_snapshot(session);

	/* Clear the checkpoint flag. */
	btree->syncing = WT_BTREE_SYNC_OFF;
	btree->sync_session = NULL;

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file). Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 &&
	    syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_RET(btree->bm->sync(btree->bm, session, false));

	return (ret);
}
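/*
 * The checkpoint walk above calls __sync_checkpoint_can_skip() to decide
 * whether a dirty page only became dirty after the checkpoint started. That
 * helper is not included in this excerpt; the following is a minimal sketch
 * of the test it could perform, reconstructed from the inline skip check in
 * the older version of __sync_file that follows. It is an illustration under
 * those assumptions, not the actual WiredTiger implementation.
 */
static inline bool
__sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_PAGE_MODIFY *mod;
	WT_TXN *txn;

	mod = page->modify;
	txn = &session->txn;

	/*
	 * Only leaf pages can be skipped: internal pages must be written so
	 * the checkpoint's view of the tree structure stays consistent.
	 */
	if (WT_PAGE_IS_INTERNAL(page))
		return (false);

	/*
	 * A snapshot must be active (true for ordinary application
	 * checkpoints, but not for all internal checkpoint callers).
	 */
	if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
		return (false);

	/*
	 * Skip the page only if its first dirty update is newer than anything
	 * visible to the checkpoint transaction.
	 */
	return (WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn));
}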
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	struct timespec end, start;
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
	uint64_t oldest_id, saved_snap_min;
	uint32_t flags;

	conn = S2C(session);
	btree = S2BT(session);
	walk = NULL;
	txn = &session->txn;
	saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min;
	flags = WT_READ_CACHE | WT_READ_NO_GEN;

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
		WT_RET(__wt_epoch(session, &start));

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a
		 * high-level lock, serialize so multiple threads don't walk
		 * the tree at the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		/*
		 * Save the oldest transaction ID we need to keep around.
		 * Otherwise, in a busy system, we could be updating pages so
		 * fast that write leaves never catches up. We deliberately
		 * have no transaction running at this point that would keep
		 * the oldest ID from moving forwards as we walk the tree.
		 */
		oldest_id = __wt_txn_oldest_id(session);

		flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it. Don't
			 * try to write hot pages (defined as pages that have
			 * been updated since the write phase leaves started):
			 * checkpoint will have to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session, walk, NULL, 0));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * If we are flushing a file at read-committed isolation, which
		 * is of particular interest for flushing the metadata to make
		 * a schema-changing operation durable, get a transactional
		 * snapshot now.
		 *
		 * All changes committed up to this point should be included.
		 * We don't update the snapshot in between pages because (a)
		 * the metadata shouldn't be that big, and (b) if we do ever
		 */
		if (txn->isolation == WT_ISO_READ_COMMITTED)
			__wt_txn_get_snapshot(session);

		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a
		 * high-level lock, serialize so multiple threads don't walk
		 * the tree at the same time. We're holding the schema lock,
		 * but need the lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * In the final checkpoint pass, child pages cannot be evicted
		 * from underneath internal pages nor can underlying blocks be
		 * freed until the checkpoint's block lists are stable. Also,
		 * we cannot split child pages into parents unless we know the
		 * final pass will write a consistent view of that namespace.
		 * Set the checkpointing flag to block such actions and wait
		 * for any problematic eviction or page splits to complete.
		 */
		WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE);

		WT_ERR(__wt_evict_file_exclusive_on(session));
		__wt_evict_file_exclusive_off(session);

		WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING);

		/* Write all dirty in-cache pages. */
		flags |= WT_READ_NO_EVICT;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/* Skip clean pages. */
			if (!__wt_page_is_modified(walk->page))
				continue;

			/*
			 * Take a local reference to the page modify structure
			 * now that we know the page is dirty. It needs to be
			 * done in this order otherwise the page modify
			 * structure could have been created between taking the
			 * reference and checking modified.
			 */
			page = walk->page;
			mod = page->modify;

			/*
			 * Write dirty pages, unless we can be sure they only
			 * became dirty after the checkpoint started.
			 *
			 * We can skip dirty pages if:
			 * (1) they are leaf pages;
			 * (2) there is a snapshot transaction active (which
			 *     is the case in ordinary application checkpoints
			 *     but not all internal cases); and
			 * (3) the first dirty update on the page is
			 *     sufficiently recent that the checkpoint
			 *     transaction would skip them.
			 *
			 * Mark the tree dirty: the checkpoint marked it clean
			 * and we can't skip future checkpoints until this page
			 * is written.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
			    WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
				__wt_page_modify_set(session, page);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}
			WT_ERR(__wt_reconcile(session, walk, NULL, 0));
		}
		break;
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
		WT_ILLEGAL_VALUE_ERR(session);
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_ERR(__wt_epoch(session, &end));
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of internal\n\t"
		    "Took: %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_bytes, leaf_pages, internal_bytes, internal_pages,
		    WT_TIMEDIFF_MS(end, start)));
	}

err:	/* On error, clear any left-over tree walk. */
	if (walk != NULL)
		WT_TRET(__wt_page_release(session, walk, flags));

	/*
	 * If we got a snapshot in order to write pages, and there was no
	 * snapshot active when we started, release it.
	 */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    saved_snap_min == WT_TXN_NONE)
		__wt_txn_release_snapshot(session);

	if (btree->checkpointing != WT_CKPT_OFF) {
		/*
		 * Update the checkpoint generation for this handle so visible
		 * updates newer than the checkpoint can be evicted.
		 *
		 * This has to be published before eviction is enabled again,
		 * so that eviction knows that the checkpoint has completed.
		 */
		WT_PUBLISH(btree->checkpoint_gen,
		    conn->txn_global.checkpoint_gen);
		WT_STAT_FAST_DATA_SET(session,
		    btree_checkpoint_generation, btree->checkpoint_gen);

		/*
		 * Clear the checkpoint flag and push the change; not required,
		 * but publishing the change means stalled eviction gets moving
		 * as soon as possible.
		 */
		btree->checkpointing = WT_CKPT_OFF;
		WT_FULL_BARRIER();

		/*
		 * If this tree was being skipped by the eviction server during
		 * the checkpoint, clear the wait.
		 */
		btree->evict_walk_period = 0;

		/*
		 * Wake the eviction server, in case application threads have
		 * stalled while the eviction server decided it couldn't make
		 * progress. Without this, application threads will be stalled
		 * until the eviction server next wakes.
		 */
		WT_TRET(__wt_evict_server_wake(session));
	}

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file). Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 &&
	    syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_RET(btree->bm->sync(btree->bm, session, true));

	return (ret);
}
/*
 * __wt_txn_release --
 *	Release the resources associated with the current transaction.
 */
void
__wt_txn_release(WT_SESSION_IMPL *session)
{
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *txn_state;
	int was_oldest;

	txn = &session->txn;
	WT_ASSERT(session, txn->mod_count == 0);
	txn->notify = NULL;

	txn_global = &S2C(session)->txn_global;
	txn_state = WT_SESSION_TXN_STATE(session);
	was_oldest = 0;

	/* Clear the transaction's ID from the global table. */
	if (WT_SESSION_IS_CHECKPOINT(session)) {
		WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
		txn->id = WT_TXN_NONE;

		/* Clear the global checkpoint transaction IDs. */
		txn_global->checkpoint_id = 0;
		txn_global->checkpoint_pinned = WT_TXN_NONE;
	} else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
		WT_ASSERT(session,
		    !WT_TXNID_LT(txn->id, txn_global->last_running));

		WT_ASSERT(session,
		    txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE);
		WT_PUBLISH(txn_state->id, WT_TXN_NONE);

		/* Quick check for the oldest transaction. */
		was_oldest = (txn->id == txn_global->last_running);

		txn->id = WT_TXN_NONE;
	}

	/* Free the scratch buffer allocated for logging. */
	__wt_logrec_free(session, &txn->logrec);

	/* Discard any memory from the session's split stash that we can. */
	WT_ASSERT(session, session->split_gen == 0);
	if (session->split_stash_cnt > 0)
		__wt_split_stash_discard(session);

	/*
	 * Reset the transaction state to not running and release the snapshot.
	 */
	__wt_txn_release_snapshot(session);
	txn->isolation = session->isolation;

	/* Ensure the transaction flags are cleared on exit */
	txn->flags = 0;

	/*
	 * When the oldest transaction in the system completes, bump the oldest
	 * ID. This is racy and so not guaranteed, but in practice it keeps
	 * the oldest ID from falling too far behind.
	 */
	if (was_oldest)
		__wt_txn_update_oldest(session, 1);
}
/*
 * __wt_txn_get_snapshot --
 *	Allocate a snapshot.
 */
void
__wt_txn_get_snapshot(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *s, *txn_state;
	uint64_t current_id, id;
	uint64_t prev_oldest_id, snap_min;
	uint32_t i, n, session_cnt;
	int32_t count;

	conn = S2C(session);
	txn = &session->txn;
	txn_global = &conn->txn_global;
	txn_state = WT_SESSION_TXN_STATE(session);

	/*
	 * We're going to scan. Increment the count of scanners to prevent the
	 * oldest ID from moving forwards. Spin if the count is negative,
	 * which indicates that some thread is moving the oldest ID forwards.
	 */
	do {
		if ((count = txn_global->scan_count) < 0)
			WT_PAUSE();
	} while (count < 0 ||
	    !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));

	current_id = snap_min = txn_global->current;
	prev_oldest_id = txn_global->oldest_id;

	/* For pure read-only workloads, avoid scanning. */
	if (prev_oldest_id == current_id) {
		txn_state->snap_min = current_id;
		__txn_sort_snapshot(session, 0, current_id);

		/* Check that the oldest ID has not moved in the meantime. */
		if (prev_oldest_id == txn_global->oldest_id) {
			WT_ASSERT(session, txn_global->scan_count > 0);
			(void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
			return;
		}
	}

	/* Walk the array of concurrent transactions. */
	WT_ORDERED_READ(session_cnt, conn->session_cnt);
	for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) {
		/*
		 * Build our snapshot of any concurrent transaction IDs.
		 *
		 * Ignore:
		 *  - Our own ID: we always read our own updates.
		 *  - The ID if it is older than the oldest ID we saw. This
		 *    can happen if we race with a thread that is allocating
		 *    an ID -- the ID will not be used because the thread will
		 *    keep spinning until it gets a valid one.
		 */
		if (s != txn_state &&
		    (id = s->id) != WT_TXN_NONE &&
		    WT_TXNID_LE(prev_oldest_id, id)) {
			txn->snapshot[n++] = id;
			if (WT_TXNID_LT(id, snap_min))
				snap_min = id;
		}
	}

	/*
	 * If we got a new snapshot, update the published snap_min for this
	 * session.
	 */
	WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, snap_min));
	WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
	txn_state->snap_min = snap_min;

	WT_ASSERT(session, txn_global->scan_count > 0);
	(void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);

	__txn_sort_snapshot(session, n, current_id);
}
/*
 * __wt_txn_get_snapshot --
 *	Allocate a snapshot.
 */
int
__wt_txn_get_snapshot(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *s, *txn_state;
	uint64_t current_id, id;
	uint64_t prev_oldest_id, snap_min;
	uint32_t i, n, session_cnt;

	conn = S2C(session);
	txn = &session->txn;
	txn_global = &conn->txn_global;
	txn_state = WT_SESSION_TXN_STATE(session);
	n = 0;

	/*
	 * Spin waiting for the lock: the sleeps in our blocking readlock
	 * implementation are too slow for scanning the transaction table.
	 */
	while ((ret =
	    __wt_try_readlock(session, txn_global->scan_rwlock)) == EBUSY)
		WT_PAUSE();
	WT_RET(ret);

	current_id = snap_min = txn_global->current;
	prev_oldest_id = txn_global->oldest_id;

	/*
	 * Include the checkpoint transaction, if one is running: we should
	 * ignore any uncommitted changes the checkpoint has written to the
	 * metadata. We don't have to keep the checkpoint's changes pinned so
	 * don't include it in the published snap_min.
	 */
	if ((id = txn_global->checkpoint_txnid) != WT_TXN_NONE)
		txn->snapshot[n++] = id;

	/* For pure read-only workloads, avoid scanning. */
	if (prev_oldest_id == current_id) {
		txn_state->snap_min = current_id;
		/* Check that the oldest ID has not moved in the meantime. */
		WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
		goto done;
	}

	/* Walk the array of concurrent transactions. */
	WT_ORDERED_READ(session_cnt, conn->session_cnt);
	for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
		/*
		 * Build our snapshot of any concurrent transaction IDs.
		 *
		 * Ignore:
		 *  - Our own ID: we always read our own updates.
		 *  - The ID if it is older than the oldest ID we saw. This
		 *    can happen if we race with a thread that is allocating
		 *    an ID -- the ID will not be used because the thread will
		 *    keep spinning until it gets a valid one.
		 */
		if (s != txn_state &&
		    (id = s->id) != WT_TXN_NONE &&
		    WT_TXNID_LE(prev_oldest_id, id)) {
			txn->snapshot[n++] = id;
			if (WT_TXNID_LT(id, snap_min))
				snap_min = id;
		}
	}

	/*
	 * If we got a new snapshot, update the published snap_min for this
	 * session.
	 */
	WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, snap_min));
	WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
	txn_state->snap_min = snap_min;

done:	__wt_readunlock(session, txn_global->scan_rwlock);
	__txn_sort_snapshot(session, n, current_id);
	return (0);
}
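/*
 * Both versions of __wt_txn_get_snapshot above finish by calling
 * __txn_sort_snapshot(), which is not included in this excerpt. The following
 * is a minimal sketch of what such a helper could do, assuming
 * txn->snapshot_count and txn->snap_min fields alongside the txn->snap_max
 * used in __sync_file above, and using the standard qsort. It is an
 * illustration, not the actual WiredTiger implementation.
 */
static int
__snap_id_cmp(const void *a, const void *b)
{
	uint64_t ida, idb;

	ida = *(const uint64_t *)a;
	idb = *(const uint64_t *)b;
	return (ida < idb ? -1 : (ida > idb ? 1 : 0));
}

static void
__txn_sort_snapshot(WT_SESSION_IMPL *session, uint32_t n, uint64_t snap_max)
{
	WT_TXN *txn;

	txn = &session->txn;

	/* Sort the concurrent IDs so visibility checks can search them. */
	if (n > 1)
		qsort(txn->snapshot, n, sizeof(uint64_t), __snap_id_cmp);

	txn->snapshot_count = n;
	txn->snap_max = snap_max;
	txn->snap_min = (n > 0 && WT_TXNID_LE(txn->snapshot[0], snap_max)) ?
	    txn->snapshot[0] : snap_max;

	/* Record that this transaction now holds a snapshot. */
	F_SET(txn, WT_TXN_HAS_SNAPSHOT);
}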