/* * __wt_txn_release -- * Release the resources associated with the current transaction. */ void __wt_txn_release(WT_SESSION_IMPL *session) { WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; txn = &session->txn; txn_global = &S2C(session)->txn_global; txn_state = WT_SESSION_TXN_STATE(session); WT_ASSERT(session, txn->mod_count == 0); txn->notify = NULL; /* Clear the transaction's ID from the global table. */ if (WT_SESSION_IS_CHECKPOINT(session)) { WT_ASSERT(session, txn_state->id == WT_TXN_NONE); txn->id = txn_global->checkpoint_state.id = txn_global->checkpoint_state.pinned_id = WT_TXN_NONE; /* * Be extra careful to cleanup everything for checkpoints: once * the global checkpoint ID is cleared, we can no longer tell * if this session is doing a checkpoint. */ txn_global->checkpoint_id = 0; } else if (F_ISSET(txn, WT_TXN_HAS_ID)) { WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running)); WT_ASSERT(session, txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE); WT_PUBLISH(txn_state->id, WT_TXN_NONE); txn->id = WT_TXN_NONE; } __wt_txn_clear_commit_timestamp(session); __wt_txn_clear_read_timestamp(session); /* Free the scratch buffer allocated for logging. */ __wt_logrec_free(session, &txn->logrec); /* Discard any memory from the session's stash that we can. */ WT_ASSERT(session, __wt_session_gen(session, WT_GEN_SPLIT) == 0); __wt_stash_discard(session); /* * Reset the transaction state to not running and release the snapshot. */ __wt_txn_release_snapshot(session); txn->isolation = session->isolation; /* Ensure the transaction flags are cleared on exit */ txn->flags = 0; }
/* * __sync_checkpoint_can_skip -- * There are limited conditions under which we can skip writing a dirty * page during checkpoint. */ static inline bool __sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_MULTI *multi; WT_PAGE_MODIFY *mod; WT_TXN *txn; u_int i; mod = page->modify; txn = &session->txn; /* * We can skip some dirty pages during a checkpoint. The requirements: * * 1. they must be leaf pages, * 2. there is a snapshot transaction active (which is the case in * ordinary application checkpoints but not all internal cases), * 3. the first dirty update on the page is sufficiently recent the * checkpoint transaction would skip them, * 4. there's already an address for every disk block involved. */ if (WT_PAGE_IS_INTERNAL(page)) return (false); if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) return (false); if (!WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) return (false); /* * The problematic case is when a page was evicted but when there were * unresolved updates and not every block associated with the page has * a disk address. We can't skip such pages because we need a checkpoint * write with valid addresses. * * The page's modification information can change underfoot if the page * is being reconciled, so we'd normally serialize with reconciliation * before reviewing page-modification information. However, checkpoint * is the only valid writer of dirty leaf pages at this point, we skip * the lock. */ if (mod->rec_result == WT_PM_REC_MULTIBLOCK) for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) if (multi->addr.addr == NULL) return (false); return (true); }
/* * __wt_txn_release -- * Release the resources associated with the current transaction. */ void __wt_txn_release(WT_SESSION_IMPL *session) { WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; txn = &session->txn; WT_ASSERT(session, txn->mod_count == 0); txn->notify = NULL; txn_global = &S2C(session)->txn_global; txn_state = WT_SESSION_TXN_STATE(session); /* Clear the transaction's ID from the global table. */ if (WT_SESSION_IS_CHECKPOINT(session)) { WT_ASSERT(session, txn_state->id == WT_TXN_NONE); txn->id = WT_TXN_NONE; /* Clear the global checkpoint transaction IDs. */ txn_global->checkpoint_id = 0; txn_global->checkpoint_pinned = WT_TXN_NONE; } else if (F_ISSET(txn, WT_TXN_HAS_ID)) { WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running)); WT_ASSERT(session, txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE); WT_PUBLISH(txn_state->id, WT_TXN_NONE); txn->id = WT_TXN_NONE; } /* Free the scratch buffer allocated for logging. */ __wt_logrec_free(session, &txn->logrec); /* Discard any memory from the session's split stash that we can. */ WT_ASSERT(session, session->split_gen == 0); if (session->split_stash_cnt > 0) __wt_split_stash_discard(session); /* * Reset the transaction state to not running and release the snapshot. */ __wt_txn_release_snapshot(session); txn->isolation = session->isolation; /* Ensure the transaction flags are cleared on exit */ txn->flags = 0; }
/* * __sync_file -- * Flush pages for a specific file. */ static int __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) { WT_BTREE *btree; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF *prev, *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; uint64_t oldest_id, saved_pinned_id, time_start, time_stop; uint32_t flags; bool timer, tried_eviction; conn = S2C(session); btree = S2BT(session); prev = walk = NULL; txn = &session->txn; tried_eviction = false; time_start = time_stop = 0; /* Only visit pages in cache and don't bump page read generations. */ flags = WT_READ_CACHE | WT_READ_NO_GEN; /* * Skip all deleted pages. For a page to be marked deleted, it must * have been evicted from cache and marked clean. Checkpoint should * never instantiate deleted pages: if a truncate is not visible to the * checkpoint, the on-disk version is correct. If the truncate is * visible, we skip over the child page when writing its parent. We * check whether a truncate is visible in the checkpoint as part of * reconciling internal pages (specifically in __rec_child_modify). */ LF_SET(WT_READ_DELETED_SKIP); internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT); if (timer) time_start = __wt_clock(session); switch (syncop) { case WT_SYNC_WRITE_LEAVES: /* * Write all immediately available, dirty in-cache leaf pages. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. */ if (!btree->modified) return (0); __wt_spin_lock(session, &btree->flush_lock); if (!btree->modified) { __wt_spin_unlock(session, &btree->flush_lock); return (0); } /* * Save the oldest transaction ID we need to keep around. * Otherwise, in a busy system, we could be updating pages so * fast that write leaves never catches up. We deliberately * have no transaction running at this point that would keep * the oldest ID from moving forwards as we walk the tree. */ oldest_id = __wt_txn_oldest_id(session); LF_SET(WT_READ_NO_WAIT | WT_READ_SKIP_INTL); for (;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* * Write dirty pages if nobody beat us to it. Don't * try to write hot pages (defined as pages that have * been updated since the write phase leaves started): * checkpoint will have to visit them anyway. */ page = walk->page; if (__wt_page_is_modified(page) && WT_TXNID_LT(page->modify->update_txn, oldest_id)) { if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, WT_REC_CHECKPOINT, NULL)); } } break; case WT_SYNC_CHECKPOINT: /* * If we are flushing a file at read-committed isolation, which * is of particular interest for flushing the metadata to make * a schema-changing operation durable, get a transactional * snapshot now. * * All changes committed up to this point should be included. * We don't update the snapshot in between pages because the * metadata shouldn't have many pages. Instead, read-committed * isolation ensures that all metadata updates completed before * the checkpoint are included. */ if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); /* * We cannot check the tree modified flag in the case of a * checkpoint, the checkpoint code has already cleared it. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. We're holding the schema lock, but need the * lower-level lock as well. */ __wt_spin_lock(session, &btree->flush_lock); /* * In the final checkpoint pass, child pages cannot be evicted * from underneath internal pages nor can underlying blocks be * freed until the checkpoint's block lists are stable. Also, * we cannot split child pages into parents unless we know the * final pass will write a consistent view of that namespace. * Set the checkpointing flag to block such actions and wait for * any problematic eviction or page splits to complete. */ WT_ASSERT(session, btree->syncing == WT_BTREE_SYNC_OFF && btree->sync_session == NULL); btree->sync_session = session; btree->syncing = WT_BTREE_SYNC_WAIT; (void)__wt_gen_next_drain(session, WT_GEN_EVICT); btree->syncing = WT_BTREE_SYNC_RUNNING; /* Write all dirty in-cache pages. */ LF_SET(WT_READ_NO_EVICT); /* Read pages with lookaside entries and evict them asap. */ LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED); for (;;) { WT_ERR(__sync_dup_walk(session, walk, flags, &prev)); WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* * Skip clean pages, but need to make sure maximum * transaction ID is always updated. */ if (!__wt_page_is_modified(walk->page)) { if (((mod = walk->page->modify) != NULL) && mod->rec_max_txn > btree->rec_max_txn) btree->rec_max_txn = mod->rec_max_txn; if (mod != NULL && btree->rec_max_timestamp < mod->rec_max_timestamp) btree->rec_max_timestamp = mod->rec_max_timestamp; continue; } /* * Take a local reference to the page modify structure * now that we know the page is dirty. It needs to be * done in this order otherwise the page modify * structure could have been created between taking the * reference and checking modified. */ page = walk->page; /* * Write dirty pages, if we can't skip them. If we skip * a page, mark the tree dirty. The checkpoint marked it * clean and we can't skip future checkpoints until this * page is written. */ if (__sync_checkpoint_can_skip(session, page)) { __wt_tree_modify_set(session); continue; } if (WT_PAGE_IS_INTERNAL(page)) { internal_bytes += page->memory_footprint; ++internal_pages; } else { leaf_bytes += page->memory_footprint; ++leaf_pages; } /* * If the page was pulled into cache by our read, try * to evict it now. * * For eviction to have a chance, we first need to move * the walk point to the next page checkpoint will * visit. We want to avoid this code being too special * purpose, so try to reuse the ordinary eviction path. * * Regardless of whether eviction succeeds or fails, * the walk continues from the previous location. We * remember whether we tried eviction, and don't try * again. Even if eviction fails (the page may stay in * cache clean but with history that cannot be * discarded), that is not wasted effort because * checkpoint doesn't need to write the page again. */ if (!WT_PAGE_IS_INTERNAL(page) && page->read_gen == WT_READGEN_WONT_NEED && !tried_eviction) { WT_ERR_BUSY_OK( __wt_page_release_evict(session, walk)); walk = prev; prev = NULL; tried_eviction = true; continue; } tried_eviction = false; WT_ERR(__wt_reconcile( session, walk, NULL, WT_REC_CHECKPOINT, NULL)); /* * Update checkpoint IO tracking data if configured * to log verbose progress messages. */ if (conn->ckpt_timer_start.tv_sec > 0) { conn->ckpt_write_bytes += page->memory_footprint; ++conn->ckpt_write_pages; /* Periodically log checkpoint progress. */ if (conn->ckpt_write_pages % 5000 == 0) __wt_checkpoint_progress( session, false); } } break; case WT_SYNC_CLOSE: case WT_SYNC_DISCARD: WT_ERR(__wt_illegal_value(session, syncop)); break; } if (timer) { time_stop = __wt_clock(session); __wt_verbose(session, WT_VERB_CHECKPOINT, "__sync_file WT_SYNC_%s wrote: %" PRIu64 " leaf pages (%" PRIu64 "B), %" PRIu64 " internal pages (%" PRIu64 "B), and took %" PRIu64 "ms", syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_pages, leaf_bytes, internal_pages, internal_bytes, WT_CLOCKDIFF_MS(time_stop, time_start)); } err: /* On error, clear any left-over tree walk. */ WT_TRET(__wt_page_release(session, walk, flags)); WT_TRET(__wt_page_release(session, prev, flags)); /* * If we got a snapshot in order to write pages, and there was no * snapshot active when we started, release it. */ if (txn->isolation == WT_ISO_READ_COMMITTED && saved_pinned_id == WT_TXN_NONE) __wt_txn_release_snapshot(session); /* Clear the checkpoint flag. */ btree->syncing = WT_BTREE_SYNC_OFF; btree->sync_session = NULL; __wt_spin_unlock(session, &btree->flush_lock); /* * Leaves are written before a checkpoint (or as part of a file close, * before checkpointing the file). Start a flush to stable storage, * but don't wait for it. */ if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC)) WT_RET(btree->bm->sync(btree->bm, session, false)); return (ret); }
/* * __sync_file -- * Flush pages for a specific file. */ static int __sync_file(WT_SESSION_IMPL *session, int syncop) { struct timespec end, start; WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, leaf_bytes; uint64_t internal_pages, leaf_pages; uint32_t flags; bool evict_reset; btree = S2BT(session); flags = WT_READ_CACHE | WT_READ_NO_GEN; walk = NULL; txn = &session->txn; internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) WT_RET(__wt_epoch(session, &start)); switch (syncop) { case WT_SYNC_WRITE_LEAVES: /* * Write all immediately available, dirty in-cache leaf pages. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. */ if (!btree->modified) return (0); __wt_spin_lock(session, &btree->flush_lock); if (!btree->modified) { __wt_spin_unlock(session, &btree->flush_lock); return (0); } flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, NULL, flags)); if (walk == NULL) break; /* * Write dirty pages if nobody beat us to it. Don't * try to write the hottest pages: checkpoint will have * to visit them anyway. */ page = walk->page; if (__wt_page_is_modified(page) && __wt_txn_visible_all( session, page->modify->update_txn)) { if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } } break; case WT_SYNC_CHECKPOINT: /* * We cannot check the tree modified flag in the case of a * checkpoint, the checkpoint code has already cleared it. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. We're holding the schema lock, but need the * lower-level lock as well. */ __wt_spin_lock(session, &btree->flush_lock); /* * When internal pages are being reconciled by checkpoint their * child pages cannot disappear from underneath them or be split * into them, nor can underlying blocks be freed until the block * lists for the checkpoint are stable. Set the checkpointing * flag to block eviction of dirty pages until the checkpoint's * internal page pass is complete, then wait for any existing * eviction to complete. */ btree->checkpointing = 1; WT_FULL_BARRIER(); WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); if (evict_reset) __wt_evict_file_exclusive_off(session); /* Write all dirty in-cache pages. */ flags |= WT_READ_NO_EVICT; for (walk = NULL;;) { /* * If we have a page, and it was ever modified, track * the highest transaction ID in the tree. We do this * here because we want the value after reconciling * dirty pages. */ if (walk != NULL && walk->page != NULL && (mod = walk->page->modify) != NULL && WT_TXNID_LT(btree->rec_max_txn, mod->rec_max_txn)) btree->rec_max_txn = mod->rec_max_txn; WT_ERR(__wt_tree_walk(session, &walk, NULL, flags)); if (walk == NULL) break; page = walk->page; mod = page->modify; /* Skip clean pages. */ if (!__wt_page_is_modified(page)) continue; /* * Write dirty pages, unless we can be sure they only * became dirty after the checkpoint started. * * We can skip dirty pages if: * (1) they are leaf pages; * (2) there is a snapshot transaction active (which * is the case in ordinary application checkpoints * but not all internal cases); and * (3) the first dirty update on the page is * sufficiently recent that the checkpoint * transaction would skip them. * * Mark the tree dirty: the checkpoint marked it clean * and we can't skip future checkpoints until this page * is written. */ if (!WT_PAGE_IS_INTERNAL(page) && F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn) && mod->rec_result != WT_PM_REC_REWRITE) { __wt_page_modify_set(session, page); continue; } if (WT_PAGE_IS_INTERNAL(page)) { internal_bytes += page->memory_footprint; ++internal_pages; } else { leaf_bytes += page->memory_footprint; ++leaf_pages; } WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } break; } if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { WT_ERR(__wt_epoch(session, &end)); WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT, "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64 " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64 " bytes, %" PRIu64 " pages of internal\n\t" "Took: %" PRIu64 "ms", syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_bytes, leaf_pages, internal_bytes, internal_pages, WT_TIMEDIFF(end, start) / WT_MILLION)); } err: /* On error, clear any left-over tree walk. */ if (walk != NULL) WT_TRET(__wt_page_release(session, walk, flags)); if (txn->isolation == WT_ISO_READ_COMMITTED && session->ncursors == 0) __wt_txn_release_snapshot(session); if (btree->checkpointing) { /* * Update the checkpoint generation for this handle so visible * updates newer than the checkpoint can be evicted. * * This has to be published before eviction is enabled again, * so that eviction knows that the checkpoint has completed. */ WT_PUBLISH(btree->checkpoint_gen, S2C(session)->txn_global.checkpoint_gen); WT_STAT_FAST_DATA_SET(session, btree_checkpoint_generation, btree->checkpoint_gen); /* * Clear the checkpoint flag and push the change; not required, * but publishing the change means stalled eviction gets moving * as soon as possible. */ btree->checkpointing = 0; WT_FULL_BARRIER(); /* * If this tree was being skipped by the eviction server during * the checkpoint, clear the wait. */ btree->evict_walk_period = 0; /* * Wake the eviction server, in case application threads have * stalled while the eviction server decided it couldn't make * progress. Without this, application threads will be stalled * until the eviction server next wakes. */ WT_TRET(__wt_evict_server_wake(session)); } __wt_spin_unlock(session, &btree->flush_lock); /* * Leaves are written before a checkpoint (or as part of a file close, * before checkpointing the file). Start a flush to stable storage, * but don't wait for it. */ if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES) WT_RET(btree->bm->sync(btree->bm, session, true)); return (ret); }
/* * __sync_file -- * Flush pages for a specific file. */ static int __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) { struct timespec end, start; WT_BTREE *btree; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; uint64_t oldest_id, saved_snap_min; uint32_t flags; conn = S2C(session); btree = S2BT(session); walk = NULL; txn = &session->txn; saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min; flags = WT_READ_CACHE | WT_READ_NO_GEN; internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) WT_RET(__wt_epoch(session, &start)); switch (syncop) { case WT_SYNC_WRITE_LEAVES: /* * Write all immediately available, dirty in-cache leaf pages. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. */ if (!btree->modified) return (0); __wt_spin_lock(session, &btree->flush_lock); if (!btree->modified) { __wt_spin_unlock(session, &btree->flush_lock); return (0); } /* * Save the oldest transaction ID we need to keep around. * Otherwise, in a busy system, we could be updating pages so * fast that write leaves never catches up. We deliberately * have no transaction running at this point that would keep * the oldest ID from moving forwards as we walk the tree. */ oldest_id = __wt_txn_oldest_id(session); flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* * Write dirty pages if nobody beat us to it. Don't * try to write hot pages (defined as pages that have * been updated since the write phase leaves started): * checkpoint will have to visit them anyway. */ page = walk->page; if (__wt_page_is_modified(page) && WT_TXNID_LT(page->modify->update_txn, oldest_id)) { if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } } break; case WT_SYNC_CHECKPOINT: /* * If we are flushing a file at read-committed isolation, which * is of particular interest for flushing the metadata to make * schema-changing operation durable, get a transactional * snapshot now. * * All changes committed up to this point should be included. * We don't update the snapshot in between pages because (a) * the metadata shouldn't be that big, and (b) if we do ever */ if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); /* * We cannot check the tree modified flag in the case of a * checkpoint, the checkpoint code has already cleared it. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. We're holding the schema lock, but need the * lower-level lock as well. */ __wt_spin_lock(session, &btree->flush_lock); /* * In the final checkpoint pass, child pages cannot be evicted * from underneath internal pages nor can underlying blocks be * freed until the checkpoint's block lists are stable. Also, * we cannot split child pages into parents unless we know the * final pass will write a consistent view of that namespace. * Set the checkpointing flag to block such actions and wait for * any problematic eviction or page splits to complete. */ WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE); WT_ERR(__wt_evict_file_exclusive_on(session)); __wt_evict_file_exclusive_off(session); WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING); /* Write all dirty in-cache pages. */ flags |= WT_READ_NO_EVICT; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* Skip clean pages. */ if (!__wt_page_is_modified(walk->page)) continue; /* * Take a local reference to the page modify structure * now that we know the page is dirty. It needs to be * done in this order otherwise the page modify * structure could have been created between taking the * reference and checking modified. */ page = walk->page; mod = page->modify; /* * Write dirty pages, unless we can be sure they only * became dirty after the checkpoint started. * * We can skip dirty pages if: * (1) they are leaf pages; * (2) there is a snapshot transaction active (which * is the case in ordinary application checkpoints * but not all internal cases); and * (3) the first dirty update on the page is * sufficiently recent that the checkpoint * transaction would skip them. * * Mark the tree dirty: the checkpoint marked it clean * and we can't skip future checkpoints until this page * is written. */ if (!WT_PAGE_IS_INTERNAL(page) && F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) { __wt_page_modify_set(session, page); continue; } if (WT_PAGE_IS_INTERNAL(page)) { internal_bytes += page->memory_footprint; ++internal_pages; } else { leaf_bytes += page->memory_footprint; ++leaf_pages; } WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } break; case WT_SYNC_CLOSE: case WT_SYNC_DISCARD: WT_ILLEGAL_VALUE_ERR(session); } if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { WT_ERR(__wt_epoch(session, &end)); WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT, "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64 " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64 " bytes, %" PRIu64 " pages of internal\n\t" "Took: %" PRIu64 "ms", syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_bytes, leaf_pages, internal_bytes, internal_pages, WT_TIMEDIFF_MS(end, start))); } err: /* On error, clear any left-over tree walk. */ if (walk != NULL) WT_TRET(__wt_page_release(session, walk, flags)); /* * If we got a snapshot in order to write pages, and there was no * snapshot active when we started, release it. */ if (txn->isolation == WT_ISO_READ_COMMITTED && saved_snap_min == WT_TXN_NONE) __wt_txn_release_snapshot(session); if (btree->checkpointing != WT_CKPT_OFF) { /* * Update the checkpoint generation for this handle so visible * updates newer than the checkpoint can be evicted. * * This has to be published before eviction is enabled again, * so that eviction knows that the checkpoint has completed. */ WT_PUBLISH(btree->checkpoint_gen, conn->txn_global.checkpoint_gen); WT_STAT_FAST_DATA_SET(session, btree_checkpoint_generation, btree->checkpoint_gen); /* * Clear the checkpoint flag and push the change; not required, * but publishing the change means stalled eviction gets moving * as soon as possible. */ btree->checkpointing = WT_CKPT_OFF; WT_FULL_BARRIER(); /* * If this tree was being skipped by the eviction server during * the checkpoint, clear the wait. */ btree->evict_walk_period = 0; /* * Wake the eviction server, in case application threads have * stalled while the eviction server decided it couldn't make * progress. Without this, application threads will be stalled * until the eviction server next wakes. */ WT_TRET(__wt_evict_server_wake(session)); } __wt_spin_unlock(session, &btree->flush_lock); /* * Leaves are written before a checkpoint (or as part of a file close, * before checkpointing the file). Start a flush to stable storage, * but don't wait for it. */ if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC)) WT_RET(btree->bm->sync(btree->bm, session, true)); return (ret); }
/* * __wt_txn_release -- * Release the resources associated with the current transaction. */ void __wt_txn_release(WT_SESSION_IMPL *session) { WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; int was_oldest; txn = &session->txn; WT_ASSERT(session, txn->mod_count == 0); txn->notify = NULL; txn_global = &S2C(session)->txn_global; txn_state = WT_SESSION_TXN_STATE(session); was_oldest = 0; /* Clear the transaction's ID from the global table. */ if (WT_SESSION_IS_CHECKPOINT(session)) { WT_ASSERT(session, txn_state->id == WT_TXN_NONE); txn->id = WT_TXN_NONE; /* Clear the global checkpoint transaction IDs. */ txn_global->checkpoint_id = 0; txn_global->checkpoint_pinned = WT_TXN_NONE; } else if (F_ISSET(txn, WT_TXN_HAS_ID)) { WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running)); WT_ASSERT(session, txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE); WT_PUBLISH(txn_state->id, WT_TXN_NONE); /* Quick check for the oldest transaction. */ was_oldest = (txn->id == txn_global->last_running); txn->id = WT_TXN_NONE; } /* Free the scratch buffer allocated for logging. */ __wt_logrec_free(session, &txn->logrec); /* Discard any memory from the session's split stash that we can. */ WT_ASSERT(session, session->split_gen == 0); if (session->split_stash_cnt > 0) __wt_split_stash_discard(session); /* * Reset the transaction state to not running and release the snapshot. */ __wt_txn_release_snapshot(session); txn->isolation = session->isolation; /* Ensure the transaction flags are cleared on exit */ txn->flags = 0; /* * When the oldest transaction in the system completes, bump the oldest * ID. This is racy and so not guaranteed, but in practice it keeps * the oldest ID from falling too far behind. */ if (was_oldest) __wt_txn_update_oldest(session, 1); }
/* * __wt_txn_update_oldest -- * Sweep the running transactions to update the oldest ID required. * !!! * If a data-source is calling the WT_EXTENSION_API.transaction_oldest * method (for the oldest transaction ID not yet visible to a running * transaction), and then comparing that oldest ID against committed * transactions to see if updates for a committed transaction are still * visible to running transactions, the oldest transaction ID may be * the same as the last committed transaction ID, if the transaction * state wasn't refreshed after the last transaction committed. Push * past the last committed transaction. */ void __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) { WT_CONNECTION_IMPL *conn; WT_SESSION_IMPL *oldest_session; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s; uint64_t current_id, id, last_running, oldest_id, prev_oldest_id; uint32_t i, session_cnt; int32_t count; int last_running_moved; conn = S2C(session); txn_global = &conn->txn_global; current_id = last_running = txn_global->current; oldest_session = NULL; prev_oldest_id = txn_global->oldest_id; /* * For pure read-only workloads, or if the update isn't forced and the * oldest ID isn't too far behind, avoid scanning. */ if (prev_oldest_id == current_id || (!force && WT_TXNID_LT(current_id, prev_oldest_id + 100))) return; /* * We're going to scan. Increment the count of scanners to prevent the * oldest ID from moving forwards. Spin if the count is negative, * which indicates that some thread is moving the oldest ID forwards. */ do { if ((count = txn_global->scan_count) < 0) WT_PAUSE(); } while (count < 0 || !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1)); /* The oldest ID cannot change until the scan count goes to zero. */ prev_oldest_id = txn_global->oldest_id; current_id = oldest_id = last_running = txn_global->current; /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { /* * Update the oldest ID. * * Ignore: IDs older than the oldest ID we saw. This can happen * if we race with a thread that is allocating an ID -- the ID * will not be used because the thread will keep spinning until * it gets a valid one. */ if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) && WT_TXNID_LT(id, last_running)) last_running = id; /* * !!! * Note: Don't ignore snap_min values older than the previous * oldest ID. Read-uncommitted operations publish snap_min * values without incrementing scan_count to protect the global * table. See the comment in __wt_txn_cursor_op for * more details. */ if ((id = s->snap_min) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) { oldest_id = id; oldest_session = &conn->sessions[i]; } } if (WT_TXNID_LT(last_running, oldest_id)) oldest_id = last_running; /* The oldest ID can't move past any named snapshots. */ if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) oldest_id = id; /* Update the last running ID. */ last_running_moved = WT_TXNID_LT(txn_global->last_running, last_running); /* Update the oldest ID. */ if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) && WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) { WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LT(id, last_running)) last_running = id; if ((id = s->snap_min) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) oldest_id = id; } if (WT_TXNID_LT(last_running, oldest_id)) oldest_id = last_running; #ifdef HAVE_DIAGNOSTIC /* * Make sure the ID doesn't move past any named snapshots. * * Don't include the read/assignment in the assert statement. * Coverity complains if there are assignments only done in * diagnostic builds, and when the read is from a volatile. */ id = txn_global->nsnap_oldest_id; WT_ASSERT(session, id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); #endif if (WT_TXNID_LT(txn_global->last_running, last_running)) txn_global->last_running = last_running; if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) txn_global->oldest_id = oldest_id; WT_ASSERT(session, txn_global->scan_count == -1); txn_global->scan_count = 0; } else { if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && current_id - oldest_id > 10000 && last_running_moved && oldest_session != NULL) { (void)__wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 " pinned in session %d [%s]" " with snap_min %" PRIu64 "\n", oldest_id, oldest_session->id, oldest_session->lastop, oldest_session->txn.snap_min); } WT_ASSERT(session, txn_global->scan_count > 0); (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1); } }
/* * __wt_txn_get_snapshot -- * Allocate a snapshot. */ void __wt_txn_get_snapshot(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s, *txn_state; uint64_t current_id, id; uint64_t prev_oldest_id, snap_min; uint32_t i, n, session_cnt; int32_t count; conn = S2C(session); txn = &session->txn; txn_global = &conn->txn_global; txn_state = WT_SESSION_TXN_STATE(session); /* * We're going to scan. Increment the count of scanners to prevent the * oldest ID from moving forwards. Spin if the count is negative, * which indicates that some thread is moving the oldest ID forwards. */ do { if ((count = txn_global->scan_count) < 0) WT_PAUSE(); } while (count < 0 || !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1)); current_id = snap_min = txn_global->current; prev_oldest_id = txn_global->oldest_id; /* For pure read-only workloads, avoid scanning. */ if (prev_oldest_id == current_id) { txn_state->snap_min = current_id; __txn_sort_snapshot(session, 0, current_id); /* Check that the oldest ID has not moved in the meantime. */ if (prev_oldest_id == txn_global->oldest_id) { WT_ASSERT(session, txn_global->scan_count > 0); (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1); return; } } /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) { /* * Build our snapshot of any concurrent transaction IDs. * * Ignore: * - Our own ID: we always read our own updates. * - The ID if it is older than the oldest ID we saw. This * can happen if we race with a thread that is allocating * an ID -- the ID will not be used because the thread will * keep spinning until it gets a valid one. */ if (s != txn_state && (id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id)) { txn->snapshot[n++] = id; if (WT_TXNID_LT(id, snap_min)) snap_min = id; } } /* * If we got a new snapshot, update the published snap_min for this * session. */ WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, snap_min)); WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); txn_state->snap_min = snap_min; WT_ASSERT(session, txn_global->scan_count > 0); (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1); __txn_sort_snapshot(session, n, current_id); }
/* * __wt_txn_update_oldest -- * Sweep the running transactions to update the oldest ID required. */ int __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *oldest_session; WT_TXN_GLOBAL *txn_global; uint64_t current_id, last_running, oldest_id; uint64_t prev_last_running, prev_oldest_id; bool strict, wait; conn = S2C(session); txn_global = &conn->txn_global; strict = LF_ISSET(WT_TXN_OLDEST_STRICT); wait = LF_ISSET(WT_TXN_OLDEST_WAIT); current_id = last_running = txn_global->current; prev_last_running = txn_global->last_running; prev_oldest_id = txn_global->oldest_id; /* * For pure read-only workloads, or if the update isn't forced and the * oldest ID isn't too far behind, avoid scanning. */ if (prev_oldest_id == current_id || (!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100))) return (0); /* First do a read-only scan. */ if (wait) __wt_readlock(session, txn_global->scan_rwlock); else if ((ret = __wt_try_readlock(session, txn_global->scan_rwlock)) != 0) return (ret == EBUSY ? 0 : ret); __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session); __wt_readunlock(session, txn_global->scan_rwlock); /* * If the state hasn't changed (or hasn't moved far enough for * non-forced updates), give up. */ if ((oldest_id == prev_oldest_id || (!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) && ((last_running == prev_last_running) || (!strict && WT_TXNID_LT(last_running, prev_last_running + 100)))) return (0); /* It looks like an update is necessary, wait for exclusive access. */ if (wait) __wt_writelock(session, txn_global->scan_rwlock); else if ((ret = __wt_try_writelock(session, txn_global->scan_rwlock)) != 0) return (ret == EBUSY ? 0 : ret); /* * If the oldest ID has been updated while we waited, don't bother * scanning. */ if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) && WT_TXNID_LE(last_running, txn_global->last_running)) goto done; /* * Re-scan now that we have exclusive access. This is necessary because * threads get transaction snapshots with read locks, and we have to be * sure that there isn't a thread that has got a snapshot locally but * not yet published its snap_min. */ __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session); #ifdef HAVE_DIAGNOSTIC { /* * Make sure the ID doesn't move past any named snapshots. * * Don't include the read/assignment in the assert statement. Coverity * complains if there are assignments only done in diagnostic builds, * and when the read is from a volatile. */ uint64_t id = txn_global->nsnap_oldest_id; WT_ASSERT(session, id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); } #endif /* Update the oldest ID. */ if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) txn_global->oldest_id = oldest_id; if (WT_TXNID_LT(txn_global->last_running, last_running)) { txn_global->last_running = last_running; #ifdef HAVE_VERBOSE /* Output a verbose message about long-running transactions, * but only when some progress is being made. */ if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && current_id - oldest_id > 10000 && oldest_session != NULL) { __wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 " pinned in session %" PRIu32 " [%s]" " with snap_min %" PRIu64 "\n", oldest_id, oldest_session->id, oldest_session->lastop, oldest_session->txn.snap_min); } #endif } done: __wt_writeunlock(session, txn_global->scan_rwlock); return (ret); }
/* * __txn_oldest_scan -- * Sweep the running transactions to calculate the oldest ID required. */ static void __txn_oldest_scan(WT_SESSION_IMPL *session, uint64_t *oldest_idp, uint64_t *last_runningp, WT_SESSION_IMPL **oldest_sessionp) { WT_CONNECTION_IMPL *conn; WT_SESSION_IMPL *oldest_session; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s; uint64_t id, last_running, oldest_id, prev_oldest_id; uint32_t i, session_cnt; conn = S2C(session); txn_global = &conn->txn_global; oldest_session = NULL; /* The oldest ID cannot change while we are holding the scan lock. */ prev_oldest_id = txn_global->oldest_id; oldest_id = last_running = txn_global->current; /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { /* * Update the oldest ID. * * Ignore: IDs older than the oldest ID we saw. This can happen * if we race with a thread that is allocating an ID -- the ID * will not be used because the thread will keep spinning until * it gets a valid one. */ if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) && WT_TXNID_LT(id, last_running)) last_running = id; /* * !!! * Note: Don't ignore snap_min values older than the previous * oldest ID. Read-uncommitted operations publish snap_min * values without acquiring the scan lock to protect the global * table. See the comment in __wt_txn_cursor_op for * more details. */ if ((id = s->snap_min) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) { oldest_id = id; oldest_session = &conn->sessions[i]; } } if (WT_TXNID_LT(last_running, oldest_id)) oldest_id = last_running; /* The oldest ID can't move past any named snapshots. */ if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) oldest_id = id; *oldest_idp = oldest_id; *oldest_sessionp = oldest_session; *last_runningp = last_running; }
/* * __wt_txn_get_snapshot -- * Allocate a snapshot. */ int __wt_txn_get_snapshot(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s, *txn_state; uint64_t current_id, id; uint64_t prev_oldest_id, snap_min; uint32_t i, n, session_cnt; conn = S2C(session); txn = &session->txn; txn_global = &conn->txn_global; txn_state = WT_SESSION_TXN_STATE(session); n = 0; /* * Spin waiting for the lock: the sleeps in our blocking readlock * implementation are too slow for scanning the transaction table. */ while ((ret = __wt_try_readlock(session, txn_global->scan_rwlock)) == EBUSY) WT_PAUSE(); WT_RET(ret); current_id = snap_min = txn_global->current; prev_oldest_id = txn_global->oldest_id; /* * Include the checkpoint transaction, if one is running: we should * ignore any uncommitted changes the checkpoint has written to the * metadata. We don't have to keep the checkpoint's changes pinned so * don't including it in the published snap_min. */ if ((id = txn_global->checkpoint_txnid) != WT_TXN_NONE) txn->snapshot[n++] = id; /* For pure read-only workloads, avoid scanning. */ if (prev_oldest_id == current_id) { txn_state->snap_min = current_id; /* Check that the oldest ID has not moved in the meantime. */ WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); goto done; } /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { /* * Build our snapshot of any concurrent transaction IDs. * * Ignore: * - Our own ID: we always read our own updates. * - The ID if it is older than the oldest ID we saw. This * can happen if we race with a thread that is allocating * an ID -- the ID will not be used because the thread will * keep spinning until it gets a valid one. */ if (s != txn_state && (id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id)) { txn->snapshot[n++] = id; if (WT_TXNID_LT(id, snap_min)) snap_min = id; } } /* * If we got a new snapshot, update the published snap_min for this * session. */ WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, snap_min)); WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); txn_state->snap_min = snap_min; done: __wt_readunlock(session, txn_global->scan_rwlock); __txn_sort_snapshot(session, n, current_id); return (0); }
/* * __txn_oldest_scan -- * Sweep the running transactions to calculate the oldest ID required. */ static void __txn_oldest_scan(WT_SESSION_IMPL *session, uint64_t *oldest_idp, uint64_t *last_runningp, uint64_t *metadata_pinnedp, WT_SESSION_IMPL **oldest_sessionp) { WT_CONNECTION_IMPL *conn; WT_SESSION_IMPL *oldest_session; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s; uint64_t id, last_running, metadata_pinned, oldest_id, prev_oldest_id; uint32_t i, session_cnt; conn = S2C(session); txn_global = &conn->txn_global; oldest_session = NULL; /* The oldest ID cannot change while we are holding the scan lock. */ prev_oldest_id = txn_global->oldest_id; last_running = oldest_id = txn_global->current; if ((metadata_pinned = txn_global->checkpoint_state.id) == WT_TXN_NONE) metadata_pinned = oldest_id; /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { /* Update the last running transaction ID. */ if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) && WT_TXNID_LT(id, last_running)) last_running = id; /* Update the metadata pinned ID. */ if ((id = s->metadata_pinned) != WT_TXN_NONE && WT_TXNID_LT(id, metadata_pinned)) metadata_pinned = id; /* * !!! * Note: Don't ignore pinned ID values older than the previous * oldest ID. Read-uncommitted operations publish pinned ID * values without acquiring the scan lock to protect the global * table. See the comment in __wt_txn_cursor_op for more * details. */ if ((id = s->pinned_id) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) { oldest_id = id; oldest_session = &conn->sessions[i]; } } if (WT_TXNID_LT(last_running, oldest_id)) oldest_id = last_running; /* The oldest ID can't move past any named snapshots. */ if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) oldest_id = id; /* The metadata pinned ID can't move past the oldest ID. */ if (WT_TXNID_LT(oldest_id, metadata_pinned)) metadata_pinned = oldest_id; *last_runningp = last_running; *metadata_pinnedp = metadata_pinned; *oldest_idp = oldest_id; *oldest_sessionp = oldest_session; }
/* * __clsm_enter_update -- * Make sure an LSM cursor is ready to perform an update. */ static int __clsm_enter_update(WT_CURSOR_LSM *clsm) { WT_CURSOR *primary; WT_LSM_CHUNK *primary_chunk; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; bool hard_limit, have_primary, ovfl; lsm_tree = clsm->lsm_tree; ovfl = false; session = (WT_SESSION_IMPL *)clsm->iface.session; if (clsm->nchunks == 0) { primary = NULL; have_primary = false; } else { primary = clsm->cursors[clsm->nchunks - 1]; primary_chunk = clsm->primary_chunk; WT_ASSERT(session, F_ISSET(&session->txn, WT_TXN_HAS_ID)); have_primary = (primary != NULL && primary_chunk != NULL && (primary_chunk->switch_txn == WT_TXN_NONE || WT_TXNID_LT(session->txn.id, primary_chunk->switch_txn))); } /* * In LSM there are multiple btrees active at one time. The tree * switch code needs to use btree API methods, and it wants to * operate on the btree for the primary chunk. Set that up now. * * If the primary chunk has grown too large, set a flag so the worker * thread will switch when it gets a chance to avoid introducing high * latency into application threads. Don't do this indefinitely: if a * chunk grows twice as large as the configured size, block until it * can be switched. */ hard_limit = F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH); if (have_primary) { WT_ENTER_PAGE_INDEX(session); WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree, ovfl = __wt_btree_lsm_over_size(session, hard_limit ? 2 * lsm_tree->chunk_size : lsm_tree->chunk_size)); WT_LEAVE_PAGE_INDEX(session); /* If there was no overflow, we're done. */ if (!ovfl) return (0); } /* Request a switch. */ WT_RET(__wt_clsm_request_switch(clsm)); /* If we only overflowed the soft limit, we're done. */ if (have_primary && !hard_limit) return (0); WT_RET(__wt_clsm_await_switch(clsm)); return (0); }
/* * __clsm_enter -- * Start an operation on an LSM cursor, update if the tree has changed. */ static inline int __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) { WT_DECL_RET; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; WT_TXN *txn; uint64_t *switch_txnp; uint64_t snap_min; lsm_tree = clsm->lsm_tree; session = (WT_SESSION_IMPL *)clsm->iface.session; txn = &session->txn; /* Merge cursors never update. */ if (F_ISSET(clsm, WT_CLSM_MERGE)) return (0); if (reset) { WT_ASSERT(session, !F_ISSET(&clsm->iface, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT)); WT_RET(__clsm_reset_cursors(clsm, NULL)); } for (;;) { /* * If the cursor looks up-to-date, check if the cache is full. * In case this call blocks, the check will be repeated before * proceeding. */ if (clsm->dsk_gen != lsm_tree->dsk_gen && lsm_tree->nchunks != 0) goto open; if (clsm->dsk_gen != lsm_tree->dsk_gen && lsm_tree->nchunks != 0) goto open; /* Update the maximum transaction ID in the primary chunk. */ if (update) { /* * Ensure that there is a transaction snapshot active. */ WT_RET(__wt_txn_autocommit_check(session)); WT_RET(__wt_txn_id_check(session)); WT_RET(__clsm_enter_update(clsm)); if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen) goto open; if (txn->isolation == WT_ISO_SNAPSHOT) __wt_txn_cursor_op(session); /* * Figure out how many updates are required for * snapshot isolation. * * This is not a normal visibility check on the maximum * transaction ID in each chunk: any transaction ID * that overlaps with our snapshot is a potential * conflict. */ clsm->nupdates = 1; if (txn->isolation == WT_ISO_SNAPSHOT && F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)); snap_min = txn->snap_min; for (switch_txnp = &clsm->switch_txn[clsm->nchunks - 2]; clsm->nupdates < clsm->nchunks; clsm->nupdates++, switch_txnp--) { if (WT_TXNID_LT(*switch_txnp, snap_min)) break; WT_ASSERT(session, !__wt_txn_visible_all( session, *switch_txnp)); } } } /* * Stop when we are up-to-date, as long as this is: * - a snapshot isolation update and the cursor is set up for * that; * - an update operation with a primary chunk, or * - a read operation and the cursor is open for reading. */ if ((!update || txn->isolation != WT_ISO_SNAPSHOT || F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) && ((update && clsm->primary_chunk != NULL) || (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ)))) break; open: WT_WITH_SCHEMA_LOCK(session, ret = __clsm_open_cursors(clsm, update, 0, 0)); WT_RET(ret); } if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) { WT_RET(__cursor_enter(session)); F_SET(clsm, WT_CLSM_ACTIVE); } return (0); }