/*
 * __wt_txn_begin --
 *     Begin a transaction.
 */
int
__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *txn_state;

    conn = S2C(session);
    txn = &session->txn;
    txn_global = &conn->txn_global;
    txn_state = &txn_global->states[session->id];

    WT_ASSERT(session, txn_state->id == WT_TXN_NONE);

    WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
    if (cval.len == 0)
        txn->isolation = session->isolation;
    else
        txn->isolation =
            WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
            TXN_ISO_SNAPSHOT :
            WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
            TXN_ISO_READ_COMMITTED : TXN_ISO_READ_UNCOMMITTED;

    /*
     * Allocate a transaction ID.
     *
     * We use an atomic compare and swap to ensure that we get a unique ID
     * that is published before the global counter is updated.
     *
     * If two threads race to allocate an ID, only the latest ID will
     * proceed.  The winning thread can be sure its snapshot contains all
     * of the earlier active IDs.  Threads that race and get an earlier ID
     * may not appear in the snapshot, but they will loop and allocate a
     * new ID before proceeding to make any updates.
     *
     * This potentially wastes transaction IDs when threads race to begin
     * transactions: that is the price we pay to keep this path latch free.
     */
    do {
        txn_state->id = txn->id = txn_global->current;
    } while (!WT_ATOMIC_CAS(txn_global->current, txn->id, txn->id + 1));

    /*
     * If we have used 64-bits of transaction IDs, there is nothing more we
     * can do.
     */
    if (txn->id == WT_TXN_ABORTED)
        WT_RET_MSG(session, ENOMEM, "Out of transaction IDs");
    F_SET(txn, TXN_RUNNING);

    if (txn->isolation == TXN_ISO_SNAPSHOT)
        __wt_txn_refresh(session, WT_TXN_NONE, 1);

    return (0);
}
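/*
 * Illustrative sketch (not part of the source above): the same latch-free
 * ID-allocation loop expressed with C11 <stdatomic.h>.  The names
 * example_alloc_txn_id and example_current_id are hypothetical, and the
 * per-session publication into txn_state->id is omitted; this only shows
 * the compare-and-swap retry that WT_ATOMIC_CAS performs on the global
 * counter.
 */
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t example_current_id = 1;    /* global counter */

static uint64_t
example_alloc_txn_id(void)
{
    uint64_t id;

    /*
     * Read the current counter, then try to advance it.  If another
     * thread advanced it first, retry with the newer value, so every
     * caller leaves with a unique ID.
     */
    do {
        id = atomic_load(&example_current_id);
    } while (!atomic_compare_exchange_weak(&example_current_id, &id, id + 1));

    return (id);
}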
/*
 * __wt_txn_commit --
 *     Commit the current transaction.
 */
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_DECL_RET;
    WT_TXN *txn;
    WT_TXN_OP *op;
    u_int i;

    WT_UNUSED(cfg);

    txn = &session->txn;
    WT_ASSERT(session, !F_ISSET(txn, TXN_ERROR));

    if (!F_ISSET(txn, TXN_RUNNING))
        WT_RET_MSG(session, EINVAL, "No transaction is active");

    /* Commit notification. */
    if (txn->notify != NULL)
        WT_TRET(txn->notify->notify(
            txn->notify, (WT_SESSION *)session, txn->id, 1));

    /* If we are logging, write a commit log record. */
    if (ret == 0 && txn->mod_count > 0 && S2C(session)->logging &&
        !F_ISSET(session, WT_SESSION_LOGGING_DISABLED))
        ret = __wt_txn_log_commit(session, cfg);

    /*
     * If anything went wrong, roll back.
     *
     * !!!
     * Nothing can fail after this point.
     */
    if (ret != 0) {
        WT_TRET(__wt_txn_rollback(session, cfg));
        return (ret);
    }

    /* Free memory associated with updates. */
    for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
        __wt_txn_op_free(session, op);

    /*
     * Auto-commit transactions need a new transaction snapshot so that the
     * committed changes are visible to subsequent reads.  However, cursor
     * keys and values will point to the data that was just modified, so
     * the snapshot cannot be so new that updates could be freed underneath
     * the cursor.  Get the new snapshot before releasing the ID for the
     * commit.
     */
    if (session->ncursors > 0 && txn->isolation != TXN_ISO_READ_UNCOMMITTED)
        __wt_txn_refresh(session, txn->id + 1, 1);
    __wt_txn_release(session);

    return (0);
}
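/*
 * Caller-side sketch (not part of the source above): how the public
 * WT_SESSION API drives __wt_txn_begin and __wt_txn_commit.  Cursor setup
 * and error handling are abbreviated, and the key/value strings are
 * hypothetical; this assumes a string-keyed, string-valued table.
 */
#include <wiredtiger.h>

static int
example_insert_in_txn(WT_SESSION *wt_session, WT_CURSOR *cursor)
{
    int ret;

    /* Starts a snapshot-isolation transaction (__wt_txn_begin). */
    if ((ret = wt_session->begin_transaction(
        wt_session, "isolation=snapshot")) != 0)
        return (ret);

    cursor->set_key(cursor, "key");
    cursor->set_value(cursor, "value");
    if ((ret = cursor->insert(cursor)) != 0) {
        /* On failure, release the transaction's ID and snapshot. */
        (void)wt_session->rollback_transaction(wt_session, NULL);
        return (ret);
    }

    /* Logs the commit and releases the transaction (__wt_txn_commit). */
    return (wt_session->commit_transaction(wt_session, NULL));
}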
/*
 * __wt_txn_update_oldest --
 *     Sweep the running transactions to update the oldest ID required.
 */
void
__wt_txn_update_oldest(WT_SESSION_IMPL *session)
{
    /*
     * !!!
     * If a data-source is calling the WT_EXTENSION_API.transaction_oldest
     * method (for the oldest transaction ID not yet visible to a running
     * transaction), and then comparing that oldest ID against committed
     * transactions to see if updates for a committed transaction are still
     * visible to running transactions, the oldest transaction ID may be
     * the same as the last committed transaction ID, if the transaction
     * state wasn't refreshed after the last transaction committed.  Push
     * past the last committed transaction.
     */
    __wt_txn_refresh(session, WT_TXN_NONE, 0);
}
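/*
 * Illustrative sketch (not part of the source above): the visibility check
 * a data-source might build on the oldest-ID value described in the comment.
 * The function name and the simplified comparison (no ID wrap-around
 * handling) are hypothetical; the point is only that the oldest ID must be
 * pushed past the last committed transaction for this test to be meaningful.
 */
#include <stdint.h>

/*
 * Return nonzero if an update committed with commit_id is visible to every
 * running transaction, given oldest_id from the extension API's
 * transaction_oldest method.
 */
static int
example_update_globally_visible(uint64_t commit_id, uint64_t oldest_id)
{
    /*
     * If oldest_id were allowed to equal the last committed ID because the
     * transaction state was stale, the newest committed update would be
     * wrongly reported as still invisible to some running transaction.
     */
    return (commit_id < oldest_id);
}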
/*
 * __sync_file --
 *     Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, int syncop)
{
    struct timespec end, start;
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    WT_REF *walk;
    WT_TXN *txn;
    uint64_t internal_bytes, leaf_bytes;
    uint64_t internal_pages, leaf_pages;
    uint32_t flags;
    int evict_reset;

    btree = S2BT(session);
    flags = WT_READ_CACHE | WT_READ_NO_GEN;
    walk = NULL;
    txn = &session->txn;

    internal_bytes = leaf_bytes = 0;
    internal_pages = leaf_pages = 0;
    if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
        WT_RET(__wt_epoch(session, &start));

    switch (syncop) {
    case WT_SYNC_WRITE_LEAVES:
        /*
         * Write all immediately available, dirty in-cache leaf pages.
         *
         * Writing the leaf pages is done without acquiring a high-level
         * lock; serialize so multiple threads don't walk the tree at the
         * same time.
         */
        if (!btree->modified)
            return (0);
        __wt_spin_lock(session, &btree->flush_lock);
        if (!btree->modified) {
            __wt_spin_unlock(session, &btree->flush_lock);
            return (0);
        }

        flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
        for (walk = NULL;;) {
            WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
            if (walk == NULL)
                break;

            /*
             * Write dirty pages if nobody beat us to it.  Don't try to
             * write the hottest pages: checkpoint will have to visit
             * them anyway.
             */
            page = walk->page;
            if (__wt_page_is_modified(page) &&
                __wt_txn_visible_all(
                session, page->modify->update_txn)) {
                if (txn->isolation == TXN_ISO_READ_COMMITTED)
                    __wt_txn_refresh(session, WT_TXN_NONE, 1);
                leaf_bytes += page->memory_footprint;
                ++leaf_pages;
                WT_ERR(__wt_reconcile(session, walk, NULL, 0));
            }
        }
        break;
    case WT_SYNC_CHECKPOINT:
        /*
         * We cannot check the tree modified flag in the case of a
         * checkpoint; the checkpoint code has already cleared it.
         *
         * Writing the leaf pages is done without acquiring a high-level
         * lock; serialize so multiple threads don't walk the tree at the
         * same time.  We're holding the schema lock, but need the
         * lower-level lock as well.
         */
        __wt_spin_lock(session, &btree->flush_lock);

        /*
         * When internal pages are being reconciled by checkpoint their
         * child pages cannot disappear from underneath them or be split
         * into them, nor can underlying blocks be freed until the block
         * lists for the checkpoint are stable.  Set the checkpointing
         * flag to block eviction of dirty pages until the checkpoint's
         * internal page pass is complete, then wait for any existing
         * eviction to complete.
         */
        btree->checkpointing = 1;
        WT_FULL_BARRIER();

        WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
        if (evict_reset)
            __wt_evict_file_exclusive_off(session);

        /* Write all dirty in-cache pages. */
        flags |= WT_READ_NO_EVICT;
        for (walk = NULL;;) {
            WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
            if (walk == NULL)
                break;

            /*
             * Write dirty pages, unless we can be sure they only became
             * dirty after the checkpoint started.
             *
             * We can skip dirty pages if:
             * (1) they are leaf pages;
             * (2) there is a snapshot transaction active (which is the
             *     case in ordinary application checkpoints but not all
             *     internal cases); and
             * (3) the first dirty update on the page is sufficiently
             *     recent that the checkpoint transaction would skip
             *     them.
             */
            page = walk->page;
            mod = page->modify;
            if (__wt_page_is_modified(page) &&
                (WT_PAGE_IS_INTERNAL(page) ||
                !F_ISSET(txn, TXN_HAS_SNAPSHOT) ||
                TXNID_LE(mod->first_dirty_txn, txn->snap_max))) {
                if (WT_PAGE_IS_INTERNAL(page)) {
                    internal_bytes += page->memory_footprint;
                    ++internal_pages;
                } else {
                    leaf_bytes += page->memory_footprint;
                    ++leaf_pages;
                }
                WT_ERR(__wt_reconcile(session, walk, NULL, 0));
            }
        }
        break;
    }

    if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
        WT_ERR(__wt_epoch(session, &end));
        WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
            "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
            " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
            " bytes, %" PRIu64 " pages of internal\n\t"
            "Took: %" PRIu64 "ms",
            syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT",
            leaf_bytes, leaf_pages, internal_bytes, internal_pages,
            WT_TIMEDIFF(end, start) / WT_MILLION));
    }

err:
    /* On error, clear any left-over tree walk. */
    if (walk != NULL)
        WT_TRET(__wt_page_release(session, walk, flags));

    if (txn->isolation == TXN_ISO_READ_COMMITTED && session->ncursors == 0)
        __wt_txn_release_snapshot(session);

    if (btree->checkpointing) {
        /*
         * Clear the checkpoint flag and push the change; not required,
         * but publishing the change means stalled eviction gets moving
         * as soon as possible.
         */
        btree->checkpointing = 0;
        WT_FULL_BARRIER();

        /*
         * If this tree was being skipped by the eviction server during
         * the checkpoint, clear the wait.
         */
        btree->evict_walk_period = 0;

        /*
         * Wake the eviction server, in case application threads have
         * stalled while the eviction server decided it couldn't make
         * progress.  Without this, application threads will be stalled
         * until the eviction server next wakes.
         */
        WT_TRET(__wt_evict_server_wake(session));
    }

    __wt_spin_unlock(session, &btree->flush_lock);

    /*
     * Leaves are written before a checkpoint (or as part of a file close,
     * before checkpointing the file).  Start a flush to stable storage,
     * but don't wait for it.
     */
    if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES)
        WT_RET(btree->bm->sync(btree->bm, session, 1));

    return (ret);
}
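/*
 * Illustrative sketch (not part of the source above): the flag-plus-barrier
 * publication used for btree->checkpointing, expressed with C11 atomics.
 * The names example_checkpointing, example_checkpoint_start/_end and
 * example_evict_blocked are hypothetical; a sequentially-consistent store
 * plays the role of "set the flag; WT_FULL_BARRIER()".
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool example_checkpointing;

/* Checkpoint side: publish the flag before the internal-page pass. */
static void
example_checkpoint_start(void)
{
    atomic_store_explicit(&example_checkpointing, true, memory_order_seq_cst);
}

/* Checkpoint side: clear the flag so stalled eviction gets moving again. */
static void
example_checkpoint_end(void)
{
    atomic_store_explicit(&example_checkpointing, false, memory_order_seq_cst);
}

/* Eviction side: dirty-page eviction stays blocked while the flag is set. */
static bool
example_evict_blocked(void)
{
    return (atomic_load_explicit(&example_checkpointing, memory_order_seq_cst));
}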