/*
 * __sync_file --
 *     Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
    WT_BTREE *btree;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    WT_REF *prev, *walk;
    WT_TXN *txn;
    uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
    uint64_t oldest_id, saved_pinned_id, time_start, time_stop;
    uint32_t flags;
    bool timer, tried_eviction;

    conn = S2C(session);
    btree = S2BT(session);
    prev = walk = NULL;
    txn = &session->txn;
    tried_eviction = false;
    time_start = time_stop = 0;

    /* Only visit pages in cache and don't bump page read generations. */
    flags = WT_READ_CACHE | WT_READ_NO_GEN;

    /*
     * Skip all deleted pages. For a page to be marked deleted, it must have
     * been evicted from cache and marked clean. Checkpoint should never
     * instantiate deleted pages: if a truncate is not visible to the
     * checkpoint, the on-disk version is correct. If the truncate is
     * visible, we skip over the child page when writing its parent. We
     * check whether a truncate is visible in the checkpoint as part of
     * reconciling internal pages (specifically in __rec_child_modify).
     */
    LF_SET(WT_READ_DELETED_SKIP);

    internal_bytes = leaf_bytes = 0;
    internal_pages = leaf_pages = 0;
    saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
    timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT);
    if (timer)
        time_start = __wt_clock(session);

    switch (syncop) {
    case WT_SYNC_WRITE_LEAVES:
        /*
         * Write all immediately available, dirty in-cache leaf pages.
         *
         * Writing the leaf pages is done without acquiring a high-level
         * lock; serialize so multiple threads don't walk the tree at the
         * same time.
         */
        if (!btree->modified)
            return (0);
        __wt_spin_lock(session, &btree->flush_lock);
        if (!btree->modified) {
            __wt_spin_unlock(session, &btree->flush_lock);
            return (0);
        }

        /*
         * Save the oldest transaction ID we need to keep around. Otherwise,
         * in a busy system, we could be updating pages so fast that write
         * leaves never catches up. We deliberately have no transaction
         * running at this point that would keep the oldest ID from moving
         * forwards as we walk the tree.
         */
        oldest_id = __wt_txn_oldest_id(session);

        LF_SET(WT_READ_NO_WAIT | WT_READ_SKIP_INTL);
        for (;;) {
            WT_ERR(__wt_tree_walk(session, &walk, flags));
            if (walk == NULL)
                break;

            /*
             * Write dirty pages if nobody beat us to it. Don't try to
             * write hot pages (defined as pages that have been updated
             * since the write-leaves phase started): checkpoint will have
             * to visit them anyway.
             */
            page = walk->page;
            if (__wt_page_is_modified(page) &&
                WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
                if (txn->isolation == WT_ISO_READ_COMMITTED)
                    __wt_txn_get_snapshot(session);
                leaf_bytes += page->memory_footprint;
                ++leaf_pages;
                WT_ERR(__wt_reconcile(session, walk, NULL, WT_REC_CHECKPOINT, NULL));
            }
        }
        break;
    case WT_SYNC_CHECKPOINT:
        /*
         * If we are flushing a file at read-committed isolation, which is
         * of particular interest for flushing the metadata to make a
         * schema-changing operation durable, get a transactional snapshot
         * now.
         *
         * All changes committed up to this point should be included. We
         * don't update the snapshot in between pages because the metadata
         * shouldn't have many pages. Instead, read-committed isolation
         * ensures that all metadata updates completed before the
         * checkpoint are included.
         */
        if (txn->isolation == WT_ISO_READ_COMMITTED)
            __wt_txn_get_snapshot(session);

        /*
         * We cannot check the tree modified flag in the case of a
         * checkpoint; the checkpoint code has already cleared it.
         *
         * Writing the leaf pages is done without acquiring a high-level
         * lock; serialize so multiple threads don't walk the tree at the
         * same time. We're holding the schema lock, but need the
         * lower-level lock as well.
         */
        __wt_spin_lock(session, &btree->flush_lock);

        /*
         * In the final checkpoint pass, child pages cannot be evicted from
         * underneath internal pages nor can underlying blocks be freed
         * until the checkpoint's block lists are stable. Also, we cannot
         * split child pages into parents unless we know the final pass
         * will write a consistent view of that namespace. Set the
         * checkpointing flag to block such actions and wait for any
         * problematic eviction or page splits to complete.
         */
        WT_ASSERT(session,
            btree->syncing == WT_BTREE_SYNC_OFF && btree->sync_session == NULL);
        btree->sync_session = session;
        btree->syncing = WT_BTREE_SYNC_WAIT;
        (void)__wt_gen_next_drain(session, WT_GEN_EVICT);
        btree->syncing = WT_BTREE_SYNC_RUNNING;

        /* Write all dirty in-cache pages. */
        LF_SET(WT_READ_NO_EVICT);

        /* Read pages with lookaside entries and evict them asap. */
        LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED);

        for (;;) {
            WT_ERR(__sync_dup_walk(session, walk, flags, &prev));
            WT_ERR(__wt_tree_walk(session, &walk, flags));
            if (walk == NULL)
                break;

            /*
             * Skip clean pages, but make sure the maximum transaction ID
             * is always updated.
             */
            if (!__wt_page_is_modified(walk->page)) {
                if (((mod = walk->page->modify) != NULL) &&
                    mod->rec_max_txn > btree->rec_max_txn)
                    btree->rec_max_txn = mod->rec_max_txn;
                if (mod != NULL &&
                    btree->rec_max_timestamp < mod->rec_max_timestamp)
                    btree->rec_max_timestamp = mod->rec_max_timestamp;
                continue;
            }

            /*
             * Take a local reference to the page now that we know the page
             * is dirty. It needs to be done in this order, otherwise the
             * page modify structure could have been created between taking
             * the reference and checking modified.
             */
            page = walk->page;

            /*
             * Write dirty pages, if we can't skip them. If we skip a page,
             * mark the tree dirty. The checkpoint marked it clean and we
             * can't skip future checkpoints until this page is written.
             */
            if (__sync_checkpoint_can_skip(session, page)) {
                __wt_tree_modify_set(session);
                continue;
            }

            if (WT_PAGE_IS_INTERNAL(page)) {
                internal_bytes += page->memory_footprint;
                ++internal_pages;
            } else {
                leaf_bytes += page->memory_footprint;
                ++leaf_pages;
            }

            /*
             * If the page was pulled into cache by our read, try to evict
             * it now.
             *
             * For eviction to have a chance, we first need to move the
             * walk point to the next page checkpoint will visit. We want
             * to avoid this code being too special purpose, so try to
             * reuse the ordinary eviction path.
             *
             * Regardless of whether eviction succeeds or fails, the walk
             * continues from the previous location. We remember whether we
             * tried eviction, and don't try again. Even if eviction fails
             * (the page may stay in cache clean but with history that
             * cannot be discarded), that is not wasted effort because
             * checkpoint doesn't need to write the page again.
             */
            if (!WT_PAGE_IS_INTERNAL(page) &&
                page->read_gen == WT_READGEN_WONT_NEED && !tried_eviction) {
                WT_ERR_BUSY_OK(__wt_page_release_evict(session, walk));
                walk = prev;
                prev = NULL;
                tried_eviction = true;
                continue;
            }
            tried_eviction = false;

            WT_ERR(__wt_reconcile(session, walk, NULL, WT_REC_CHECKPOINT, NULL));

            /*
             * Update checkpoint IO tracking data if configured to log
             * verbose progress messages.
             */
            if (conn->ckpt_timer_start.tv_sec > 0) {
                conn->ckpt_write_bytes += page->memory_footprint;
                ++conn->ckpt_write_pages;

                /* Periodically log checkpoint progress. */
                if (conn->ckpt_write_pages % 5000 == 0)
                    __wt_checkpoint_progress(session, false);
            }
        }
        break;
    case WT_SYNC_CLOSE:
    case WT_SYNC_DISCARD:
        WT_ERR(__wt_illegal_value(session, syncop));
        break;
    }

    if (timer) {
        time_stop = __wt_clock(session);
        __wt_verbose(session, WT_VERB_CHECKPOINT,
            "__sync_file WT_SYNC_%s wrote: %" PRIu64 " leaf pages (%" PRIu64
            "B), %" PRIu64 " internal pages (%" PRIu64 "B), and took %" PRIu64 "ms",
            syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT",
            leaf_pages, leaf_bytes, internal_pages, internal_bytes,
            WT_CLOCKDIFF_MS(time_stop, time_start));
    }

err:
    /* On error, clear any left-over tree walk. */
    WT_TRET(__wt_page_release(session, walk, flags));
    WT_TRET(__wt_page_release(session, prev, flags));

    /*
     * If we got a snapshot in order to write pages, and there was no
     * snapshot active when we started, release it.
     */
    if (txn->isolation == WT_ISO_READ_COMMITTED && saved_pinned_id == WT_TXN_NONE)
        __wt_txn_release_snapshot(session);

    /* Clear the checkpoint flag. */
    btree->syncing = WT_BTREE_SYNC_OFF;
    btree->sync_session = NULL;

    __wt_spin_unlock(session, &btree->flush_lock);

    /*
     * Leaves are written before a checkpoint (or as part of a file close,
     * before checkpointing the file). Start a flush to stable storage, but
     * don't wait for it.
     */
    if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC))
        WT_RET(btree->bm->sync(btree->bm, session, false));

    return (ret);
}
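/*
 * The write-leaves pass above skips "hot" pages by comparing the page's last
 * update transaction against the oldest active transaction ID. The standalone
 * sketch below (guarded out of the build; all names are hypothetical, not
 * WiredTiger's) illustrates that decision in isolation: a page is worth
 * writing early only if it is dirty and its most recent update already
 * precedes every running transaction, so no in-flight update can race with
 * the write.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

struct leaf_page {
    bool dirty;          /* page has unwritten changes */
    uint64_t update_txn; /* ID of the last transaction to update the page */
};

/*
 * A page is "cold" enough for the write-leaves pass when its last update
 * precedes the oldest transaction still running; otherwise checkpoint will
 * have to revisit the page anyway, so writing it now is wasted work.
 */
static bool
write_leaves_should_write(const struct leaf_page *page, uint64_t oldest_id)
{
    return (page->dirty && page->update_txn < oldest_id);
}
#endif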
/*
 * __wt_log_slot_join --
 *     Join a consolidated logging slot.
 */
void
__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    WT_LOGSLOT *slot;
    uint64_t time_start, time_stop, usecs;
    int64_t flag_state, new_state, old_state, released;
    int32_t join_offset, new_join, wait_cnt;
    bool closed, diag_yield, raced, slept, unbuffered, yielded;

    conn = S2C(session);
    log = conn->log;
    time_start = time_stop = 0;

    WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT));
    WT_ASSERT(session, mysize != 0);

    /*
     * There should almost always be a slot open.
     */
    unbuffered = yielded = false;
    closed = raced = slept = false;
    wait_cnt = 0;
#ifdef HAVE_DIAGNOSTIC
    diag_yield = (++log->write_calls % 7) == 0;
    if ((log->write_calls % WT_THOUSAND) == 0 || mysize > WT_LOG_SLOT_BUF_MAX) {
#else
    diag_yield = false;
    if (mysize > WT_LOG_SLOT_BUF_MAX) {
#endif
        unbuffered = true;
        F_SET(myslot, WT_MYSLOT_UNBUFFERED);
    }
    for (;;) {
        WT_BARRIER();
        slot = log->active_slot;
        old_state = slot->slot_state;
        if (WT_LOG_SLOT_OPEN(old_state)) {
            /*
             * Try to join our size into the existing size and atomically
             * write it back into the state.
             */
            flag_state = WT_LOG_SLOT_FLAGS(old_state);
            released = WT_LOG_SLOT_RELEASED(old_state);
            join_offset = WT_LOG_SLOT_JOINED(old_state);
            if (unbuffered)
                new_join = join_offset + WT_LOG_SLOT_UNBUFFERED;
            else
                new_join = join_offset + (int32_t)mysize;
            new_state = (int64_t)WT_LOG_SLOT_JOIN_REL(
                (int64_t)new_join, (int64_t)released, (int64_t)flag_state);

            /*
             * Braces used due to potential empty body warning.
             */
            if (diag_yield) {
                WT_DIAGNOSTIC_YIELD;
            }
            /*
             * Attempt to swap our size into the state.
             */
            if (__wt_atomic_casiv64(&slot->slot_state, old_state, new_state))
                break;
            WT_STAT_CONN_INCR(session, log_slot_races);
            raced = true;
        } else {
            WT_STAT_CONN_INCR(session, log_slot_active_closed);
            closed = true;
            ++wait_cnt;
        }
        if (!yielded)
            time_start = __wt_clock(session);
        yielded = true;

        /*
         * The slot is no longer open or we lost the race to update it.
         * Yield and try again.
         */
        if (wait_cnt < WT_THOUSAND)
            __wt_yield();
        else {
            __wt_sleep(0, WT_THOUSAND);
            slept = true;
        }
    }

    /*
     * We joined this slot. Fill in our information to return to the caller.
     */
    if (!yielded)
        WT_STAT_CONN_INCR(session, log_slot_immediate);
    else {
        WT_STAT_CONN_INCR(session, log_slot_yield);
        time_stop = __wt_clock(session);
        usecs = WT_CLOCKDIFF_US(time_stop, time_start);
        WT_STAT_CONN_INCRV(session, log_slot_yield_duration, usecs);
        if (closed)
            WT_STAT_CONN_INCR(session, log_slot_yield_close);
        if (raced)
            WT_STAT_CONN_INCR(session, log_slot_yield_race);
        if (slept)
            WT_STAT_CONN_INCR(session, log_slot_yield_sleep);
    }
    if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
        F_SET(slot, WT_SLOT_SYNC_DIR);
    if (LF_ISSET(WT_LOG_FLUSH))
        F_SET(slot, WT_SLOT_FLUSH);
    if (LF_ISSET(WT_LOG_FSYNC))
        F_SET(slot, WT_SLOT_SYNC);
    if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) {
        WT_ASSERT(session, slot->slot_unbuffered == 0);
        WT_STAT_CONN_INCR(session, log_slot_unbuffered);
        slot->slot_unbuffered = (int64_t)mysize;
    }
    myslot->slot = slot;
    myslot->offset = join_offset;
    myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize);
}
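/*
 * When the active slot is closed or the join CAS races, the loop above backs
 * off before retrying. Below is a standalone sketch of that two-phase
 * backoff (guarded out of the build; names are hypothetical): spin with a
 * scheduler yield while contention is expected to be brief, then drop to a
 * one-millisecond sleep so a stalled slot switch doesn't burn CPU.
 */
#if 0
#include <sched.h>
#include <unistd.h>

#define SPIN_LIMIT 1000 /* yields before falling back to sleeping */

static void
join_backoff(int *wait_cnt)
{
    if (++*wait_cnt < SPIN_LIMIT)
        (void)sched_yield(); /* fast path: slot switches are usually quick */
    else
        (void)usleep(1000); /* slow path: sleep 1ms, stop spinning */
}
#endif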
/*
 * __wt_log_slot_release --
 *     Each thread in a consolidated group releases its portion to signal it
 *     has completed copying its piece of the log into the memory buffer.
 */
int64_t
__wt_log_slot_release(WT_MYSLOT *myslot, int64_t size)
{
    WT_LOGSLOT *slot;
    wt_off_t cur_offset, my_start;
    int64_t my_size, rel_size;

    slot = myslot->slot;
    my_start = slot->slot_start_offset + myslot->offset;

    /*
     * We maintain the last starting offset within this slot. This is used to
     * know the offset of the last record that was written rather than the
     * beginning record of the slot.
     */
    while ((cur_offset = slot->slot_last_offset) < my_start) {
        /*
         * Set our offset if we are larger.
         */
        if (__wt_atomic_casiv64(&slot->slot_last_offset, cur_offset, my_start))
            break;
        /*
         * If we raced another thread updating this, try again.
         */
        WT_BARRIER();
    }

    /*
     * Add my size into the state and return the new size.
     */
    rel_size = size;
    if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
        rel_size = WT_LOG_SLOT_UNBUFFERED;
    my_size = (int64_t)WT_LOG_SLOT_JOIN_REL((int64_t)0, rel_size, 0);
    return (__wt_atomic_addiv64(&slot->slot_state, my_size));
}
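/*
 * The slot_last_offset update above is a lock-free "monotonic maximum": many
 * releasing threads race, and the stored value only ever grows. A standalone
 * C11-atomics sketch of the same pattern (guarded out of the build; names
 * are hypothetical):
 */
#if 0
#include <stdatomic.h>
#include <stdint.h>

static void
publish_max(_Atomic int64_t *last_offset, int64_t my_start)
{
    int64_t cur;

    cur = atomic_load(last_offset);
    /*
     * Loop until either another thread has published a larger value or our
     * CAS succeeds; a failed CAS refreshes cur with the current value.
     */
    while (cur < my_start &&
        !atomic_compare_exchange_weak(last_offset, &cur, my_start))
        ;
}
#endif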
/*
 * __log_slot_close --
 *     Close out the slot the caller is using. The slot may already be closed
 *     or freed by another thread.
 */
static int
__log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *releasep, bool forced)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    int64_t end_offset, new_state, old_state;
#ifdef HAVE_DIAGNOSTIC
    uint64_t time_start, time_stop;
    int count;
#endif

    *releasep = false;

    WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
    conn = S2C(session);
    log = conn->log;
    if (slot == NULL)
        return (WT_NOTFOUND);
retry:
    old_state = slot->slot_state;
    /*
     * If this close is coming from a forced close and a thread is in the
     * middle of using the slot, return EBUSY. The caller can decide if
     * retrying is necessary or not.
     */
    if (forced && WT_LOG_SLOT_INPROGRESS(old_state))
        return (__wt_set_return(session, EBUSY));
    /*
     * If someone else is switching out this slot, we lost. Nothing to do but
     * return. Return WT_NOTFOUND anytime the given slot was processed by
     * another closing thread. Only return 0 when we actually closed the
     * slot.
     */
    if (WT_LOG_SLOT_CLOSED(old_state)) {
        WT_STAT_CONN_INCR(session, log_slot_close_race);
        return (WT_NOTFOUND);
    }
    /*
     * If someone completely processed this slot, we're done.
     */
    if (FLD_LOG_SLOT_ISSET((uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED)) {
        WT_STAT_CONN_INCR(session, log_slot_close_race);
        return (WT_NOTFOUND);
    }
    new_state = (old_state | WT_LOG_SLOT_CLOSE);
    /*
     * Close this slot. If we lose the race, retry.
     */
    if (!__wt_atomic_casiv64(&slot->slot_state, old_state, new_state))
        goto retry;
    /*
     * We own the slot now. No one else can join. Set the end LSN.
     */
    WT_STAT_CONN_INCR(session, log_slot_closes);
    if (WT_LOG_SLOT_DONE(new_state))
        *releasep = true;
    slot->slot_end_lsn = slot->slot_start_lsn;
    /*
     * A thread setting the unbuffered flag sets the unbuffered size after
     * setting the flag. There could be a delay between a thread setting the
     * flag, a thread closing the slot, and the original thread setting that
     * value. If the state is unbuffered, wait for the unbuffered size to be
     * set.
     */
#ifdef HAVE_DIAGNOSTIC
    count = 0;
    time_start = __wt_clock(session);
#endif
    if (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state)) {
        while (slot->slot_unbuffered == 0) {
            WT_STAT_CONN_INCR(session, log_slot_close_unbuf);
            __wt_yield();
#ifdef HAVE_DIAGNOSTIC
            ++count;
            if (count > WT_MILLION) {
                time_stop = __wt_clock(session);
                if (WT_CLOCKDIFF_SEC(time_stop, time_start) > 10) {
                    __wt_errx(session,
                        "SLOT_CLOSE: Slot %" PRIu32 " Timeout unbuffered, state 0x%" PRIx64
                        " unbuffered %" PRId64,
                        (uint32_t)(slot - &log->slot_pool[0]), (uint64_t)slot->slot_state,
                        slot->slot_unbuffered);
                    __log_slot_dump(session);
                    __wt_abort(session);
                }
                count = 0;
            }
#endif
        }
    }

    end_offset = WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered;
    slot->slot_end_lsn.l.offset += (uint32_t)end_offset;
    WT_STAT_CONN_INCRV(session, log_slot_consolidated, end_offset);
    /*
     * XXX Would like to change so one piece of code advances the LSN.
     */
    log->alloc_lsn = slot->slot_end_lsn;
    WT_ASSERT(session, log->alloc_lsn.l.file >= log->write_lsn.l.file);
    return (0);
}
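/*
 * __log_slot_close works because the joined count, released count and
 * lifecycle flags all live in one atomic state word, so "closed" and "every
 * joiner has released" can be tested together (the WT_LOG_SLOT_DONE check).
 * The guarded sketch below shows one hypothetical packing of such a word;
 * the field layout is illustrative only and is not WiredTiger's actual
 * WT_LOG_SLOT_* encoding.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

#define SLOT_CLOSE (1ULL << 62)                      /* no new joiners */
#define SLOT_JOINED(s) (((s) >> 32) & 0x3fffffffULL) /* bytes joined */
#define SLOT_RELEASED(s) ((s)&0xffffffffULL)         /* bytes released */

/* The slot can be written out once it is closed and fully released. */
static bool
slot_done(uint64_t state)
{
    return ((state & SLOT_CLOSE) != 0 && SLOT_JOINED(state) == SLOT_RELEASED(state));
}
#endif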
/*
 * __log_slot_new --
 *     Find a free slot and switch it in as the new active slot. Must be
 *     called holding the slot lock.
 */
static int
__log_slot_new(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    WT_LOGSLOT *slot;
    int32_t i, pool_i;
#ifdef HAVE_DIAGNOSTIC
    uint64_t time_start, time_stop;
    int count;
#endif

    WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
    conn = S2C(session);
    log = conn->log;
    /*
     * Although this function is single threaded, multiple threads could be
     * trying to set a new active slot sequentially. If we find an active
     * slot that is valid, return.
     */
    if ((slot = log->active_slot) != NULL && WT_LOG_SLOT_OPEN(slot->slot_state))
        return (0);

#ifdef HAVE_DIAGNOSTIC
    count = 0;
    time_start = __wt_clock(session);
#endif
    /*
     * Keep trying until we can find a free slot.
     */
    for (;;) {
        /*
         * Rotate among the slots to lessen collisions.
         */
        WT_RET(WT_SESSION_CHECK_PANIC(session));
        for (i = 0, pool_i = log->pool_index; i < WT_SLOT_POOL; i++, pool_i++) {
            if (pool_i >= WT_SLOT_POOL)
                pool_i = 0;
            slot = &log->slot_pool[pool_i];
            if (slot->slot_state == WT_LOG_SLOT_FREE) {
                /*
                 * Acquire our starting position in the log file. Assume
                 * the full buffer size.
                 */
                WT_RET(__wt_log_acquire(session, log->slot_buf_size, slot));
                /*
                 * We have a new, initialized slot to use. Set it as the
                 * active slot.
                 */
                log->active_slot = slot;
                log->pool_index = pool_i;
                return (0);
            }
        }
        /*
         * If we didn't find any free slots, signal the worker thread.
         */
        WT_STAT_CONN_INCR(session, log_slot_no_free_slots);
        __wt_cond_signal(session, conn->log_wrlsn_cond);
        __wt_yield();
#ifdef HAVE_DIAGNOSTIC
        ++count;
        if (count > WT_MILLION) {
            time_stop = __wt_clock(session);
            if (WT_CLOCKDIFF_SEC(time_stop, time_start) > 10) {
                __wt_errx(session, "SLOT_NEW: Timeout free slot");
                __log_slot_dump(session);
                __wt_abort(session);
            }
            count = 0;
        }
#endif
    }
    /* NOTREACHED */
}
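/*
 * The pool scan above starts from the index of the last slot handed out
 * rather than from zero, spreading successive searches across the pool. A
 * standalone sketch of that rotation (guarded out of the build; names are
 * hypothetical):
 */
#if 0
#include <stdbool.h>

#define POOL_SIZE 16

/*
 * Scan the pool once, starting at *pool_index and wrapping, and return the
 * first free slot's index or -1 if the pool is exhausted (the caller would
 * then wake the worker thread and retry, as __log_slot_new does).
 */
static int
find_free_slot(int *pool_index, const bool *slot_free)
{
    int i, pos;

    for (i = 0, pos = *pool_index; i < POOL_SIZE; i++, pos++) {
        if (pos >= POOL_SIZE)
            pos = 0;
        if (slot_free[pos]) {
            *pool_index = pos;
            return (pos);
        }
    }
    return (-1);
}
#endif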