/*
 * __wt_cond_signal --
 *     Signal a waiting thread.
 */
void
__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
{
    __wt_verbose(session, WT_VERB_MUTEX, "signal %s", cond->name);

    /*
     * Our callers often set flags to cause a thread to exit. Add a barrier
     * to ensure exit flags are seen by the sleeping threads, otherwise we
     * can wake up a thread, it immediately goes back to sleep, and we'll
     * hang. Use a full barrier (we may not write before waiting on thread
     * join).
     */
    WT_FULL_BARRIER();

    /*
     * Fast path if we are in (or can enter) a state where the next waiter
     * will return immediately as already signaled.
     */
    if (cond->waiters == -1 ||
        (cond->waiters == 0 && __wt_atomic_casi32(&cond->waiters, 0, -1)))
        return;

    EnterCriticalSection(&cond->mtx);
    WakeAllConditionVariable(&cond->cond);
    LeaveCriticalSection(&cond->mtx);
}
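/*
 * Illustrative sketch (not WiredTiger source): the wait side that pairs
 * with the fast path above. A waiters value of -1 means "already
 * signaled", so the next waiter can consume the signal and return without
 * blocking. The names and layout below are assumptions for illustration.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct sketch_cond {
    atomic_int waiters;    /* -1 means pre-signaled */
};

/* Return true if a pending signal was consumed without blocking. */
static bool
sketch_cond_try_consume(struct sketch_cond *cond)
{
    int expected = -1;

    /* Swing -1 (pre-signaled) back to 0 (no waiters, no signal). */
    return (atomic_compare_exchange_strong(&cond->waiters, &expected, 0));
}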
/*
 * __wt_yield --
 *     Yield the thread of control.
 */
void
__wt_yield(void)
{
    /*
     * Yielding the processor isn't documented as a memory barrier, but
     * it's a reasonable expectation for callers to have. There's no reason
     * not to include an explicit barrier since we're giving up the CPU,
     * and it ensures callers are never surprised.
     */
    WT_FULL_BARRIER();

    SwitchToThread();
}
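/*
 * For context: WT_FULL_BARRIER is defined per platform elsewhere in the
 * tree. The mapping below is an assumption for illustration (check the
 * per-compiler headers for the real definitions), but it shows the kind
 * of primitive the macro stands for.
 */
#if defined(_MSC_VER)
#include <windows.h>
#define SKETCH_FULL_BARRIER()    MemoryBarrier()
#elif defined(__GNUC__)
#define SKETCH_FULL_BARRIER()    __sync_synchronize()
#endif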
/*
 * __wt_las_set_written --
 *     Flag that the lookaside table has been written.
 */
void
__wt_las_set_written(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;

    conn = S2C(session);
    if (!conn->las_written) {
        conn->las_written = true;

        /*
         * Push the flag: unnecessary, but from now on page reads must deal
         * with lookaside table records, and we only do the write once.
         */
        WT_FULL_BARRIER();
    }
}
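/*
 * Illustrative sketch (not WiredTiger source) of the write-once flag
 * pattern above: the flag only ever moves false -> true, so readers can
 * check it without locking and the writer pays for a single fence on the
 * transition. Names are assumptions for illustration.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool sketch_las_written;

/* Writer: set the flag once, publishing with a seq_cst store. */
static void
sketch_set_written(void)
{
    if (!atomic_load_explicit(&sketch_las_written, memory_order_relaxed))
        atomic_store(&sketch_las_written, true);
}

/* Reader: an unlocked check is safe, the flag never reverts to false. */
static bool
sketch_reads_must_check(void)
{
    return (atomic_load_explicit(
        &sketch_las_written, memory_order_relaxed));
}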
/*
 * __lsm_tree_start_worker --
 *     Start the worker threads for an LSM tree.
 */
static int
__lsm_tree_start_worker(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_CONNECTION *wt_conn;
    WT_LSM_WORKER_ARGS *wargs;
    WT_SESSION *wt_session;
    WT_SESSION_IMPL *s;
    uint32_t i;

    wt_conn = &S2C(session)->iface;

    WT_RET(wt_conn->open_session(wt_conn, NULL, NULL, &wt_session));
    lsm_tree->ckpt_session = (WT_SESSION_IMPL *)wt_session;
    F_SET(lsm_tree->ckpt_session, WT_SESSION_INTERNAL);

    F_SET(lsm_tree, WT_LSM_TREE_WORKING);
    /* The new thread will rely on the WORKING value being visible. */
    WT_FULL_BARRIER();

    if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
        for (i = 0; i < lsm_tree->merge_threads; i++) {
            WT_RET(wt_conn->open_session(
                wt_conn, NULL, NULL, &wt_session));
            s = (WT_SESSION_IMPL *)wt_session;
            F_SET(s, WT_SESSION_INTERNAL);
            lsm_tree->worker_sessions[i] = s;

            WT_RET(__wt_calloc_def(session, 1, &wargs));
            wargs->lsm_tree = lsm_tree;
            wargs->id = i;
            WT_RET(__wt_thread_create(session,
                &lsm_tree->worker_tids[i],
                __wt_lsm_merge_worker, wargs));
        }

    if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_NEWEST)) {
        WT_RET(wt_conn->open_session(wt_conn, NULL, NULL, &wt_session));
        lsm_tree->bloom_session = (WT_SESSION_IMPL *)wt_session;
        F_SET(lsm_tree->bloom_session, WT_SESSION_INTERNAL);

        WT_RET(__wt_thread_create(session,
            &lsm_tree->bloom_tid, __wt_lsm_bloom_worker, lsm_tree));
    }

    WT_RET(__wt_thread_create(session,
        &lsm_tree->ckpt_tid, __wt_lsm_checkpoint_worker, lsm_tree));

    return (0);
}
/*
 * __lsm_tree_start_worker --
 *     Start the worker threads for an LSM tree.
 */
static int
__lsm_tree_start_worker(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_CONNECTION *wt_conn;
    WT_LSM_WORKER_ARGS *wargs;
    WT_SESSION *wt_session;
    WT_SESSION_IMPL *s;
    uint32_t i;

    wt_conn = &S2C(session)->iface;

    /*
     * All the LSM worker threads do their operations on read-only files.
     * Use read-uncommitted isolation to avoid keeping updates in cache
     * unnecessarily.
     */
    WT_RET(wt_conn->open_session(
        wt_conn, NULL, "isolation=read-uncommitted", &wt_session));
    lsm_tree->ckpt_session = (WT_SESSION_IMPL *)wt_session;
    F_SET(lsm_tree->ckpt_session, WT_SESSION_INTERNAL);

    F_SET(lsm_tree, WT_LSM_TREE_WORKING);
    /* The new thread will rely on the WORKING value being visible. */
    WT_FULL_BARRIER();

    if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
        for (i = 0; i < lsm_tree->merge_threads; i++) {
            WT_RET(wt_conn->open_session(wt_conn,
                NULL, "isolation=read-uncommitted", &wt_session));
            s = (WT_SESSION_IMPL *)wt_session;
            F_SET(s, WT_SESSION_INTERNAL);
            lsm_tree->worker_sessions[i] = s;

            WT_RET(__wt_calloc_def(session, 1, &wargs));
            wargs->lsm_tree = lsm_tree;
            wargs->id = i;
            WT_RET(__wt_thread_create(session,
                &lsm_tree->worker_tids[i],
                __wt_lsm_merge_worker, wargs));
        }

    WT_RET(__wt_thread_create(session,
        &lsm_tree->ckpt_tid, __wt_lsm_checkpoint_worker, lsm_tree));

    return (0);
}
/*
 * __wt_thread_create --
 *     Create a new thread of control.
 */
int
__wt_thread_create(WT_SESSION_IMPL *session,
    wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg)
{
    /*
     * Creating a thread isn't a memory barrier, but WiredTiger commonly
     * sets flags and/or state and then expects worker threads to start.
     * Include a barrier to ensure safety in those cases.
     */
    WT_FULL_BARRIER();

    /* Spawn a new thread of control. */
    tidret->id = (HANDLE)_beginthreadex(NULL, 0, func, arg, 0, NULL);
    if (tidret->id != 0) {
        tidret->created = true;
        return (0);
    }

    WT_RET_MSG(session, __wt_errno(), "thread create: _beginthreadex");
}
/*
 * __wt_thread_join --
 *     Wait for a thread of control to exit.
 */
int
__wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t *tid)
{
    DWORD windows_error;

    /* Only attempt to join if the thread was created successfully. */
    if (!tid->created)
        return (0);
    tid->created = false;

    /*
     * Joining a thread isn't a memory barrier, but WiredTiger commonly
     * sets flags and/or state and then expects worker threads to halt.
     * Include a barrier to ensure safety in those cases.
     */
    WT_FULL_BARRIER();

    if ((windows_error =
        WaitForSingleObject(tid->id, INFINITE)) != WAIT_OBJECT_0) {
        if (windows_error == WAIT_FAILED)
            windows_error = __wt_getlasterror();
        __wt_errx(session, "thread join: WaitForSingleObject: %s",
            __wt_formatmessage(session, windows_error));

        /* If we fail to wait, we will leak handles; do not continue. */
        return (WT_PANIC);
    }

    if (CloseHandle(tid->id) == 0) {
        windows_error = __wt_getlasterror();
        __wt_errx(session, "thread join: CloseHandle: %s",
            __wt_formatmessage(session, windows_error));
        return (__wt_map_windows_error(windows_error));
    }

    return (0);
}
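/*
 * Illustrative usage (not WiredTiger source) of the create/join fences
 * above: the parent sets shared state, then creates the worker; the
 * barrier inside create guarantees the worker sees the state, and the
 * barrier inside join guarantees the parent sees the worker's writes.
 * Sketched with POSIX threads; names are assumptions for illustration.
 */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

static atomic_bool sketch_run;

static void *
sketch_worker(void *arg)
{
    (void)arg;
    /* The creating thread's write of sketch_run is visible here. */
    while (atomic_load(&sketch_run))
        sched_yield();
    return (NULL);
}

static int
sketch_start_stop(void)
{
    pthread_t tid;

    atomic_store(&sketch_run, true);    /* set state first */
    if (pthread_create(&tid, NULL, sketch_worker, NULL) != 0)
        return (-1);
    atomic_store(&sketch_run, false);   /* ask the worker to exit */
    return (pthread_join(tid, NULL));   /* join orders the exit */
}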
/*
 * __wt_spin_lock_unregister_lock --
 *     Remove a lock from the connection's list.
 */
void
__wt_spin_lock_unregister_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
{
    WT_CONNECTION_IMPL *conn;
    u_int i;

    conn = S2C(session);

    for (i = 0; i < WT_SPINLOCK_MAX; i++)
        if (conn->spinlock_list[i] == t)
            conn->spinlock_list[i] = NULL;

    /*
     * XXX
     * The statistics thread reads through this array, there's a possible
     * race: if that thread reads the pointer then goes to sleep, then we
     * free the spinlock, then the statistics thread wakes up, it can read
     * free'd memory.
     *
     * This is performance debugging code, so we're not fixing the race for
     * now, minimize the window.
     */
    WT_FULL_BARRIER();
}
/*
 * __sync_file --
 *     Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, int syncop)
{
    struct timespec end, start;
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    WT_REF *walk;
    WT_TXN *txn;
    uint64_t internal_bytes, leaf_bytes;
    uint64_t internal_pages, leaf_pages;
    uint32_t flags;
    bool evict_reset;

    btree = S2BT(session);

    flags = WT_READ_CACHE | WT_READ_NO_GEN;
    walk = NULL;
    txn = &session->txn;

    internal_bytes = leaf_bytes = 0;
    internal_pages = leaf_pages = 0;
    if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
        WT_RET(__wt_epoch(session, &start));

    switch (syncop) {
    case WT_SYNC_WRITE_LEAVES:
        /*
         * Write all immediately available, dirty in-cache leaf pages.
         *
         * Writing the leaf pages is done without acquiring a high-level
         * lock, serialize so multiple threads don't walk the tree at the
         * same time.
         */
        if (!btree->modified)
            return (0);
        __wt_spin_lock(session, &btree->flush_lock);
        if (!btree->modified) {
            __wt_spin_unlock(session, &btree->flush_lock);
            return (0);
        }

        flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
        for (walk = NULL;;) {
            WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
            if (walk == NULL)
                break;

            /*
             * Write dirty pages if nobody beat us to it. Don't try to
             * write the hottest pages: checkpoint will have to visit them
             * anyway.
             */
            page = walk->page;
            if (__wt_page_is_modified(page) &&
                __wt_txn_visible_all(
                session, page->modify->update_txn)) {
                if (txn->isolation == WT_ISO_READ_COMMITTED)
                    __wt_txn_get_snapshot(session);
                leaf_bytes += page->memory_footprint;
                ++leaf_pages;
                WT_ERR(__wt_reconcile(session, walk, NULL, 0));
            }
        }
        break;
    case WT_SYNC_CHECKPOINT:
        /*
         * We cannot check the tree modified flag in the case of a
         * checkpoint, the checkpoint code has already cleared it.
         *
         * Writing the leaf pages is done without acquiring a high-level
         * lock, serialize so multiple threads don't walk the tree at the
         * same time. We're holding the schema lock, but need the
         * lower-level lock as well.
         */
        __wt_spin_lock(session, &btree->flush_lock);

        /*
         * When internal pages are being reconciled by checkpoint their
         * child pages cannot disappear from underneath them or be split
         * into them, nor can underlying blocks be freed until the block
         * lists for the checkpoint are stable. Set the checkpointing flag
         * to block eviction of dirty pages until the checkpoint's internal
         * page pass is complete, then wait for any existing eviction to
         * complete.
         */
        btree->checkpointing = 1;
        WT_FULL_BARRIER();

        WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
        if (evict_reset)
            __wt_evict_file_exclusive_off(session);

        /* Write all dirty in-cache pages. */
        flags |= WT_READ_NO_EVICT;
        for (walk = NULL;;) {
            /*
             * If we have a page, and it was ever modified, track the
             * highest transaction ID in the tree. We do this here because
             * we want the value after reconciling dirty pages.
             */
            if (walk != NULL && walk->page != NULL &&
                (mod = walk->page->modify) != NULL &&
                WT_TXNID_LT(btree->rec_max_txn, mod->rec_max_txn))
                btree->rec_max_txn = mod->rec_max_txn;

            WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
            if (walk == NULL)
                break;

            page = walk->page;
            mod = page->modify;

            /* Skip clean pages. */
            if (!__wt_page_is_modified(page))
                continue;

            /*
             * Write dirty pages, unless we can be sure they only became
             * dirty after the checkpoint started.
             *
             * We can skip dirty pages if:
             * (1) they are leaf pages;
             * (2) there is a snapshot transaction active (which is the
             *     case in ordinary application checkpoints but not all
             *     internal cases); and
             * (3) the first dirty update on the page is sufficiently
             *     recent that the checkpoint transaction would skip them.
             *
             * Mark the tree dirty: the checkpoint marked it clean and we
             * can't skip future checkpoints until this page is written.
             */
            if (!WT_PAGE_IS_INTERNAL(page) &&
                F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
                WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn) &&
                mod->rec_result != WT_PM_REC_REWRITE) {
                __wt_page_modify_set(session, page);
                continue;
            }

            if (WT_PAGE_IS_INTERNAL(page)) {
                internal_bytes += page->memory_footprint;
                ++internal_pages;
            } else {
                leaf_bytes += page->memory_footprint;
                ++leaf_pages;
            }
            WT_ERR(__wt_reconcile(session, walk, NULL, 0));
        }
        break;
    }

    if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
        WT_ERR(__wt_epoch(session, &end));
        WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
            "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
            " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
            " bytes, %" PRIu64 " pages of internal\n\t"
            "Took: %" PRIu64 "ms",
            syncop == WT_SYNC_WRITE_LEAVES ?
            "WRITE_LEAVES" : "CHECKPOINT",
            leaf_bytes, leaf_pages, internal_bytes, internal_pages,
            WT_TIMEDIFF(end, start) / WT_MILLION));
    }

err:    /* On error, clear any left-over tree walk. */
    if (walk != NULL)
        WT_TRET(__wt_page_release(session, walk, flags));

    if (txn->isolation == WT_ISO_READ_COMMITTED && session->ncursors == 0)
        __wt_txn_release_snapshot(session);

    if (btree->checkpointing) {
        /*
         * Update the checkpoint generation for this handle so visible
         * updates newer than the checkpoint can be evicted.
         *
         * This has to be published before eviction is enabled again, so
         * that eviction knows that the checkpoint has completed.
         */
        WT_PUBLISH(btree->checkpoint_gen,
            S2C(session)->txn_global.checkpoint_gen);
        WT_STAT_FAST_DATA_SET(session,
            btree_checkpoint_generation, btree->checkpoint_gen);

        /*
         * Clear the checkpoint flag and push the change; not required, but
         * publishing the change means stalled eviction gets moving as soon
         * as possible.
         */
        btree->checkpointing = 0;
        WT_FULL_BARRIER();

        /*
         * If this tree was being skipped by the eviction server during the
         * checkpoint, clear the wait.
         */
        btree->evict_walk_period = 0;

        /*
         * Wake the eviction server, in case application threads have
         * stalled while the eviction server decided it couldn't make
         * progress. Without this, application threads will be stalled
         * until the eviction server next wakes.
         */
        WT_TRET(__wt_evict_server_wake(session));
    }

    __wt_spin_unlock(session, &btree->flush_lock);

    /*
     * Leaves are written before a checkpoint (or as part of a file close,
     * before checkpointing the file). Start a flush to stable storage, but
     * don't wait for it.
     */
    if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES)
        WT_RET(btree->bm->sync(btree->bm, session, true));

    return (ret);
}
/*
 * __sync_file --
 *     Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
    struct timespec end, start;
    WT_BTREE *btree;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    WT_REF *walk;
    WT_TXN *txn;
    uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
    uint64_t oldest_id, saved_snap_min;
    uint32_t flags;

    conn = S2C(session);
    btree = S2BT(session);
    walk = NULL;
    txn = &session->txn;
    saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min;
    flags = WT_READ_CACHE | WT_READ_NO_GEN;

    internal_bytes = leaf_bytes = 0;
    internal_pages = leaf_pages = 0;
    if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
        WT_RET(__wt_epoch(session, &start));

    switch (syncop) {
    case WT_SYNC_WRITE_LEAVES:
        /*
         * Write all immediately available, dirty in-cache leaf pages.
         *
         * Writing the leaf pages is done without acquiring a high-level
         * lock, serialize so multiple threads don't walk the tree at the
         * same time.
         */
        if (!btree->modified)
            return (0);
        __wt_spin_lock(session, &btree->flush_lock);
        if (!btree->modified) {
            __wt_spin_unlock(session, &btree->flush_lock);
            return (0);
        }

        /*
         * Save the oldest transaction ID we need to keep around.
         * Otherwise, in a busy system, we could be updating pages so fast
         * that write leaves never catches up. We deliberately have no
         * transaction running at this point that would keep the oldest ID
         * from moving forwards as we walk the tree.
         */
        oldest_id = __wt_txn_oldest_id(session);

        flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
        for (walk = NULL;;) {
            WT_ERR(__wt_tree_walk(session, &walk, flags));
            if (walk == NULL)
                break;

            /*
             * Write dirty pages if nobody beat us to it. Don't try to
             * write hot pages (defined as pages that have been updated
             * since the write phase leaves started): checkpoint will have
             * to visit them anyway.
             */
            page = walk->page;
            if (__wt_page_is_modified(page) &&
                WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
                if (txn->isolation == WT_ISO_READ_COMMITTED)
                    __wt_txn_get_snapshot(session);
                leaf_bytes += page->memory_footprint;
                ++leaf_pages;
                WT_ERR(__wt_reconcile(session, walk, NULL, 0));
            }
        }
        break;
    case WT_SYNC_CHECKPOINT:
        /*
         * If we are flushing a file at read-committed isolation, which is
         * of particular interest for flushing the metadata to make a
         * schema-changing operation durable, get a transactional snapshot
         * now.
         *
         * All changes committed up to this point should be included. We
         * don't update the snapshot in between pages because (a) the
         * metadata shouldn't be that big, and (b) if we do ever
         */
        if (txn->isolation == WT_ISO_READ_COMMITTED)
            __wt_txn_get_snapshot(session);

        /*
         * We cannot check the tree modified flag in the case of a
         * checkpoint, the checkpoint code has already cleared it.
         *
         * Writing the leaf pages is done without acquiring a high-level
         * lock, serialize so multiple threads don't walk the tree at the
         * same time. We're holding the schema lock, but need the
         * lower-level lock as well.
         */
        __wt_spin_lock(session, &btree->flush_lock);

        /*
         * In the final checkpoint pass, child pages cannot be evicted from
         * underneath internal pages nor can underlying blocks be freed
         * until the checkpoint's block lists are stable. Also, we cannot
         * split child pages into parents unless we know the final pass
         * will write a consistent view of that namespace. Set the
         * checkpointing flag to block such actions and wait for any
         * problematic eviction or page splits to complete.
         */
        WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE);

        WT_ERR(__wt_evict_file_exclusive_on(session));
        __wt_evict_file_exclusive_off(session);

        WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING);

        /* Write all dirty in-cache pages. */
        flags |= WT_READ_NO_EVICT;
        for (walk = NULL;;) {
            WT_ERR(__wt_tree_walk(session, &walk, flags));
            if (walk == NULL)
                break;

            /* Skip clean pages. */
            if (!__wt_page_is_modified(walk->page))
                continue;

            /*
             * Take a local reference to the page modify structure now
             * that we know the page is dirty. It needs to be done in this
             * order otherwise the page modify structure could have been
             * created between taking the reference and checking modified.
             */
            page = walk->page;
            mod = page->modify;

            /*
             * Write dirty pages, unless we can be sure they only became
             * dirty after the checkpoint started.
             *
             * We can skip dirty pages if:
             * (1) they are leaf pages;
             * (2) there is a snapshot transaction active (which is the
             *     case in ordinary application checkpoints but not all
             *     internal cases); and
             * (3) the first dirty update on the page is sufficiently
             *     recent that the checkpoint transaction would skip them.
             *
             * Mark the tree dirty: the checkpoint marked it clean and we
             * can't skip future checkpoints until this page is written.
             */
            if (!WT_PAGE_IS_INTERNAL(page) &&
                F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
                WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
                __wt_page_modify_set(session, page);
                continue;
            }

            if (WT_PAGE_IS_INTERNAL(page)) {
                internal_bytes += page->memory_footprint;
                ++internal_pages;
            } else {
                leaf_bytes += page->memory_footprint;
                ++leaf_pages;
            }
            WT_ERR(__wt_reconcile(session, walk, NULL, 0));
        }
        break;
    case WT_SYNC_CLOSE:
    case WT_SYNC_DISCARD:
        WT_ILLEGAL_VALUE_ERR(session);
    }

    if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
        WT_ERR(__wt_epoch(session, &end));
        WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
            "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
            " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
            " bytes, %" PRIu64 " pages of internal\n\t"
            "Took: %" PRIu64 "ms",
            syncop == WT_SYNC_WRITE_LEAVES ?
            "WRITE_LEAVES" : "CHECKPOINT",
            leaf_bytes, leaf_pages, internal_bytes, internal_pages,
            WT_TIMEDIFF_MS(end, start)));
    }

err:    /* On error, clear any left-over tree walk. */
    if (walk != NULL)
        WT_TRET(__wt_page_release(session, walk, flags));

    /*
     * If we got a snapshot in order to write pages, and there was no
     * snapshot active when we started, release it.
     */
    if (txn->isolation == WT_ISO_READ_COMMITTED &&
        saved_snap_min == WT_TXN_NONE)
        __wt_txn_release_snapshot(session);

    if (btree->checkpointing != WT_CKPT_OFF) {
        /*
         * Update the checkpoint generation for this handle so visible
         * updates newer than the checkpoint can be evicted.
         *
         * This has to be published before eviction is enabled again, so
         * that eviction knows that the checkpoint has completed.
         */
        WT_PUBLISH(btree->checkpoint_gen,
            conn->txn_global.checkpoint_gen);
        WT_STAT_FAST_DATA_SET(session,
            btree_checkpoint_generation, btree->checkpoint_gen);

        /*
         * Clear the checkpoint flag and push the change; not required, but
         * publishing the change means stalled eviction gets moving as soon
         * as possible.
         */
        btree->checkpointing = WT_CKPT_OFF;
        WT_FULL_BARRIER();

        /*
         * If this tree was being skipped by the eviction server during the
         * checkpoint, clear the wait.
         */
        btree->evict_walk_period = 0;

        /*
         * Wake the eviction server, in case application threads have
         * stalled while the eviction server decided it couldn't make
         * progress. Without this, application threads will be stalled
         * until the eviction server next wakes.
         */
        WT_TRET(__wt_evict_server_wake(session));
    }

    __wt_spin_unlock(session, &btree->flush_lock);

    /*
     * Leaves are written before a checkpoint (or as part of a file close,
     * before checkpointing the file). Start a flush to stable storage, but
     * don't wait for it.
     */
    if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES &&
        F_ISSET(conn, WT_CONN_CKPT_SYNC))
        WT_RET(btree->bm->sync(btree->bm, session, true));

    return (ret);
}
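/*
 * For context on the WT_PUBLISH/WT_FULL_BARRIER pairing above: publishing
 * means "order my earlier writes before this store". A C11 paraphrase of
 * the end-of-checkpoint sequence (an assumption for illustration, not the
 * WiredTiger macros themselves):
 */
#include <stdatomic.h>
#include <stdint.h>

static void
sketch_checkpoint_done(atomic_uint_least64_t *checkpoint_gen,
    atomic_int *checkpointing, uint64_t new_gen)
{
    /* Publish the generation before eviction is enabled again. */
    atomic_store_explicit(checkpoint_gen, new_gen, memory_order_release);

    /* Clear the flag and push the change so stalled eviction moves. */
    atomic_store(checkpointing, 0);    /* seq_cst store */
}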
/*
 * __wt_hazard_set --
 *     Set a hazard pointer.
 */
int
__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
    WT_BTREE *btree;
    WT_CONNECTION_IMPL *conn;
    WT_HAZARD *hp;
    int restarts = 0;

    btree = S2BT(session);
    conn = S2C(session);
    *busyp = false;

    /* If a file can never be evicted, hazard pointers aren't required. */
    if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
        return (0);

    /*
     * Do the dance:
     *
     * The memory location which makes a page "real" is the WT_REF's state
     * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the
     * page eviction server.
     *
     * Add the WT_REF reference to the session's hazard list and flush the
     * write, then see if the page's state is still valid. If so, we can
     * use the page because the page eviction server will see our hazard
     * pointer before it discards the page (the eviction server sets the
     * state to WT_REF_LOCKED, then flushes memory and checks the hazard
     * pointers).
     *
     * For sessions with many active hazard pointers, skip most of the
     * active slots: there may be a free slot in there, but checking is
     * expensive. Most hazard pointers are released quickly: optimize for
     * that case.
     */
    for (hp = session->hazard + session->nhazard;; ++hp) {
        /*
         * If we get to the end of the array, either:
         * 1. If we know there are free slots somewhere, and this is the
         *    first time through, continue the search from the start.
         *    Don't actually continue the loop because that will skip the
         *    first slot.
         * 2. If we have searched all the way through and we have
         *    allocated the maximum number of slots, give up.
         * 3. Allocate another increment of slots, up to the maximum. The
         *    slot we are on should now be available.
         */
        if (hp >= session->hazard + session->hazard_size) {
            if (session->nhazard < session->hazard_size &&
                restarts++ == 0)
                hp = session->hazard;
            else if (session->hazard_size >= conn->hazard_max)
                break;
            else
                WT_PUBLISH(session->hazard_size, WT_MIN(
                    session->hazard_size + WT_HAZARD_INCR,
                    conn->hazard_max));
        }

        if (hp->page != NULL)
            continue;

        hp->page = ref->page;
#ifdef HAVE_DIAGNOSTIC
        hp->file = file;
        hp->line = line;
#endif
        /* Publish the hazard pointer before reading page's state. */
        WT_FULL_BARRIER();

        /*
         * Check if the page state is still valid, where valid means a
         * state of WT_REF_MEM and the pointer is unchanged. (The pointer
         * can change, it means the page was evicted between the time we
         * set our hazard pointer and the publication. It would
         * theoretically be possible for the page to be evicted and a
         * different page read into the same memory, so the pointer hasn't
         * changed but the contents have. That's OK, we found this page
         * using the tree's key space, whatever page we find here is the
         * page for us to use.)
         */
        if (ref->page == hp->page && ref->state == WT_REF_MEM) {
            ++session->nhazard;
            return (0);
        }

        /*
         * The page isn't available, it's being considered for eviction
         * (or being evicted, for all we know). If the eviction server
         * sees our hazard pointer before evicting the page, it will
         * return the page to use, no harm done, if it doesn't, it will go
         * ahead and complete the eviction.
         *
         * We don't bother publishing this update: the worst case is we
         * prevent some random page from being evicted.
         */
        hp->page = NULL;
        *busyp = true;
        return (0);
    }

    __wt_errx(session,
        "session %p: hazard pointer table full", (void *)session);
#ifdef HAVE_DIAGNOSTIC
    __hazard_dump(session);
#endif

    return (ENOMEM);
}
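/*
 * Illustrative sketch (not WiredTiger source) of the other half of the
 * "dance" described above: the evicting thread locks the reference,
 * relies on the full fence implied by the CAS, and only then scans the
 * hazard array. Generic C11 sketch; names are assumptions.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct sketch_ref {
    _Atomic(void *) page;
    atomic_int state;
};
enum { SKETCH_REF_MEM, SKETCH_REF_LOCKED };

static bool
sketch_try_evict(struct sketch_ref *ref,
    _Atomic(void *) *hazard, size_t hazard_size)
{
    int expected = SKETCH_REF_MEM;
    void *page;
    size_t i;

    /* Lock the reference so no new hazard pointer can succeed. */
    if (!atomic_compare_exchange_strong(
        &ref->state, &expected, SKETCH_REF_LOCKED))
        return (false);

    /* The seq_cst CAS acts as the fence before scanning. */
    page = atomic_load(&ref->page);
    for (i = 0; i < hazard_size; ++i)
        if (atomic_load(&hazard[i]) == page) {
            /* A reader got in first: give the page back. */
            atomic_store(&ref->state, SKETCH_REF_MEM);
            return (false);
        }
    return (true);    /* safe to evict */
}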
/*
 * __wt_statlog_dump_spinlock --
 *     Log the spin-lock statistics.
 */
int
__wt_statlog_dump_spinlock(WT_CONNECTION_IMPL *conn, const char *tag)
{
    WT_SPINLOCK *spin;
    WT_CONNECTION_STATS_SPINLOCK *p, *t;
    uint64_t block_manager, btree_page, ignore;
    u_int i, j;

    /*
     * Ignore rare acquisition of a spinlock using a base value of 10 per
     * second so we don't create graphs we don't care about.
     */
    ignore = (uint64_t)(conn->stat_usecs / 1000000) * 10;

    /* Output the number of times each spinlock was acquired. */
    block_manager = btree_page = 0;
    for (i = 0; i < WT_ELEMENTS(conn->spinlock_list); ++i) {
        if ((spin = conn->spinlock_list[i]) == NULL)
            continue;

        /*
         * There are two sets of spinlocks we aggregate, the btree page
         * locks and the block manager per-file locks. The reason is the
         * block manager locks grow with the number of files open (and LSM
         * and bloom filters can open a lot of files), and there are 16
         * btree page locks and splitting them out has not historically
         * been that informative.
         */
        if (strcmp(spin->name, "block manager") == 0) {
            block_manager += spin->counter;
            if (conn->stat_clear)
                spin->counter = 0;
            continue;
        }
        if (strcmp(spin->name, "btree page") == 0) {
            btree_page += spin->counter;
            if (conn->stat_clear)
                spin->counter = 0;
            continue;
        }

        WT_RET_TEST((fprintf(conn->stat_fp,
            "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
            conn->stat_stamp,
            spin->counter <= ignore ? 0 : spin->counter,
            tag, spin->name) < 0), __wt_errno());
        if (conn->stat_clear)
            spin->counter = 0;
    }
    WT_RET_TEST((fprintf(conn->stat_fp,
        "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
        conn->stat_stamp,
        block_manager <= ignore ? 0 : block_manager,
        tag, "block manager") < 0), __wt_errno());
    WT_RET_TEST((fprintf(conn->stat_fp,
        "%s %" PRIu64 " %s spinlock %s: acquisitions\n",
        conn->stat_stamp,
        btree_page <= ignore ? 0 : btree_page,
        tag, "btree page") < 0), __wt_errno());

    /*
     * Output the number of times each location acquires its spinlock and
     * the blocking matrix.
     */
    for (i = 0; i < WT_ELEMENTS(conn->spinlock_block); ++i) {
        p = &conn->spinlock_block[i];
        if (p->name == NULL)
            continue;

        WT_RET_TEST((fprintf(conn->stat_fp,
            "%s %d %s spinlock %s acquired by %s(%d)\n",
            conn->stat_stamp,
            p->total <= ignore ? 0 : p->total,
            tag, p->name, p->file, p->line) < 0), __wt_errno());
        if (conn->stat_clear)
            p->total = 0;

        for (j = 0; j < WT_ELEMENTS(conn->spinlock_block); ++j) {
            t = &conn->spinlock_block[j];
            if (t->name == NULL)
                continue;

            WT_RET_TEST((fprintf(conn->stat_fp,
                "%s %d %s spinlock %s: %s(%d) blocked by %s(%d)\n",
                conn->stat_stamp,
                p->blocked[j] <= ignore ? 0 : p->blocked[j],
                tag, p->name, p->file, p->line,
                t->file, t->line) < 0), __wt_errno());
            if (conn->stat_clear)
                p->blocked[j] = 0;
        }
    }

    WT_FULL_BARRIER();    /* Minimize the window. */
    return (0);
}
/*
 * __wt_connection_close --
 *     Close a connection handle.
 */
int
__wt_connection_close(WT_CONNECTION_IMPL *conn)
{
    WT_CONNECTION *wt_conn;
    WT_DECL_RET;
    WT_DLH *dlh;
    WT_SESSION_IMPL *s, *session;
    u_int i;

    wt_conn = &conn->iface;
    session = conn->default_session;

    /* Shut down transactions (wait for in-flight operations to complete). */
    WT_TRET(__wt_txn_global_shutdown(session));

    /* Shut down the subsystems, ensuring workers see the state change. */
    F_SET(conn, WT_CONN_CLOSING);
    WT_FULL_BARRIER();

    /*
     * Clear any pending async operations and shut down the async worker
     * threads and system before closing LSM.
     */
    WT_TRET(__wt_async_flush(session));
    WT_TRET(__wt_async_destroy(session));

    /*
     * Shut down server threads other than the eviction server, which is
     * needed later to close btree handles. Some of these threads access
     * btree handles, so take care in ordering shutdown to make sure they
     * exit before files are closed.
     */
    WT_TRET(__wt_lsm_manager_destroy(session));

    /*
     * Once the async and LSM threads exit, we shouldn't be opening any
     * more files.
     */
    F_SET(conn, WT_CONN_CLOSING_NO_MORE_OPENS);
    WT_FULL_BARRIER();

    WT_TRET(__wt_checkpoint_server_destroy(session));
    WT_TRET(__wt_statlog_destroy(session, true));
    WT_TRET(__wt_sweep_destroy(session));

    /* The eviction server is shut down last. */
    WT_TRET(__wt_evict_destroy(session));

    /* Shut down the lookaside table, after all eviction is complete. */
    WT_TRET(__wt_las_destroy(session));

    /* Close open data handles. */
    WT_TRET(__wt_conn_dhandle_discard(session));

    /* Shut down metadata tracking. */
    WT_TRET(__wt_meta_track_destroy(session));

    /*
     * Now that all data handles are closed, tell logging that a checkpoint
     * has completed then shut down the log manager (only after closing
     * data handles). The call to destroy the log manager is outside the
     * conditional because we allocate the log path so that printlog can
     * run without running logging or recovery.
     */
    if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
        FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE))
        WT_TRET(__wt_txn_checkpoint_log(
            session, true, WT_TXN_LOG_CKPT_STOP, NULL));
    WT_TRET(__wt_logmgr_destroy(session));

    /* Free memory for collators, compressors, data sources. */
    WT_TRET(__wt_conn_remove_collator(session));
    WT_TRET(__wt_conn_remove_compressor(session));
    WT_TRET(__wt_conn_remove_data_source(session));
    WT_TRET(__wt_conn_remove_encryptor(session));
    WT_TRET(__wt_conn_remove_extractor(session));

    /* Disconnect from shared cache - must be before cache destroy. */
    WT_TRET(__wt_conn_cache_pool_destroy(session));

    /* Discard the cache. */
    WT_TRET(__wt_cache_destroy(session));

    /* Discard transaction state. */
    __wt_txn_global_destroy(session);

    /* Close the lock file, opening up the database to other connections. */
    if (conn->lock_fh != NULL)
        WT_TRET(__wt_close(session, &conn->lock_fh));

    /* Close any file handles left open. */
    WT_TRET(__wt_close_connection_close(session));

    /*
     * Close the internal (default) session, and switch back to the dummy
     * session in case of any error messages from the remaining operations
     * while destroying the connection handle.
     */
    if (session != &conn->dummy_session) {
        WT_TRET(session->iface.close(&session->iface, NULL));
        session = conn->default_session = &conn->dummy_session;
    }

    /*
     * The session split stash, hazard information and handle arrays aren't
     * discarded during normal session close, they persist past the life of
     * the session. Discard them now.
     */
    if (!F_ISSET(conn, WT_CONN_LEAK_MEMORY))
        if ((s = conn->sessions) != NULL)
            for (i = 0; i < conn->session_size; ++s, ++i) {
                __wt_free(session, s->dhhash);
                __wt_stash_discard_all(session, s);
                __wt_free(session, s->hazard);
            }

    /* Destroy the file-system configuration. */
    if (conn->file_system != NULL && conn->file_system->terminate != NULL)
        WT_TRET(conn->file_system->terminate(
            conn->file_system, (WT_SESSION *)session));

    /* Close extensions, first calling any unload entry point. */
    while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) {
        TAILQ_REMOVE(&conn->dlhqh, dlh, q);

        if (dlh->terminate != NULL)
            WT_TRET(dlh->terminate(wt_conn));
        WT_TRET(__wt_dlclose(session, dlh));
    }

    /* Destroy the handle. */
    __wt_connection_destroy(conn);

    return (ret);
}
/*
 * __wt_hazard_set --
 *     Set a hazard pointer.
 */
int
__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
    WT_BTREE *btree;
    WT_HAZARD *hp;
    int restarts = 0;

    btree = S2BT(session);
    *busyp = 0;

    /* If a file can never be evicted, hazard pointers aren't required. */
    if (F_ISSET(btree, WT_BTREE_NO_HAZARD))
        return (0);

    /*
     * Do the dance:
     *
     * The memory location which makes a page "real" is the WT_REF's state
     * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the
     * page eviction server.
     *
     * Add the WT_REF reference to the session's hazard list and flush the
     * write, then see if the page's state is still valid. If so, we can
     * use the page because the page eviction server will see our hazard
     * pointer before it discards the page (the eviction server sets the
     * state to WT_REF_LOCKED, then flushes memory and checks the hazard
     * pointers).
     *
     * For sessions with many active hazard pointers, skip most of the
     * active slots: there may be a free slot in there, but checking is
     * expensive. Most hazard pointers are released quickly: optimize for
     * that case.
     */
    for (hp = session->hazard + session->nhazard;; ++hp) {
        /* Expand the number of hazard pointers if available. */
        if (hp >= session->hazard + session->hazard_size) {
            if (session->hazard_size >= S2C(session)->hazard_max)
                break;
            /* Restart the search. */
            if (session->nhazard < session->hazard_size &&
                restarts++ == 0) {
                hp = session->hazard;
                continue;
            }
            WT_PUBLISH(session->hazard_size,
                WT_MIN(session->hazard_size + WT_HAZARD_INCR,
                S2C(session)->hazard_max));
        }

        if (hp->page != NULL)
            continue;

        hp->page = ref->page;
#ifdef HAVE_DIAGNOSTIC
        hp->file = file;
        hp->line = line;
#endif
        /* Publish the hazard pointer before reading page's state. */
        WT_FULL_BARRIER();

        /*
         * Check if the page state is still valid, where valid means a
         * state of WT_REF_MEM or WT_REF_EVICT_WALK and the pointer is
         * unchanged. (The pointer can change, it means the page was
         * evicted between the time we set our hazard pointer and the
         * publication. It would theoretically be possible for the page to
         * be evicted and a different page read into the same memory, so
         * the pointer hasn't changed but the contents have. That's OK, we
         * found this page using the tree's key space, whatever page we
         * find here is the page for us to use.)
         */
        if (ref->page == hp->page &&
            (ref->state == WT_REF_MEM ||
            ref->state == WT_REF_EVICT_WALK)) {
            WT_VERBOSE_RET(session, hazard,
                "session %p hazard %p: set", session, ref->page);

            ++session->nhazard;
            return (0);
        }

        /*
         * The page isn't available, it's being considered for eviction
         * (or being evicted, for all we know). If the eviction server
         * sees our hazard pointer before evicting the page, it will
         * return the page to use, no harm done, if it doesn't, it will go
         * ahead and complete the eviction.
         *
         * We don't bother publishing this update: the worst case is we
         * prevent some random page from being evicted.
         */
        hp->page = NULL;
        *busyp = 1;
        return (0);
    }

    __wt_errx(session,
        "session %p: hazard pointer table full", session);
#ifdef HAVE_DIAGNOSTIC
    __hazard_dump(session);
#endif

    return (ENOMEM);
}
/*
 * __log_file_server --
 *     The log file server thread. This worker thread manages log file
 *     operations such as closing and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *close_fh;
    WT_LOG *log;
    WT_LSN close_end_lsn, min_lsn;
    WT_SESSION_IMPL *session;
    uint32_t filenum;
    bool locked;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    locked = false;
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * If there is a log file to close, make sure any outstanding
         * write operations have completed, then fsync and close it.
         */
        if ((close_fh = log->log_close_fh) != NULL) {
            WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
                &filenum));
            /*
             * We update the close file handle before updating the close
             * LSN when changing files. It is possible we could see
             * mismatched settings. If we do, yield until it is set. This
             * should rarely happen.
             */
            while (log->log_close_lsn.l.file < filenum)
                __wt_yield();

            if (__wt_log_cmp(
                &log->write_lsn, &log->log_close_lsn) >= 0) {
                /*
                 * We've copied the file handle, clear out the one in the
                 * log structure to allow it to be set again. Copy the LSN
                 * before clearing the file handle. Use a barrier to make
                 * sure the compiler does not reorder the following two
                 * statements.
                 */
                close_end_lsn = log->log_close_lsn;
                WT_FULL_BARRIER();
                log->log_close_fh = NULL;
                /*
                 * Set close_end_lsn to the LSN immediately after ours,
                 * that is, the beginning of the next log file. We need to
                 * know the file number of our own close in case earlier
                 * calls are still in progress and ours is the next one to
                 * move the sync_lsn into the next file for later syncs.
                 */
                WT_ERR(__wt_fsync(session, close_fh, true));

                /*
                 * We want to have the file size reflect actual data with
                 * minimal pre-allocated zeroed space. We can't truncate
                 * the file during hot backup, or the underlying file
                 * system may not support truncate: both are OK, it's just
                 * more work during cursor traversal.
                 */
                if (!conn->hot_backup) {
                    __wt_readlock(session, conn->hot_backup_lock);
                    if (!conn->hot_backup)
                        WT_ERR_ERROR_OK(
                            __wt_ftruncate(session, close_fh,
                            close_end_lsn.l.offset), ENOTSUP);
                    __wt_readunlock(session, conn->hot_backup_lock);
                }
                WT_SET_LSN(&close_end_lsn,
                    close_end_lsn.l.file + 1, 0);
                __wt_spin_lock(session, &log->log_sync_lock);
                locked = true;
                WT_ERR(__wt_close(session, &close_fh));
                WT_ASSERT(session, __wt_log_cmp(
                    &close_end_lsn, &log->sync_lsn) >= 0);
                log->sync_lsn = close_end_lsn;
                __wt_cond_signal(session, log->log_sync_cond);
                locked = false;
                __wt_spin_unlock(session, &log->log_sync_lock);
            }
        }

        /*
         * If a later thread asked for a background sync, do it now.
         */
        if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
            /*
             * Save the latest write LSN, which is the minimum we will
             * have written to disk.
             */
            min_lsn = log->write_lsn;
            /*
             * We have to wait until the LSN we asked for is written. If
             * it isn't, signal the wrlsn thread to get it written.
             *
             * We also have to wait for the written LSN and the sync LSN
             * to be in the same file so that we know we have synchronized
             * all earlier log files.
             */
            if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
                /*
                 * If the sync file is behind either the one wanted for a
                 * background sync or the write LSN has moved to another
                 * file, continue to let this worker thread process that
                 * older file immediately.
                 */
                if ((log->sync_lsn.l.file <
                    log->bg_sync_lsn.l.file) ||
                    (log->sync_lsn.l.file < min_lsn.l.file))
                    continue;
                WT_ERR(__wt_fsync(session, log->log_fh, true));
                __wt_spin_lock(session, &log->log_sync_lock);
                locked = true;
                /*
                 * The sync LSN could have advanced while we were writing
                 * to disk.
                 */
                if (__wt_log_cmp(&log->sync_lsn, &min_lsn) <= 0) {
                    WT_ASSERT(session,
                        min_lsn.l.file == log->sync_lsn.l.file);
                    log->sync_lsn = min_lsn;
                    __wt_cond_signal(session, log->log_sync_cond);
                }
                locked = false;
                __wt_spin_unlock(session, &log->log_sync_lock);
            } else {
                __wt_cond_auto_signal(session, conn->log_wrlsn_cond);
                /*
                 * We do not want to wait potentially a second to process
                 * this. Yield to give the wrlsn thread a chance to run
                 * and try again in this case.
                 */
                __wt_yield();
                continue;
            }
        }

        /* Wait until the next event. */
        __wt_cond_wait(session, conn->log_file_cond, WT_MILLION / 10);
    }

    if (0) {
err:        __wt_err(session, ret, "log close server error");
    }
    if (locked)
        __wt_spin_unlock(session, &log->log_sync_lock);
    return (WT_THREAD_RET_VALUE);
}
/*
 * __log_file_server --
 *     The log file server thread. This worker thread manages log file
 *     operations such as closing and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *close_fh;
    WT_LOG *log;
    WT_LSN close_end_lsn, min_lsn;
    WT_SESSION_IMPL *session;
    uint32_t filenum;
    int locked;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    locked = 0;
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * If there is a log file to close, make sure any outstanding
         * write operations have completed, then fsync and close it.
         */
        if ((close_fh = log->log_close_fh) != NULL) {
            WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
                &filenum));
            /*
             * We update the close file handle before updating the close
             * LSN when changing files. It is possible we could see
             * mismatched settings. If we do, yield until it is set. This
             * should rarely happen.
             */
            while (log->log_close_lsn.file < filenum)
                __wt_yield();

            if (__wt_log_cmp(
                &log->write_lsn, &log->log_close_lsn) >= 0) {
                /*
                 * We've copied the file handle, clear out the one in the
                 * log structure to allow it to be set again. Copy the LSN
                 * before clearing the file handle. Use a barrier to make
                 * sure the compiler does not reorder the following two
                 * statements.
                 */
                close_end_lsn = log->log_close_lsn;
                WT_FULL_BARRIER();
                log->log_close_fh = NULL;
                /*
                 * Set close_end_lsn to the LSN immediately after ours,
                 * that is, the beginning of the next log file. We need to
                 * know the file number of our own close in case earlier
                 * calls are still in progress and ours is the next one to
                 * move the sync_lsn into the next file for later syncs.
                 */
                close_end_lsn.file++;
                close_end_lsn.offset = 0;
                WT_ERR(__wt_fsync(session, close_fh));
                __wt_spin_lock(session, &log->log_sync_lock);
                locked = 1;
                WT_ERR(__wt_close(session, &close_fh));
                WT_ASSERT(session, __wt_log_cmp(
                    &close_end_lsn, &log->sync_lsn) >= 0);
                log->sync_lsn = close_end_lsn;
                WT_ERR(__wt_cond_signal(session, log->log_sync_cond));
                locked = 0;
                __wt_spin_unlock(session, &log->log_sync_lock);
            }
        }

        /*
         * If a later thread asked for a background sync, do it now.
         */
        if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
            /*
             * Save the latest write LSN, which is the minimum we will
             * have written to disk.
             */
            min_lsn = log->write_lsn;
            /*
             * We have to wait until the LSN we asked for is written. If
             * it isn't, signal the wrlsn thread to get it written.
             */
            if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
                WT_ERR(__wt_fsync(session, log->log_fh));
                __wt_spin_lock(session, &log->log_sync_lock);
                locked = 1;
                /*
                 * The sync LSN could have advanced while we were writing
                 * to disk.
                 */
                if (__wt_log_cmp(&log->sync_lsn, &min_lsn) <= 0) {
                    log->sync_lsn = min_lsn;
                    WT_ERR(__wt_cond_signal(
                        session, log->log_sync_cond));
                }
                locked = 0;
                __wt_spin_unlock(session, &log->log_sync_lock);
            } else {
                WT_ERR(__wt_cond_signal(
                    session, conn->log_wrlsn_cond));
                /*
                 * We do not want to wait potentially a second to process
                 * this. Yield to give the wrlsn thread a chance to run
                 * and try again in this case.
                 */
                __wt_yield();
                continue;
            }
        }

        /* Wait until the next event. */
        WT_ERR(__wt_cond_wait(
            session, conn->log_file_cond, WT_MILLION));
    }

    if (0) {
err:        __wt_err(session, ret, "log close server error");
    }
    if (locked)
        __wt_spin_unlock(session, &log->log_sync_lock);
    return (WT_THREAD_RET_VALUE);
}
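/*
 * Illustrative sketch (not WiredTiger source) of the ordering both
 * versions above rely on when handing off the close file handle: copy the
 * LSN, then clear the handle, so a thread that still sees a non-NULL
 * handle never pairs it with a stale LSN. C11 paraphrase; names are
 * assumptions for illustration.
 */
#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>

struct sketch_fh;

struct sketch_log {
    _Atomic(struct sketch_fh *) close_fh;
    atomic_uint_least64_t close_lsn;
};

static uint64_t
sketch_take_close_lsn(struct sketch_log *log)
{
    uint64_t lsn;

    /* Copy the LSN before clearing the file handle. */
    lsn = atomic_load(&log->close_lsn);
    atomic_store_explicit(&log->close_fh, NULL, memory_order_release);
    return (lsn);
}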
/*
 * __wt_compact_page_skip --
 *     Return if compaction requires we read this page.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_BM *bm;
    WT_DECL_RET;
    size_t addr_size;
    u_int type;
    const uint8_t *addr;

    /*
     * Skip deleted pages, rewriting them doesn't seem useful; in a better
     * world we'd write the parent to delete the page.
     */
    if (ref->state == WT_REF_DELETED) {
        *skipp = true;
        return (0);
    }

    *skipp = false;    /* Default to reading. */

    /*
     * If the page is in-memory, we want to look at it (it may have been
     * modified and written, and the current location is the interesting
     * one in terms of compaction, not the original location).
     *
     * This test could be combined with the next one, but this is a cheap
     * test and the next one is expensive.
     */
    if (ref->state != WT_REF_DISK)
        return (0);

    /*
     * There's nothing to prevent the WT_REF state from changing underfoot,
     * which can change its address. For example, the WT_REF address might
     * reference an on-page cell, and page eviction can free that memory.
     * Lock the WT_REF so we can look at its address.
     */
    if (!__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
        return (0);

    /*
     * The page is on disk, so there had better be an address; assert that
     * fact, test at run-time to avoid the core dump.
     *
     * Internal pages must be read to walk the tree; ask the block manager
     * if it's useful to rewrite leaf pages, don't do the I/O if a rewrite
     * won't help.
     */
    __wt_ref_info(ref, &addr, &addr_size, &type);
    WT_ASSERT(session, addr != NULL);
    if (addr != NULL && type != WT_CELL_ADDR_INT) {
        bm = S2BT(session)->bm;
        ret = bm->compact_page_skip(
            bm, session, addr, addr_size, skipp);
    }

    /*
     * Reset the WT_REF state and push the change. The full barrier isn't
     * necessary, but it's better to keep pages in circulation than not.
     */
    ref->state = WT_REF_DISK;
    WT_FULL_BARRIER();

    return (ret);
}
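/*
 * Illustrative sketch (not WiredTiger source) of the CAS state-lock
 * pattern used above: take a short-term lock by swinging a state word, do
 * the read, then restore the state and push the change so blocked readers
 * retry promptly. Names are assumptions for illustration.
 */
#include <stdatomic.h>
#include <stdbool.h>

enum { SK_DISK, SK_LOCKED };

static bool
sk_inspect(atomic_int *state)
{
    int expected = SK_DISK;

    if (!atomic_compare_exchange_strong(state, &expected, SK_LOCKED))
        return (false);    /* someone else owns the state */

    /* ... examine the fields the state word protects ... */

    /* Restore and publish; the seq_cst store mirrors the full barrier. */
    atomic_store(state, SK_DISK);
    return (true);
}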