/*
 * __wt_checkpoint_signal --
 *     Signal the checkpoint thread if sufficient log has been written.
 *     Returns 0 on success (whether or not the thread was signalled), or an
 *     error from the condition-variable signal.
 */
int
__wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize)
{
    WT_CONNECTION_IMPL *conn;

    conn = S2C(session);

    /* Only called when log-size-triggered checkpoints are configured. */
    WT_ASSERT(session, WT_CKPT_LOGSIZE(conn));

    /*
     * Signal at most once per trigger: ckpt_signalled suppresses repeat
     * wakeups until the checkpoint thread resets it.
     * NOTE(review): the check-then-set of ckpt_signalled is not atomic;
     * presumably a duplicate signal is harmless — confirm with callers.
     */
    if (logsize >= conn->ckpt_logsize && !conn->ckpt_signalled) {
        WT_RET(__wt_cond_signal(session, conn->ckpt_cond));
        conn->ckpt_signalled = 1;
    }
    return (0);
}
/*
 * __wt_lsm_compact --
 *     Compact an LSM tree called via __wt_schema_worker.
 *     Waits for merge activity to quiesce, up to the session's configured
 *     compaction timeout; returns ETIMEDOUT if the limit is exceeded.
 */
int
__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
{
    WT_DECL_RET;
    WT_LSM_TREE *lsm_tree;
    uint64_t last_merge_progressing;
    time_t begin, end;

    /*
     * This function is applied to all matching sources: ignore anything
     * that is not an LSM tree.
     */
    if (!WT_PREFIX_MATCH(name, "lsm:"))
        return (0);

    /* Tell __wt_schema_worker not to look inside the LSM tree. */
    *skip = 1;

    WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));

    if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE) ||
        lsm_tree->merge_threads == 0)
        WT_RET_MSG(session, EINVAL,
            "LSM compaction requires active merge threads");

    WT_RET(__wt_seconds(session, &begin));

    F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);

    /*
     * Wake up the merge threads. From here on, every failure must go
     * through the err label (WT_ERR, not WT_RET) so the COMPACTING flag
     * is always cleared; returning directly would leave the tree marked
     * as compacting forever.
     */
    WT_ERR(__wt_cond_signal(session, lsm_tree->work_cond));

    /* Now wait for merge activity to stop. */
    do {
        last_merge_progressing = lsm_tree->merge_progressing;
        __wt_sleep(1, 0);
        WT_ERR(__wt_seconds(session, &end));
        /* Enforce the caller-configured compaction time limit. */
        if (session->compact->max_time > 0 &&
            session->compact->max_time < (uint64_t)(end - begin))
            WT_ERR(ETIMEDOUT);
    } while (lsm_tree->merge_progressing != last_merge_progressing &&
        lsm_tree->nchunks > 1);

err:
    F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
    return (ret);
}
/*
 * __thread_group_shrink --
 *     Decrease the number of running threads in the group, and free any
 *     memory associated with slots larger than the new count.
 */
static int
__thread_group_shrink(
    WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, uint32_t new_count)
{
    WT_DECL_RET;
    WT_SESSION *wt_session;
    WT_THREAD *t;
    uint32_t slot;

    /* Callers must hold the group lock while resizing. */
    WT_ASSERT(session, __wt_rwlock_islocked(session, group->lock));

    /*
     * Walk down from the allocation count (a counter, one past the last
     * array index) releasing each populated slot above the new size.
     */
    slot = group->alloc;
    while (slot > new_count) {
        if ((t = group->threads[--slot]) == NULL)
            continue;

        /* Stop the thread: clear its run flag, wake it, reap it. */
        if (t->tid != 0) {
            __wt_verbose(session, WT_VERB_THREAD_GROUP,
                "Stopping utility thread: %p:%" PRIu32,
                (void *)group, t->id);
            F_CLR(t, WT_THREAD_RUN);
            __wt_cond_signal(session, group->wait_cond);
            WT_TRET(__wt_thread_join(session, t->tid));
            t->tid = 0;
        }

        /* Close the thread's session, if it had one. */
        if (t->session != NULL) {
            wt_session = (WT_SESSION *)t->session;
            WT_TRET(wt_session->close(wt_session, NULL));
            t->session = NULL;
        }

        /* Release the slot itself. */
        __wt_free(session, t);
        group->threads[slot] = NULL;
    }

    /* Record how many threads remain active. */
    group->current_threads = slot;

    return (ret);
}
/*
 * __conn_reconfigure --
 *     WT_CONNECTION->reconfigure method.
 */
static int
__conn_reconfigure(WT_CONNECTION *wt_conn, const char *config)
{
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    /*
     * Special version of cfg that doesn't include the default config: used
     * to limit changes to values that the application sets explicitly.
     * Note that any function using this value has to be prepared to handle
     * not-found as a valid option return.
     */
    const char *raw_cfg[] = { config, NULL };

    conn = (WT_CONNECTION_IMPL *)wt_conn;

    /* NOTE: this macro declares "cfg" and opens the err-label scope. */
    CONNECTION_API_CALL(conn, session, reconfigure, config, cfg);

    /* Turning on statistics clears any existing values. */
    if ((ret = __wt_config_gets(session,
        raw_cfg, "statistics", &cval)) == 0) {
        conn->statistics = cval.val == 0 ? 0 : 1;
        if (conn->statistics)
            __wt_stat_clear_connection_stats(&conn->stats);
    }
    /* A missing "statistics" key is not an error with raw_cfg. */
    WT_ERR_NOTFOUND_OK(ret);

    WT_ERR(__wt_conn_cache_pool_config(session, cfg));
    WT_ERR(__wt_cache_config(conn, raw_cfg));
    WT_ERR(__conn_verbose_config(session, raw_cfg));

    /* Wake up the cache pool server so any changes are noticed. */
    if (F_ISSET(conn, WT_CONN_CACHE_POOL))
        WT_ERR(__wt_cond_signal(
            session, __wt_process.cache_pool->cache_pool_cond));

err:    API_END(session);
    return (ret);
}
/*
 * __wt_checkpoint_server_destroy --
 *     Destroy the checkpoint server thread.
 */
int
__wt_checkpoint_server_destroy(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_SESSION *wt_session;

    conn = S2C(session);

    /* Tell the server to exit, wake it, then reap the thread. */
    F_CLR(conn, WT_CONN_SERVER_CHECKPOINT);
    if (conn->ckpt_tid_set) {
        WT_TRET(__wt_cond_signal(session, conn->ckpt_cond));
        WT_TRET(__wt_thread_join(session, conn->ckpt_tid));
        conn->ckpt_tid_set = 0;
    }
    WT_TRET(__wt_cond_destroy(session, &conn->ckpt_cond));

    __wt_free(session, conn->ckpt_config);

    /* Close the server thread's session. */
    if (conn->ckpt_session != NULL) {
        wt_session = &conn->ckpt_session->iface;
        WT_TRET(wt_session->close(wt_session, NULL));
    }

    /*
     * Reset all of the checkpoint server state so a subsequent
     * reconfigure starts from a clean slate.
     */
    conn->ckpt_session = NULL;
    conn->ckpt_tid_set = 0;
    conn->ckpt_cond = NULL;
    conn->ckpt_config = NULL;
    conn->ckpt_usecs = 0;

    return (ret);
}
/*
 * __log_wrlsn_server --
 *     The log wrlsn server thread: advance the log's write LSN over slots
 *     that have been written, in release-LSN order, signalling waiters as
 *     it goes.
 */
static WT_THREAD_RET
__log_wrlsn_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL];
    WT_LOGSLOT *slot;
    WT_SESSION_IMPL *session;
    size_t written_i;
    uint32_t i, save_i;
    int yield;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    yield = 0;
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * No need to use the log_slot_lock because the slot pool
         * is statically allocated and any slot in the
         * WT_LOG_SLOT_WRITTEN state is exclusively ours for now.
         */
        i = 0;
        written_i = 0;
        /*
         * Walk the array once saving any slots that are in the
         * WT_LOG_SLOT_WRITTEN state.
         */
        while (i < WT_SLOT_POOL) {
            save_i = i;
            slot = &log->slot_pool[i++];
            if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
                continue;
            written[written_i].slot_index = save_i;
            written[written_i++].lsn = slot->slot_release_lsn;
        }
        /*
         * If we found any written slots process them. We sort them
         * based on the release LSN, and then look for them in order.
         */
        if (written_i > 0) {
            /* Reset the idle counter: there is work in flight. */
            yield = 0;
            WT_INSERTION_SORT(written, written_i,
                WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);

            /*
             * We know the written array is sorted by LSN. Go
             * through them either advancing write_lsn or stop
             * as soon as one is not in order.
             */
            for (i = 0; i < written_i; i++) {
                if (WT_LOG_CMP(&log->write_lsn,
                    &written[i].lsn) != 0)
                    break;
                /*
                 * If we get here we have a slot to process.
                 * Advance the LSN and process the slot.
                 */
                slot = &log->slot_pool[written[i].slot_index];
                WT_ASSERT(session, WT_LOG_CMP(&written[i].lsn,
                    &slot->slot_release_lsn) == 0);
                log->write_start_lsn = slot->slot_start_lsn;
                log->write_lsn = slot->slot_end_lsn;
                WT_ERR(__wt_cond_signal(
                    session, log->log_write_cond));
                WT_STAT_FAST_CONN_INCR(session, log_write_lsn);
                /*
                 * Signal the close thread if needed.
                 */
                if (F_ISSET(slot, WT_SLOT_CLOSEFH))
                    WT_ERR(__wt_cond_signal(
                        session, conn->log_file_cond));
                WT_ERR(__wt_log_slot_free(session, slot));
            }
        }
        /*
         * If we recently saw written slots, spin rather than sleep:
         * more work is likely imminent. After ~1000 idle passes fall
         * back to waiting on the condition variable.
         */
        if (yield++ < 1000)
            __wt_yield();
        else
            /* Wait until the next event. */
            WT_ERR(__wt_cond_wait(
                session, conn->log_wrlsn_cond, 100000));
    }

    if (0)
err:        __wt_err(session, ret, "log wrlsn server error");
    return (WT_THREAD_RET_VALUE);
}
/*
 * __log_file_server --
 *     The log file server thread. This worker thread manages
 *     log file operations such as closing and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *close_fh;
    WT_LOG *log;
    WT_LSN close_end_lsn, close_lsn, min_lsn;
    WT_SESSION_IMPL *session;
    int locked;        /* Tracks log_sync_lock ownership for err cleanup. */

    session = arg;
    conn = S2C(session);
    log = conn->log;
    locked = 0;
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * If there is a log file to close, make sure any outstanding
         * write operations have completed, then fsync and close it.
         * Only proceed once writers have moved past this file
         * (close_lsn.file < write_lsn.file).
         */
        if ((close_fh = log->log_close_fh) != NULL &&
            (ret = __wt_log_extract_lognum(session, close_fh->name,
            &close_lsn.file)) == 0 &&
            close_lsn.file < log->write_lsn.file) {
            /*
             * We've copied the file handle, clear out the one in
             * log structure to allow it to be set again.
             */
            log->log_close_fh = NULL;
            /*
             * Set the close_end_lsn to the LSN immediately after
             * ours. That is, the beginning of the next log file.
             * We need to know the LSN file number of our own close
             * in case earlier calls are still in progress and the
             * next one to move the sync_lsn into the next file for
             * later syncs.
             */
            close_lsn.offset = 0;
            close_end_lsn = close_lsn;
            close_end_lsn.file++;
            WT_ERR(__wt_fsync(session, close_fh));
            __wt_spin_lock(session, &log->log_sync_lock);
            locked = 1;
            WT_ERR(__wt_close(session, &close_fh));
            WT_ASSERT(session, WT_LOG_CMP(
                &close_end_lsn, &log->sync_lsn) >= 0);
            log->sync_lsn = close_end_lsn;
            WT_ERR(__wt_cond_signal(session, log->log_sync_cond));
            locked = 0;
            __wt_spin_unlock(session, &log->log_sync_lock);
        }
        /*
         * If a later thread asked for a background sync, do it now.
         */
        if (WT_LOG_CMP(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
            /*
             * Save the latest write LSN which is the minimum
             * we will have written to disk.
             */
            min_lsn = log->write_lsn;
            /*
             * The sync LSN we asked for better be smaller than
             * the current written LSN.
             */
            WT_ASSERT(session,
                WT_LOG_CMP(&log->bg_sync_lsn, &min_lsn) <= 0);
            WT_ERR(__wt_fsync(session, log->log_fh));
            __wt_spin_lock(session, &log->log_sync_lock);
            locked = 1;
            /*
             * The sync LSN could have advanced while we were
             * writing to disk.
             */
            if (WT_LOG_CMP(&log->sync_lsn, &min_lsn) <= 0) {
                log->sync_lsn = min_lsn;
                WT_ERR(__wt_cond_signal(
                    session, log->log_sync_cond));
            }
            locked = 0;
            __wt_spin_unlock(session, &log->log_sync_lock);
        }
        /* Wait until the next event. */
        WT_ERR(__wt_cond_wait(
            session, conn->log_file_cond, WT_MILLION));
    }

    if (0) {
err:        __wt_err(session, ret, "log close server error");
    }
    /* Drop the sync lock if an error path left it held. */
    if (locked)
        __wt_spin_unlock(session, &log->log_sync_lock);
    return (WT_THREAD_RET_VALUE);
}
/*
 * __log_release --
 *     Release a log slot: write any buffered records, wait for earlier
 *     slot groups to complete, advance write_lsn and, if the slot
 *     requested it, fsync the log and advance sync_lsn.
 */
static int
__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *close_fh;
    WT_LOG *log;
    WT_LSN sync_lsn;
    size_t write_size;
    WT_DECL_SPINLOCK_ID(id);        /* Must appear last */

    conn = S2C(session);
    log = conn->log;

    /*
     * If we're going to have to close our log file, make a local copy
     * of the file handle structure.
     */
    close_fh = NULL;
    if (F_ISSET(slot, SLOT_CLOSEFH)) {
        close_fh = log->log_close_fh;
        log->log_close_fh = NULL;
        F_CLR(slot, SLOT_CLOSEFH);
    }

    /* Write the buffered records */
    if (F_ISSET(slot, SLOT_BUFFERED)) {
        write_size = (size_t)
            (slot->slot_end_lsn.offset - slot->slot_start_offset);
        WT_ERR(__wt_write(session, slot->slot_fh,
            slot->slot_start_offset, write_size, slot->slot_buf.mem));
    }

    /*
     * Wait for earlier groups to finish, otherwise there could be holes
     * in the log file.
     */
    while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0)
        __wt_yield();
    log->write_lsn = slot->slot_end_lsn;
    /*
     * Try to consolidate calls to fsync to wait less. Acquire a spin lock
     * so that threads finishing writing to the log will wait while the
     * current fsync completes and advance log->write_lsn.
     */
    while (F_ISSET(slot, SLOT_SYNC) &&
        LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
        /* If the lock is busy, wait for whoever holds it to sync us. */
        if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
            (void)__wt_cond_wait(
                session, log->log_sync_cond, 10000);
            continue;
        }
        /*
         * Record the current end of log after we grabbed the lock.
         * That is how far our fsync call with guarantee.
         */
        sync_lsn = log->write_lsn;
        if (LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
            WT_STAT_FAST_CONN_INCR(session, log_sync);
            ret = __wt_fsync(session, log->log_fh);
            if (ret == 0) {
                F_CLR(slot, SLOT_SYNC);
                log->sync_lsn = sync_lsn;
                ret = __wt_cond_signal(
                    session, log->log_sync_cond);
            }
        }
        /* Unlock before checking ret so the lock is never leaked. */
        __wt_spin_unlock(session, &log->log_sync_lock);
        WT_ERR(ret);
    }
    /* Grow the slot buffer if the flag asked for it. */
    if (F_ISSET(slot, SLOT_BUF_GROW)) {
        WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
        F_CLR(slot, SLOT_BUF_GROW);
        WT_STAT_FAST_CONN_INCRV(session,
            log_buffer_size, slot->slot_buf.memsize);
        WT_ERR(__wt_buf_grow(session,
            &slot->slot_buf, slot->slot_buf.memsize * 2));
    }
    /*
     * If we have a file to close, close it now.
     */
    if (close_fh)
        WT_ERR(__wt_close(session, close_fh));

err:    if (ret != 0 && slot->slot_error == 0)
        slot->slot_error = ret;
    return (ret);
}
/*
 * __wt_log_wrlsn --
 *     Process written log slots and attempt to coalesce them if the LSNs
 *     are contiguous. If free_i is non-NULL, it is set to the index of a
 *     free slot (or WT_SLOT_POOL if none was seen). If yield is non-NULL,
 *     it is cleared when written slots were found. Must be called with the
 *     log slot lock held.
 */
int
__wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL];
    WT_LOGSLOT *coalescing, *slot;
    size_t written_i;
    uint32_t i, save_i;

    conn = S2C(session);
    log = conn->log;
    coalescing = NULL;
    written_i = 0;
    i = 0;
    if (free_i != NULL)
        *free_i = WT_SLOT_POOL;

    /*
     * Walk the array once saving any slots that are in the
     * WT_LOG_SLOT_WRITTEN state.
     */
    while (i < WT_SLOT_POOL) {
        save_i = i;
        slot = &log->slot_pool[i++];
        if (free_i != NULL && *free_i == WT_SLOT_POOL &&
            slot->slot_state == WT_LOG_SLOT_FREE)
            *free_i = save_i;
        if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
            continue;
        written[written_i].slot_index = save_i;
        written[written_i++].lsn = slot->slot_release_lsn;
    }
    /*
     * If we found any written slots process them. We sort them
     * based on the release LSN, and then look for them in order.
     */
    if (written_i > 0) {
        /*
         * If wanted, reset the yield variable to indicate that we
         * have found written slots.
         */
        if (yield != NULL)
            *yield = 0;
        WT_INSERTION_SORT(written, written_i,
            WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);

        /*
         * We know the written array is sorted by LSN. Go
         * through them either advancing write_lsn or coalesce
         * contiguous ranges of written slots.
         */
        for (i = 0; i < written_i; i++) {
            slot = &log->slot_pool[written[i].slot_index];
            if (coalescing != NULL) {
                /* Not contiguous: restart coalescing here. */
                if (WT_LOG_CMP(&coalescing->slot_end_lsn,
                    &written[i].lsn) != 0) {
                    coalescing = slot;
                    continue;
                }
                /*
                 * If we get here we have a slot to coalesce
                 * and free.
                 */
                coalescing->slot_end_lsn = slot->slot_end_lsn;
                WT_STAT_FAST_CONN_INCR(
                    session, log_slot_coalesced);
                /*
                 * Copy the flag for later closing.
                 */
                if (F_ISSET(slot, WT_SLOT_CLOSEFH))
                    F_SET(coalescing, WT_SLOT_CLOSEFH);
            } else {
                /*
                 * If this written slot is not the next LSN,
                 * try to start coalescing with later slots.
                 */
                if (WT_LOG_CMP(
                    &log->write_lsn, &written[i].lsn) != 0) {
                    coalescing = slot;
                    continue;
                }
                /*
                 * If we get here we have a slot to process.
                 * Advance the LSN and process the slot.
                 */
                WT_ASSERT(session, WT_LOG_CMP(&written[i].lsn,
                    &slot->slot_release_lsn) == 0);
                log->write_start_lsn = slot->slot_start_lsn;
                log->write_lsn = slot->slot_end_lsn;
                WT_RET(__wt_cond_signal(
                    session, log->log_write_cond));
                WT_STAT_FAST_CONN_INCR(session, log_write_lsn);
                /*
                 * Signal the close thread if needed.
                 */
                if (F_ISSET(slot, WT_SLOT_CLOSEFH))
                    WT_RET(__wt_cond_signal(
                        session, conn->log_file_cond));
            }
            WT_RET(__wt_log_slot_free(session, slot));
            /*
             * Report the slot we just freed. Use this slot's own
             * pool index: save_i is stale here (it retains the
             * last value from the scan loop above, not the index
             * of the slot just freed).
             */
            if (free_i != NULL && *free_i == WT_SLOT_POOL &&
                slot->slot_state == WT_LOG_SLOT_FREE)
                *free_i = written[i].slot_index;
        }
    }
    return (0);
}
/*
 * __wt_logmgr_open --
 *     Start the log service threads.
 */
int
__wt_logmgr_open(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;

    conn = S2C(session);

    /* If no log thread services are configured, we're done. */
    if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
        return (0);

    /*
     * Start the log close thread. It is not configurable.
     * If logging is enabled, this thread runs.
     */
    WT_RET(__wt_open_internal_session(
        conn, "log-close-server", 0, 0, &conn->log_file_session));
    WT_RET(__wt_cond_alloc(conn->log_file_session,
        "log close server", 0, &conn->log_file_cond));

    /*
     * Start the log file close thread.
     */
    WT_RET(__wt_thread_create(conn->log_file_session,
        &conn->log_file_tid, __log_file_server, conn->log_file_session));
    conn->log_file_tid_set = 1;

    /*
     * Start the log write LSN thread. It is not configurable.
     * If logging is enabled, this thread runs.
     */
    WT_RET(__wt_open_internal_session(
        conn, "log-wrlsn-server", 0, 0, &conn->log_wrlsn_session));
    WT_RET(__wt_cond_alloc(conn->log_wrlsn_session,
        "log write lsn server", 0, &conn->log_wrlsn_cond));
    WT_RET(__wt_thread_create(conn->log_wrlsn_session,
        &conn->log_wrlsn_tid, __log_wrlsn_server,
        conn->log_wrlsn_session));
    conn->log_wrlsn_tid_set = 1;

    /* If no archive/prealloc services are configured, we're done. */
    if (!FLD_ISSET(conn->log_flags,
        (WT_CONN_LOG_ARCHIVE | WT_CONN_LOG_PREALLOC)))
        return (0);

    /*
     * If a log server thread exists, the user may have reconfigured
     * archiving or pre-allocation. Signal the thread. Otherwise the
     * user wants archiving and/or allocation and we need to start up
     * the thread.
     */
    if (conn->log_session != NULL) {
        WT_ASSERT(session, conn->log_cond != NULL);
        WT_ASSERT(session, conn->log_tid_set != 0);
        WT_RET(__wt_cond_signal(session, conn->log_cond));
    } else {
        /* The log server gets its own session. */
        WT_RET(__wt_open_internal_session(
            conn, "log-server", 0, 0, &conn->log_session));
        WT_RET(__wt_cond_alloc(conn->log_session,
            "log server", 0, &conn->log_cond));

        /*
         * Start the thread.
         */
        WT_RET(__wt_thread_create(conn->log_session,
            &conn->log_tid, __log_server, conn->log_session));
        conn->log_tid_set = 1;
    }

    return (0);
}
/*
 * __log_slot_new --
 *     Find a free slot and switch it as the new active slot.
 *     Must be called holding the slot lock.
 */
static int
__log_slot_new(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    WT_LOGSLOT *slot;
    int32_t i, pool_i;
#ifdef HAVE_DIAGNOSTIC
    uint64_t time_start, time_stop;
    int count;
#endif

    WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
    conn = S2C(session);
    log = conn->log;
    /*
     * Although this function is single threaded, multiple threads could
     * be trying to set a new active slot sequentially. If we find an
     * active slot that is valid, return.
     */
    if ((slot = log->active_slot) != NULL &&
        WT_LOG_SLOT_OPEN(slot->slot_state))
        return (0);

#ifdef HAVE_DIAGNOSTIC
    count = 0;
    time_start = __wt_clock(session);
#endif
    /*
     * Keep trying until we can find a free slot.
     */
    for (;;) {
        /*
         * Rotate among the slots to lessen collisions.
         */
        WT_RET(WT_SESSION_CHECK_PANIC(session));
        for (i = 0, pool_i = log->pool_index; i < WT_SLOT_POOL;
            i++, pool_i++) {
            if (pool_i >= WT_SLOT_POOL)
                pool_i = 0;
            slot = &log->slot_pool[pool_i];
            if (slot->slot_state == WT_LOG_SLOT_FREE) {
                /*
                 * Acquire our starting position in the
                 * log file. Assume the full buffer size.
                 */
                WT_RET(__wt_log_acquire(session,
                    log->slot_buf_size, slot));
                /*
                 * We have a new, initialized slot to use.
                 * Set it as the active slot.
                 */
                log->active_slot = slot;
                log->pool_index = pool_i;
                return (0);
            }
        }
        /*
         * If we didn't find any free slots signal the worker thread.
         * Then spin (__wt_yield) and rescan the pool.
         */
        WT_STAT_CONN_INCR(session, log_slot_no_free_slots);
        __wt_cond_signal(session, conn->log_wrlsn_cond);
        __wt_yield();
#ifdef HAVE_DIAGNOSTIC
        /*
         * In diagnostic builds, abort if we spin for more than ten
         * seconds without finding a free slot.
         */
        ++count;
        if (count > WT_MILLION) {
            time_stop = __wt_clock(session);
            if (WT_CLOCKDIFF_SEC(time_stop, time_start) > 10) {
                __wt_errx(session,
                    "SLOT_NEW: Timeout free slot");
                __log_slot_dump(session);
                __wt_abort(session);
            }
            count = 0;
        }
#endif
    }
    /* NOTREACHED */
}
/*
 * __logmgr_config --
 *     Parse and setup the logging server options.
 */
static int
__logmgr_config(
    WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool reconfig)
{
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    bool enabled;

    /*
     * A note on reconfiguration: the standard "is this configuration string
     * allowed" checks should fail if reconfiguration has invalid strings,
     * for example, "log=(enabled)", or "statistics_log=(path=XXX)", because
     * the connection reconfiguration method doesn't allow those strings.
     * Additionally, the base configuration values during reconfiguration
     * are the currently configured values (so we don't revert to default
     * values when repeatedly reconfiguring), and configuration processing
     * of a currently set value should not change the currently set value.
     *
     * In this code path, log server reconfiguration does not stop/restart
     * the log server, so there's no point in re-evaluating configuration
     * strings that cannot be reconfigured, risking bugs in configuration
     * setup, and depending on evaluation of currently set values to always
     * result in the currently set value. Skip tests for any configuration
     * strings which don't make sense during reconfiguration, but don't
     * worry about error reporting because it should never happen.
     */

    conn = S2C(session);

    WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
    enabled = cval.val != 0;

    /*
     * If we're reconfiguring, enabled must match the already
     * existing setting.
     *
     * If it is off and the user it turning it on, or it is on
     * and the user is turning it off, return an error.
     *
     * See above: should never happen.
     */
    if (reconfig &&
        ((enabled && !FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) ||
        (!enabled && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))))
        WT_RET_MSG(session, EINVAL,
            "log manager reconfigure: enabled mismatch with existing "
            "setting");

    /* Logging is incompatible with in-memory */
    if (enabled) {
        WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval));
        if (cval.val != 0)
            WT_RET_MSG(session, EINVAL,
                "In-memory configuration incompatible with "
                "log=(enabled=true)");
    }

    *runp = enabled;

    /*
     * Setup a log path and compression even if logging is disabled in case
     * we are going to print a log. Only do this on creation. Once a
     * compressor or log path are set they cannot be changed.
     *
     * See above: should never happen.
     */
    if (!reconfig) {
        conn->log_compressor = NULL;
        WT_RET(__wt_config_gets_none(
            session, cfg, "log.compressor", &cval));
        WT_RET(__wt_compressor_config(
            session, &cval, &conn->log_compressor));

        WT_RET(__wt_config_gets(session, cfg, "log.path", &cval));
        WT_RET(__wt_strndup(
            session, cval.str, cval.len, &conn->log_path));
    }

    /* We are done if logging isn't enabled. */
    if (!*runp)
        return (0);

    WT_RET(__wt_config_gets(session, cfg, "log.archive", &cval));
    if (cval.val != 0)
        FLD_SET(conn->log_flags, WT_CONN_LOG_ARCHIVE);

    /*
     * The file size cannot be reconfigured. The amount of memory allocated
     * to the log slots may be based on the log file size at creation and we
     * don't want to re-allocate that memory while running.
     *
     * See above: should never happen.
     */
    if (!reconfig) {
        WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval));
        conn->log_file_max = (wt_off_t)cval.val;
        WT_STAT_CONN_SET(session,
            log_max_filesize, conn->log_file_max);
    }

    /*
     * If pre-allocation is configured, set the initial number to a few.
     * We'll adapt as load dictates.
     */
    WT_RET(__wt_config_gets(session, cfg, "log.prealloc", &cval));
    if (cval.val != 0)
        conn->log_prealloc = 1;

    /*
     * Note it's meaningless to reconfigure this value during runtime, it
     * only matters on create before recovery runs.
     *
     * See above: should never happen.
     */
    if (!reconfig) {
        WT_RET(__wt_config_gets_def(
            session, cfg, "log.recover", 0, &cval));
        if (WT_STRING_MATCH("error", cval.str, cval.len))
            FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR);
    }

    WT_RET(__wt_config_gets(session, cfg, "log.zero_fill", &cval));
    if (cval.val != 0) {
        if (F_ISSET(conn, WT_CONN_READONLY))
            WT_RET_MSG(session, EINVAL,
                "Read-only configuration incompatible with "
                "zero-filling log files");
        FLD_SET(conn->log_flags, WT_CONN_LOG_ZERO_FILL);
    }

    WT_RET(__logmgr_sync_cfg(session, cfg));
    /* Wake the log server so it notices any reconfigured values. */
    if (conn->log_cond != NULL)
        __wt_cond_signal(session, conn->log_cond);
    return (0);
}
/*
 * __wt_logmgr_open --
 *     Start the log service threads.
 */
int
__wt_logmgr_open(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    uint32_t session_flags;

    conn = S2C(session);

    /* If no log thread services are configured, we're done. */
    if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
        return (0);

    F_SET(conn, WT_CONN_SERVER_LOG);

    /*
     * Start the log close thread. It is not configurable.
     * If logging is enabled, this thread runs.
     */
    session_flags = WT_SESSION_NO_DATA_HANDLES;
    WT_RET(__wt_open_internal_session(conn, "log-close-server",
        false, session_flags, &conn->log_file_session));
    WT_RET(__wt_cond_alloc(
        conn->log_file_session, "log close server",
        &conn->log_file_cond));

    /*
     * Start the log file close thread.
     */
    WT_RET(__wt_thread_create(conn->log_file_session,
        &conn->log_file_tid, __log_file_server, conn->log_file_session));
    conn->log_file_tid_set = true;

    /*
     * Start the log write LSN thread. It is not configurable.
     * If logging is enabled, this thread runs.
     */
    WT_RET(__wt_open_internal_session(conn, "log-wrlsn-server",
        false, session_flags, &conn->log_wrlsn_session));
    WT_RET(__wt_cond_auto_alloc(conn->log_wrlsn_session,
        "log write lsn server", 10000, WT_MILLION,
        &conn->log_wrlsn_cond));
    WT_RET(__wt_thread_create(conn->log_wrlsn_session,
        &conn->log_wrlsn_tid, __log_wrlsn_server,
        conn->log_wrlsn_session));
    conn->log_wrlsn_tid_set = true;

    /*
     * If a log server thread exists, the user may have reconfigured
     * archiving or pre-allocation. Signal the thread. Otherwise the
     * user wants archiving and/or allocation and we need to start up
     * the thread.
     */
    if (conn->log_session != NULL) {
        WT_ASSERT(session, conn->log_cond != NULL);
        WT_ASSERT(session, conn->log_tid_set == true);
        __wt_cond_signal(session, conn->log_cond);
    } else {
        /* The log server gets its own session.
         */
        WT_RET(__wt_open_internal_session(conn, "log-server",
            false, session_flags, &conn->log_session));
        WT_RET(__wt_cond_auto_alloc(conn->log_session,
            "log server", 50000, WT_MILLION, &conn->log_cond));

        /*
         * Start the thread.
         */
        WT_RET(__wt_thread_create(conn->log_session,
            &conn->log_tid, __log_server, conn->log_session));
        conn->log_tid_set = true;
    }

    return (0);
}
/*
 * __lsm_tree_close --
 *     Close an LSM tree structure: stop its worker and checkpoint threads,
 *     then close their sessions.
 */
static int
__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    WT_SESSION *wt_session;
    WT_SESSION_IMPL *s;
    uint32_t i;

    if (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
        F_CLR(lsm_tree, WT_LSM_TREE_WORKING);

        /*
         * Signal all threads to wake them up, then wait for them to
         * exit.
         *
         * !!!
         * If we have the schema lock, have the LSM worker sessions
         * inherit the flag before we do anything. The thread may
         * already be waiting for the schema lock, but the loop in the
         * WT_WITH_SCHEMA_LOCK macro takes care of that.
         */
        if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
            for (i = 0; i < lsm_tree->merge_threads; i++) {
                if ((s = lsm_tree->worker_sessions[i]) == NULL)
                    continue;
                if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED))
                    s->skip_schema_lock = 1;
                WT_TRET(__wt_cond_signal(
                    session, lsm_tree->work_cond));
                WT_TRET(__wt_thread_join(
                    session, lsm_tree->worker_tids[i]));
            }
        /* Do the same for the checkpoint thread. */
        if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED))
            lsm_tree->ckpt_session->skip_schema_lock = 1;
        WT_TRET(__wt_cond_signal(session, lsm_tree->work_cond));
        WT_TRET(__wt_thread_join(session, lsm_tree->ckpt_tid));
    }

    /*
     * Close the worker thread sessions. Do this in the main thread to
     * avoid deadlocks.
     */
    for (i = 0; i < lsm_tree->merge_threads; i++) {
        if ((s = lsm_tree->worker_sessions[i]) == NULL)
            continue;
        lsm_tree->worker_sessions[i] = NULL;

        wt_session = &s->iface;
        WT_TRET(wt_session->close(wt_session, NULL));
    }

    if (lsm_tree->ckpt_session != NULL) {
        wt_session = &lsm_tree->ckpt_session->iface;
        WT_TRET(wt_session->close(wt_session, NULL));
    }
    /* A failed shutdown leaves the tree in an unknown state: panic. */
    if (ret != 0) {
        __wt_err(session, ret, "shutdown error while cleaning up LSM");
        (void)__wt_panic(session);
    }

    return (ret);
}
/*
 * __clsm_open_cursors --
 *     Open cursors for the current set of files.
 *     update: opening for an update operation (otherwise a read);
 *     start_chunk/start_id: for merge cursors, the first chunk to open.
 */
static int
__clsm_open_cursors(
    WT_CURSOR_LSM *clsm, int update, u_int start_chunk, uint32_t start_id)
{
    WT_CURSOR *c, **cp, *primary;
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;
    WT_TXN *txn;
    const char *checkpoint, *ckpt_cfg[3];
    uint64_t saved_gen;
    u_int i, nchunks, ngood, nupdates;
    int locked;        /* Tracks LSM tree lock ownership for err cleanup. */

    c = &clsm->iface;
    session = (WT_SESSION_IMPL *)c->session;
    txn = &session->txn;
    lsm_tree = clsm->lsm_tree;
    chunk = NULL;

    ckpt_cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
    ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
    ckpt_cfg[2] = NULL;

    /* Copy the key, so we don't lose the cursor position. */
    if (F_ISSET(c, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&c->key))
        WT_RET(__wt_buf_set(
            session, &c->key, c->key.data, c->key.size));

    F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

    if (update) {
        if (txn->isolation == TXN_ISO_SNAPSHOT)
            F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);
    } else
        F_SET(clsm, WT_CLSM_OPEN_READ);

    WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0));
    locked = 1;

    /*
     * If there is no in-memory chunk in the tree for an update operation,
     * create one.
     *
     * !!!
     * It is exceeding unlikely that we get here at all, but if we were to
     * switch chunks in this thread and our transaction roll back, it would
     * leave the metadata inconsistent. Signal for the LSM worker thread
     * to create the chunk instead to avoid the issue.
     */
    if (update && (lsm_tree->nchunks == 0 ||
        (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) == NULL ||
        F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))) {
        /* Release our lock because switch will get a write lock. */
        F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
        locked = 0;
        WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
        WT_ERR(__wt_cond_signal(session, lsm_tree->work_cond));

        /*
         * Give the worker thread a chance to run before locking the
         * tree again -- we will loop in __clsm_enter until there is an
         * in-memory chunk in the tree.
         */
        __wt_sleep(0, 1000);
        WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
        locked = 1;
    }
    F_SET(session, WT_SESSION_NO_CACHE_CHECK);

    /* Merge cursors have already figured out how many chunks they need. */
retry:    if (F_ISSET(clsm, WT_CLSM_MERGE)) {
        nchunks = clsm->nchunks;
        ngood = 0;

        /*
         * We may have raced with another merge completing. Check that
         * we're starting at the right offset in the chunk array.
         */
        if (start_chunk >= lsm_tree->nchunks ||
            lsm_tree->chunk[start_chunk]->id != start_id) {
            for (start_chunk = 0;
                start_chunk < lsm_tree->nchunks;
                start_chunk++) {
                chunk = lsm_tree->chunk[start_chunk];
                if (chunk->id == start_id)
                    break;
            }
            /* We have to find the start chunk: merge locked it. */
            WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
        }

        WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
    } else {
        nchunks = lsm_tree->nchunks;

        /*
         * If we are only opening the cursor for updates, only open the
         * primary chunk, plus any other chunks that might be required
         * to detect snapshot isolation conflicts.
         */
        if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
            WT_ERR(__wt_realloc_def(session,
                &clsm->txnid_alloc, nchunks,
                &clsm->txnid_max));
        if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
            ngood = nupdates = 0;
        else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
            /*
             * Keep going until all updates in the next
             * chunk are globally visible. Copy the maximum
             * transaction IDs into the cursor as we go.
             */
            for (ngood = nchunks - 1, nupdates = 1;
                ngood > 0;
                ngood--, nupdates++) {
                chunk = lsm_tree->chunk[ngood - 1];
                clsm->txnid_max[ngood - 1] =
                    chunk->txnid_max;
                if (__wt_txn_visible_all(
                    session, chunk->txnid_max))
                    break;
            }
        } else {
            nupdates = 1;
            ngood = nchunks - 1;
        }

        /* Check how many cursors are already open. */
        for (cp = clsm->cursors + ngood;
            ngood < clsm->nchunks && ngood < nchunks;
            cp++, ngood++) {
            chunk = lsm_tree->chunk[ngood];

            /* If the cursor isn't open yet, we're done. */
            if (*cp == NULL)
                break;

            /* Easy case: the URIs don't match.
             */
            if (strcmp((*cp)->uri, chunk->uri) != 0)
                break;

            /* Make sure the checkpoint config matches. */
            checkpoint = ((WT_CURSOR_BTREE *)*cp)->
                btree->dhandle->checkpoint;
            if (checkpoint == NULL &&
                F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
                !chunk->empty)
                break;

            /* Make sure the Bloom config matches. */
            if (clsm->blooms[ngood] == NULL &&
                F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
                break;
        }

        /* Spurious generation bump? */
        if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
            clsm->dsk_gen = lsm_tree->dsk_gen;
            goto err;
        }

        /*
         * Close any cursors we no longer need. If the cursor is a
         * pure update cursor, close everything -- we usually only need
         * a single chunk open in that case and we haven't walked all
         * of the other slots in the loop above.
         *
         * Drop the LSM tree lock while we do this: if the cache is
         * full, we may block while closing a cursor. Save the
         * generation number and retry if it has changed under us.
         */
        if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0)
            ngood = 0;
        if (clsm->cursors != NULL && ngood < clsm->nchunks) {
            saved_gen = lsm_tree->dsk_gen;
            locked = 0;
            WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
            WT_ERR(__clsm_close_cursors(
                clsm, ngood, clsm->nchunks));
            WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
            locked = 1;
            if (lsm_tree->dsk_gen != saved_gen)
                goto retry;
        }

        /* Detach from our old primary. */
        clsm->primary_chunk = NULL;
        clsm->current = NULL;
    }

    WT_ERR(__wt_realloc_def(session,
        &clsm->bloom_alloc, nchunks, &clsm->blooms));
    WT_ERR(__wt_realloc_def(session,
        &clsm->cursor_alloc, nchunks, &clsm->cursors));

    clsm->nchunks = nchunks;

    /* Open the cursors for chunks that have changed. */
    for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
        chunk = lsm_tree->chunk[i + start_chunk];
        /* Copy the maximum transaction ID. */
        if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
            clsm->txnid_max[i] = chunk->txnid_max;

        /*
         * Read from the checkpoint if the file has been written.
         * Once all cursors switch, the in-memory tree can be evicted.
         */
        WT_ASSERT(session, *cp == NULL);
        ret = __wt_open_cursor(session, chunk->uri, c,
            (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
            ckpt_cfg : NULL, cp);

        /*
         * XXX kludge: we may have an empty chunk where no checkpoint
         * was written. If so, try to open the ordinary handle on that
         * chunk instead.
         */
        if (ret == WT_NOTFOUND &&
            F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
            ret = __wt_open_cursor(
                session, chunk->uri, c, NULL, cp);
            if (ret == 0)
                chunk->empty = 1;
        }
        WT_ERR(ret);

        if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
            !F_ISSET(clsm, WT_CLSM_MERGE))
            WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
                lsm_tree->bloom_bit_count,
                lsm_tree->bloom_hash_count,
                c, &clsm->blooms[i]));

        /* Child cursors always use overwrite and raw mode. */
        F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
    }

    /* The last chunk is our new primary. */
    if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
        clsm->primary_chunk = chunk;
        primary = clsm->cursors[clsm->nchunks - 1];
        WT_WITH_BTREE(session,
            ((WT_CURSOR_BTREE *)(primary))->btree,
            __wt_btree_evictable(session, 0));
    }

    clsm->dsk_gen = lsm_tree->dsk_gen;
err:    F_CLR(session, WT_SESSION_NO_CACHE_CHECK);
#ifdef HAVE_DIAGNOSTIC
    /* Check that all cursors are open as expected. */
    if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
        for (i = 0, cp = clsm->cursors;
            i != clsm->nchunks; cp++, i++) {
            chunk = lsm_tree->chunk[i + start_chunk];

            /* Make sure the cursor is open. */
            WT_ASSERT(session, *cp != NULL);

            /* Easy case: the URIs should match. */
            WT_ASSERT(session,
                strcmp((*cp)->uri, chunk->uri) == 0);

            /* Make sure the checkpoint config matches. */
            checkpoint = ((WT_CURSOR_BTREE *)*cp)->
                btree->dhandle->checkpoint;
            WT_ASSERT(session,
                (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
                !chunk->empty) ?
                checkpoint != NULL : checkpoint == NULL);

            /* Make sure the Bloom config matches. */
            WT_ASSERT(session,
                (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
                !F_ISSET(clsm, WT_CLSM_MERGE)) ?
                clsm->blooms[i] != NULL :
                clsm->blooms[i] == NULL);
        }
    }
#endif
    if (locked)
        WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
    return (ret);
}
/*
 * __log_file_server --
 *	The log file server thread.  This worker thread manages
 *	log file operations such as closing and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *close_fh;		/* Local copy of the handle to close */
	WT_LOG *log;
	WT_LSN close_end_lsn, min_lsn;
	WT_SESSION_IMPL *session;
	uint32_t filenum;
	int locked;			/* Non-zero while log_sync_lock held */

	session = arg;
	conn = S2C(session);
	log = conn->log;
	locked = 0;
	/* Run until the connection asks log server threads to shut down. */
	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
		/*
		 * If there is a log file to close, make sure any outstanding
		 * write operations have completed, then fsync and close it.
		 */
		if ((close_fh = log->log_close_fh) != NULL) {
			WT_ERR(__wt_log_extract_lognum(
			    session, close_fh->name, &filenum));
			/*
			 * We update the close file handle before updating the
			 * close LSN when changing files.  It is possible we
			 * could see mismatched settings.  If we do, yield
			 * until it is set.  This should rarely happen.
			 */
			while (log->log_close_lsn.file < filenum)
				__wt_yield();

			/*
			 * Only proceed once everything up to the close LSN
			 * has actually been written out.
			 */
			if (__wt_log_cmp(
			    &log->write_lsn, &log->log_close_lsn) >= 0) {
				/*
				 * We've copied the file handle, clear out the
				 * one in the log structure to allow it to be
				 * set again.  Copy the LSN before clearing
				 * the file handle.
				 * Use a barrier to make sure the compiler does
				 * not reorder the following two statements.
				 */
				close_end_lsn = log->log_close_lsn;
				WT_FULL_BARRIER();
				log->log_close_fh = NULL;
				/*
				 * Set the close_end_lsn to the LSN immediately
				 * after ours.  That is, the beginning of the
				 * next log file.  We need to know the LSN
				 * file number of our own close in case earlier
				 * calls are still in progress and the next one
				 * to move the sync_lsn into the next file for
				 * later syncs.
				 */
				close_end_lsn.file++;
				close_end_lsn.offset = 0;
				WT_ERR(__wt_fsync(session, close_fh));
				/*
				 * Publish the new sync LSN and wake any
				 * threads waiting on the sync condition.
				 * sync_lsn is protected by log_sync_lock;
				 * the err path releases the lock if a
				 * WT_ERR jump happens while it is held.
				 */
				__wt_spin_lock(session, &log->log_sync_lock);
				locked = 1;
				WT_ERR(__wt_close(session, &close_fh));
				WT_ASSERT(session, __wt_log_cmp(
				    &close_end_lsn, &log->sync_lsn) >= 0);
				log->sync_lsn = close_end_lsn;
				WT_ERR(__wt_cond_signal(
				    session, log->log_sync_cond));
				locked = 0;
				__wt_spin_unlock(session, &log->log_sync_lock);
			}
		}
		/*
		 * If a later thread asked for a background sync, do it now.
		 */
		if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
			/*
			 * Save the latest write LSN which is the minimum
			 * we will have written to disk.
			 */
			min_lsn = log->write_lsn;
			/*
			 * We have to wait until the LSN we asked for is
			 * written.  If it isn't signal the wrlsn thread
			 * to get it written.
			 */
			if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
				WT_ERR(__wt_fsync(session, log->log_fh));
				__wt_spin_lock(session, &log->log_sync_lock);
				locked = 1;
				/*
				 * The sync LSN could have advanced while we
				 * were writing to disk.
				 */
				if (__wt_log_cmp(
				    &log->sync_lsn, &min_lsn) <= 0) {
					log->sync_lsn = min_lsn;
					WT_ERR(__wt_cond_signal(
					    session, log->log_sync_cond));
				}
				locked = 0;
				__wt_spin_unlock(session, &log->log_sync_lock);
			} else {
				WT_ERR(__wt_cond_signal(
				    session, conn->log_wrlsn_cond));
				/*
				 * We do not want to wait potentially a second
				 * to process this.  Yield to give the wrlsn
				 * thread a chance to run and try again in
				 * this case.
				 */
				__wt_yield();
				continue;
			}
		}
		/* Wait until the next event. */
		WT_ERR(__wt_cond_wait(
		    session, conn->log_file_cond, WT_MILLION));
	}

	if (0) {
err:		__wt_err(session, ret, "log close server error");
	}
	/* Drop the sync lock if an error path left it held. */
	if (locked)
		__wt_spin_unlock(session, &log->log_sync_lock);
	return (WT_THREAD_RET_VALUE);
}
/*
 * __wt_log_wrlsn --
 *	Process written log slots and attempt to coalesce them if the LSNs
 *	are contiguous.  The purpose of this function is to advance the
 *	write_lsn in LSN order after the buffer is written to the log file.
 */
int
__wt_log_wrlsn(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL];
	WT_LOGSLOT *coalescing, *slot;
	WT_LSN save_lsn;	/* write_lsn snapshot to detect races */
	size_t written_i;	/* Count of written slots found */
	uint32_t i, save_i;

	conn = S2C(session);
	log = conn->log;
	/* The whole pass runs under the write-LSN spinlock. */
	__wt_spin_lock(session, &log->log_writelsn_lock);
restart:
	coalescing = NULL;
	WT_INIT_LSN(&save_lsn);
	written_i = 0;
	i = 0;

	/*
	 * Walk the array once saving any slots that are in the
	 * WT_LOG_SLOT_WRITTEN state.
	 */
	while (i < WT_SLOT_POOL) {
		save_i = i;
		slot = &log->slot_pool[i++];
		/*
		 * XXX - During debugging I saw slot 0 become orphaned.
		 * I believe it is fixed, but check for now.
		 * This assertion should catch that.
		 */
		if (slot->slot_state == 0)
			WT_ASSERT(session,
			    slot->slot_release_lsn.file >=
			    log->write_lsn.file);
		if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
			continue;
		written[written_i].slot_index = save_i;
		written[written_i++].lsn = slot->slot_release_lsn;
	}
	/*
	 * If we found any written slots process them.  We sort them
	 * based on the release LSN, and then look for them in order.
	 */
	if (written_i > 0) {
		WT_INSERTION_SORT(written, written_i,
		    WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);

		/*
		 * We know the written array is sorted by LSN.  Go
		 * through them either advancing write_lsn or coalesce
		 * contiguous ranges of written slots.
		 */
		for (i = 0; i < written_i; i++) {
			slot = &log->slot_pool[written[i].slot_index];
			/*
			 * The log server thread pushes out slots periodically.
			 * Sometimes they are empty slots.  If we find an
			 * empty slot, where empty means the start and end LSN
			 * are the same, free it and continue.
			 */
			if (__wt_log_cmp(&slot->slot_start_lsn,
			    &slot->slot_release_lsn) == 0 &&
			    __wt_log_cmp(&slot->slot_start_lsn,
			    &slot->slot_end_lsn) == 0) {
				__wt_log_slot_free(session, slot);
				continue;
			}
			if (coalescing != NULL) {
				/*
				 * If the write_lsn changed, we may be able to
				 * process slots.  Try again.
				 */
				if (__wt_log_cmp(
				    &log->write_lsn, &save_lsn) != 0)
					goto restart;
				/*
				 * A gap in the range: start a new coalescing
				 * run from this slot.
				 */
				if (__wt_log_cmp(&coalescing->slot_end_lsn,
				    &written[i].lsn) != 0) {
					coalescing = slot;
					continue;
				}
				/*
				 * If we get here we have a slot to coalesce
				 * and free.
				 */
				coalescing->slot_last_offset =
				    slot->slot_last_offset;
				coalescing->slot_end_lsn = slot->slot_end_lsn;
				WT_STAT_FAST_CONN_INCR(
				    session, log_slot_coalesced);
				/*
				 * Copy the flag for later closing.
				 */
				if (F_ISSET(slot, WT_SLOT_CLOSEFH))
					F_SET(coalescing, WT_SLOT_CLOSEFH);
			} else {
				/*
				 * If this written slot is not the next LSN,
				 * try to start coalescing with later slots.
				 * A synchronous write may update write_lsn
				 * so save the last one we saw to check when
				 * coalescing slots.
				 */
				save_lsn = log->write_lsn;
				if (__wt_log_cmp(
				    &log->write_lsn, &written[i].lsn) != 0) {
					coalescing = slot;
					continue;
				}
				/*
				 * If we get here we have a slot to process.
				 * Advance the LSN and process the slot.
				 */
				WT_ASSERT(session,
				    __wt_log_cmp(&written[i].lsn,
				    &slot->slot_release_lsn) == 0);
				/*
				 * Use the last recorded offset as the slot's
				 * true start if they diverged.
				 */
				if (slot->slot_start_lsn.offset !=
				    slot->slot_last_offset)
					slot->slot_start_lsn.offset =
					    slot->slot_last_offset;
				log->write_start_lsn = slot->slot_start_lsn;
				log->write_lsn = slot->slot_end_lsn;
				/* Wake threads waiting for their write LSN. */
				WT_ERR(__wt_cond_signal(
				    session, log->log_write_cond));
				WT_STAT_FAST_CONN_INCR(session, log_write_lsn);
				/*
				 * Signal the close thread if needed.
				 */
				if (F_ISSET(slot, WT_SLOT_CLOSEFH))
					WT_ERR(__wt_cond_signal(
					    session, conn->log_file_cond));
			}
			__wt_log_slot_free(session, slot);
		}
	}
err:	__wt_spin_unlock(session, &log->log_writelsn_lock);
	return (ret);
}
/* * __log_file_server -- * The log file server thread. This worker thread manages * log file operations such as closing and syncing. */ static WT_THREAD_RET __log_file_server(void *arg) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_FH *close_fh; WT_LOG *log; WT_LSN close_end_lsn, min_lsn; WT_SESSION_IMPL *session; uint32_t filenum; bool locked; session = arg; conn = S2C(session); log = conn->log; locked = false; while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { /* * If there is a log file to close, make sure any outstanding * write operations have completed, then fsync and close it. */ if ((close_fh = log->log_close_fh) != NULL) { WT_ERR(__wt_log_extract_lognum(session, close_fh->name, &filenum)); /* * We update the close file handle before updating the * close LSN when changing files. It is possible we * could see mismatched settings. If we do, yield * until it is set. This should rarely happen. */ while (log->log_close_lsn.l.file < filenum) __wt_yield(); if (__wt_log_cmp( &log->write_lsn, &log->log_close_lsn) >= 0) { /* * We've copied the file handle, clear out the * one in the log structure to allow it to be * set again. Copy the LSN before clearing * the file handle. * Use a barrier to make sure the compiler does * not reorder the following two statements. */ close_end_lsn = log->log_close_lsn; WT_FULL_BARRIER(); log->log_close_fh = NULL; /* * Set the close_end_lsn to the LSN immediately * after ours. That is, the beginning of the * next log file. We need to know the LSN * file number of our own close in case earlier * calls are still in progress and the next one * to move the sync_lsn into the next file for * later syncs. */ WT_ERR(__wt_fsync(session, close_fh, true)); /* * We want to have the file size reflect actual * data with minimal pre-allocated zeroed space. * We can't truncate the file during hot backup, * or the underlying file system may not support * truncate: both are OK, it's just more work * during cursor traversal. 
*/ if (!conn->hot_backup) { __wt_readlock( session, conn->hot_backup_lock); if (!conn->hot_backup) WT_ERR_ERROR_OK( __wt_ftruncate(session, close_fh, close_end_lsn.l.offset), ENOTSUP); __wt_readunlock( session, conn->hot_backup_lock); } WT_SET_LSN(&close_end_lsn, close_end_lsn.l.file + 1, 0); __wt_spin_lock(session, &log->log_sync_lock); locked = true; WT_ERR(__wt_close(session, &close_fh)); WT_ASSERT(session, __wt_log_cmp( &close_end_lsn, &log->sync_lsn) >= 0); log->sync_lsn = close_end_lsn; __wt_cond_signal(session, log->log_sync_cond); locked = false; __wt_spin_unlock(session, &log->log_sync_lock); } } /* * If a later thread asked for a background sync, do it now. */ if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) { /* * Save the latest write LSN which is the minimum * we will have written to disk. */ min_lsn = log->write_lsn; /* * We have to wait until the LSN we asked for is * written. If it isn't signal the wrlsn thread * to get it written. * * We also have to wait for the written LSN and the * sync LSN to be in the same file so that we know we * have synchronized all earlier log files. */ if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) { /* * If the sync file is behind either the one * wanted for a background sync or the write LSN * has moved to another file continue to let * this worker thread process that older file * immediately. */ if ((log->sync_lsn.l.file < log->bg_sync_lsn.l.file) || (log->sync_lsn.l.file < min_lsn.l.file)) continue; WT_ERR(__wt_fsync(session, log->log_fh, true)); __wt_spin_lock(session, &log->log_sync_lock); locked = true; /* * The sync LSN could have advanced while we * were writing to disk. 
*/ if (__wt_log_cmp( &log->sync_lsn, &min_lsn) <= 0) { WT_ASSERT(session, min_lsn.l.file == log->sync_lsn.l.file); log->sync_lsn = min_lsn; __wt_cond_signal( session, log->log_sync_cond); } locked = false; __wt_spin_unlock(session, &log->log_sync_lock); } else { __wt_cond_auto_signal( session, conn->log_wrlsn_cond); /* * We do not want to wait potentially a second * to process this. Yield to give the wrlsn * thread a chance to run and try again in * this case. */ __wt_yield(); continue; } } /* Wait until the next event. */ __wt_cond_wait(session, conn->log_file_cond, WT_MILLION / 10); } if (0) { err: __wt_err(session, ret, "log close server error"); } if (locked) __wt_spin_unlock(session, &log->log_sync_lock); return (WT_THREAD_RET_VALUE); }
/*
 * __wt_logmgr_destroy --
 *	Destroy the log archiving server thread and logging subsystem.
 *
 *	Teardown order matters: each worker thread is signalled and joined
 *	before its session is closed, and the condition variables and locks
 *	are destroyed only after every thread has stopped.
 */
int
__wt_logmgr_destroy(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION *wt_session;

	conn = S2C(session);

	if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) {
		/*
		 * We always set up the log_path so printlog can work without
		 * recovery.  Therefore, always free it, even if logging isn't
		 * on.
		 */
		__wt_free(session, conn->log_path);
		return (0);
	}
	/* Stop the log server thread: wake it, then wait for it to exit. */
	if (conn->log_tid_set) {
		WT_TRET(__wt_cond_signal(session, conn->log_cond));
		WT_TRET(__wt_thread_join(session, conn->log_tid));
		conn->log_tid_set = 0;
	}
	/* Stop the log file server thread the same way. */
	if (conn->log_file_tid_set) {
		WT_TRET(__wt_cond_signal(session, conn->log_file_cond));
		WT_TRET(__wt_thread_join(session, conn->log_file_tid));
		conn->log_file_tid_set = 0;
	}
	/* Close the file server thread's session after the thread is gone. */
	if (conn->log_file_session != NULL) {
		wt_session = &conn->log_file_session->iface;
		WT_TRET(wt_session->close(wt_session, NULL));
		conn->log_file_session = NULL;
	}
	/* Stop the write-LSN thread and close its session. */
	if (conn->log_wrlsn_tid_set) {
		WT_TRET(__wt_cond_signal(session, conn->log_wrlsn_cond));
		WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid));
		conn->log_wrlsn_tid_set = 0;
	}
	if (conn->log_wrlsn_session != NULL) {
		wt_session = &conn->log_wrlsn_session->iface;
		WT_TRET(wt_session->close(wt_session, NULL));
		conn->log_wrlsn_session = NULL;
	}

	/* Tear down the slot pool and close the log file itself. */
	WT_TRET(__wt_log_slot_destroy(session));
	WT_TRET(__wt_log_close(session));

	/* Close the server thread's session. */
	if (conn->log_session != NULL) {
		wt_session = &conn->log_session->iface;
		WT_TRET(wt_session->close(wt_session, NULL));
		conn->log_session = NULL;
	}

	/* Destroy the condition variables now that all threads are stopped */
	WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
	WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
	WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));

	WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
	WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond));
	WT_TRET(__wt_rwlock_destroy(session, &conn->log->log_archive_lock));
	__wt_spin_destroy(session, &conn->log->log_lock);
	__wt_spin_destroy(session, &conn->log->log_slot_lock);
	__wt_spin_destroy(session, &conn->log->log_sync_lock);
	__wt_spin_destroy(session, &conn->log->log_writelsn_lock);
	__wt_free(session, conn->log_path);
	__wt_free(session, conn->log);
	/* ret accumulates the first failure seen via WT_TRET. */
	return (ret);
}
/*
 * __wt_meta_track_off --
 *	Turn off metadata operation tracking, unrolling on error.
 *
 *	On success the tracked operations are applied post-commit; on error
 *	or explicit unroll they are undone in reverse order.  Returns the
 *	first error saved before unrolling, if any.
 */
int
__wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll)
{
	WT_DECL_RET;
	WT_META_TRACK *trk, *trk_orig;
	WT_SESSION_IMPL *ckpt_session;
	int saved_ret;		/* Error preserved across the unroll pass */
	bool did_drop;		/* True if any tracked op was a drop commit */

	saved_ret = 0;

	WT_ASSERT(session,
	    WT_META_TRACKING(session) && session->meta_track_nest > 0);

	/* trk_orig..trk is the span of operations recorded at this level. */
	trk_orig = session->meta_track;
	trk = session->meta_track_next;

	/* If it was a nested transaction, there is nothing to do. */
	if (--session->meta_track_nest != 0)
		return (0);

	/* Turn off tracking for unroll. */
	session->meta_track_next = session->meta_track_sub = NULL;

	/*
	 * If there were no operations logged, skip unnecessary metadata
	 * checkpoints.  For example, this happens if attempting to create a
	 * data source that already exists (or drop one that doesn't).
	 */
	if (trk == trk_orig)
		goto err;

	/* Unrolling doesn't require syncing the metadata. */
	if (unroll)
		goto err;

	if (F_ISSET(session, WT_SESSION_SCHEMA_TXN)) {
		F_CLR(session, WT_SESSION_SCHEMA_TXN);
#ifdef WT_ENABLE_SCHEMA_TXN
		WT_ERR(__wt_txn_commit(session, NULL));
		__wt_errx(session, "TRACK: Commit internal schema txn");
#endif
	}

	/*
	 * If we don't have the metadata cursor (e.g, we're in the process of
	 * creating the metadata), we can't sync it.
	 */
	if (!need_sync || session->meta_cursor == NULL ||
	    F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
		goto err;

	/* If we're logging, make sure the metadata update was flushed. */
	if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED))
		WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session),
		    ret = __wt_txn_checkpoint_log(
			session, false, WT_TXN_LOG_CKPT_SYNC, NULL));
	else {
		WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
		/* Not logging: checkpoint the metadata via its own session. */
		ckpt_session = S2C(session)->meta_ckpt_session;
		/*
		 * If this operation is part of a running transaction, that
		 * should be included in the checkpoint.
		 */
		ckpt_session->txn.id = session->txn.id;
		WT_ASSERT(session,
		    !F_ISSET(session, WT_SESSION_LOCKED_METADATA));
		WT_WITH_DHANDLE(ckpt_session,
		    WT_SESSION_META_DHANDLE(session),
		    WT_WITH_METADATA_LOCK(ckpt_session,
			ret = __wt_checkpoint(ckpt_session, NULL)));
		ckpt_session->txn.id = WT_TXN_NONE;
		if (ret == 0)
			WT_WITH_DHANDLE(session,
			    WT_SESSION_META_DHANDLE(session),
			    ret = __wt_checkpoint_sync(session, NULL));
	}

err:	/*
	 * Undo any tracked operations on failure.
	 * Apply any tracked operations post-commit.
	 */
	did_drop = false;
	if (unroll || ret != 0) {
		/* Preserve the triggering error; unroll best-effort. */
		saved_ret = ret;
		ret = 0;
		/* Walk the track list backwards, undoing each operation. */
		while (--trk >= trk_orig) {
			did_drop = did_drop || trk->op == WT_ST_DROP_COMMIT;
			WT_TRET(__meta_track_unroll(session, trk));
		}
	} else
		/* Success: apply the operations in original order. */
		for (; trk_orig < trk; trk_orig++) {
			did_drop =
			    did_drop || trk_orig->op == WT_ST_DROP_COMMIT;
			WT_TRET(__meta_track_apply(session, trk_orig));
		}

	if (F_ISSET(session, WT_SESSION_SCHEMA_TXN)) {
		F_CLR(session, WT_SESSION_SCHEMA_TXN);
		/*
		 * We should have committed above unless we're unrolling, there
		 * was an error or the operation was a noop.
		 */
		WT_ASSERT(session, unroll || saved_ret != 0 ||
		    session->txn.mod_count == 0);
#ifdef WT_ENABLE_SCHEMA_TXN
		__wt_err(session, saved_ret,
		    "TRACK: Abort internal schema txn");
		WT_TRET(__wt_txn_rollback(session, NULL));
#endif
	}

	/*
	 * Wake up the sweep thread: particularly for the in-memory
	 * storage engine, we want to reclaim space immediately.
	 */
	if (did_drop && S2C(session)->sweep_cond != NULL)
		__wt_cond_signal(session, S2C(session)->sweep_cond);

	if (ret != 0)
		WT_PANIC_RET(session, ret,
		    "failed to apply or unroll all tracked operations");
	/* Report the original failure, if any, over the cleanup result. */
	return (saved_ret == 0 ? 0 : saved_ret);
}