/* * __logmgr_sync_cfg -- * Interpret the transaction_sync config. */ static int __logmgr_sync_cfg(WT_SESSION_IMPL *session, const char **cfg) { WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; conn = S2C(session); WT_RET( __wt_config_gets(session, cfg, "transaction_sync.enabled", &cval)); if (cval.val) FLD_SET(conn->txn_logsync, WT_LOG_SYNC_ENABLED); else FLD_CLR(conn->txn_logsync, WT_LOG_SYNC_ENABLED); WT_RET( __wt_config_gets(session, cfg, "transaction_sync.method", &cval)); FLD_CLR(conn->txn_logsync, WT_LOG_DSYNC | WT_LOG_FLUSH | WT_LOG_FSYNC); if (WT_STRING_MATCH("dsync", cval.str, cval.len)) FLD_SET(conn->txn_logsync, WT_LOG_DSYNC | WT_LOG_FLUSH); else if (WT_STRING_MATCH("fsync", cval.str, cval.len)) FLD_SET(conn->txn_logsync, WT_LOG_FSYNC); else if (WT_STRING_MATCH("none", cval.str, cval.len)) FLD_SET(conn->txn_logsync, WT_LOG_FLUSH); return (0); }
/* * __logmgr_config -- * Parse and setup the logging server options. */ static int __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, int *runp) { WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; conn = S2C(session); /* * The logging configuration is off by default. */ WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); *runp = cval.val != 0; /* * Setup a log path, compression and encryption even if logging is * disabled in case we are going to print a log. */ conn->log_compressor = NULL; WT_RET(__wt_config_gets_none(session, cfg, "log.compressor", &cval)); WT_RET(__wt_compressor_config(session, &cval, &conn->log_compressor)); WT_RET(__wt_config_gets(session, cfg, "log.path", &cval)); WT_RET(__wt_strndup(session, cval.str, cval.len, &conn->log_path)); /* We are done if logging isn't enabled. */ if (*runp == 0) return (0); WT_RET(__wt_config_gets(session, cfg, "log.archive", &cval)); if (cval.val != 0) FLD_SET(conn->log_flags, WT_CONN_LOG_ARCHIVE); WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval)); conn->log_file_max = (wt_off_t)cval.val; WT_STAT_FAST_CONN_SET(session, log_max_filesize, conn->log_file_max); WT_RET(__wt_config_gets(session, cfg, "log.prealloc", &cval)); /* * If pre-allocation is configured, set the initial number to one. * We'll adapt as load dictates. */ if (cval.val != 0) { FLD_SET(conn->log_flags, WT_CONN_LOG_PREALLOC); conn->log_prealloc = 1; } WT_RET(__wt_config_gets_def(session, cfg, "log.recover", 0, &cval)); if (cval.len != 0 && WT_STRING_MATCH("error", cval.str, cval.len)) FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR); WT_RET(__logmgr_sync_cfg(session, cfg)); return (0); }
/* * __wt_logmgr_create -- * Initialize the log subsystem (before running recovery). */ int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CONNECTION_IMPL *conn; WT_LOG *log; bool run; conn = S2C(session); /* Handle configuration. */ WT_RET(__logmgr_config(session, cfg, &run, false)); /* If logging is not configured, we're done. */ if (!run) return (0); FLD_SET(conn->log_flags, WT_CONN_LOG_ENABLED); /* * Logging is on, allocate the WT_LOG structure and open the log file. */ WT_RET(__wt_calloc_one(session, &conn->log)); log = conn->log; WT_RET(__wt_spin_init(session, &log->log_lock, "log")); WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot")); WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync")); WT_RET(__wt_spin_init(session, &log->log_writelsn_lock, "log write LSN")); WT_RET(__wt_rwlock_alloc(session, &log->log_archive_lock, "log archive lock")); if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG)) log->allocsize = WT_MAX((uint32_t)conn->buffer_alignment, WT_LOG_ALIGN); else log->allocsize = WT_LOG_ALIGN; WT_INIT_LSN(&log->alloc_lsn); WT_INIT_LSN(&log->ckpt_lsn); WT_INIT_LSN(&log->first_lsn); WT_INIT_LSN(&log->sync_lsn); /* * We only use file numbers for directory sync, so this needs to * initialized to zero. */ WT_ZERO_LSN(&log->sync_dir_lsn); WT_INIT_LSN(&log->trunc_lsn); WT_INIT_LSN(&log->write_lsn); WT_INIT_LSN(&log->write_start_lsn); log->fileid = 0; WT_RET(__wt_cond_alloc( session, "log sync", false, &log->log_sync_cond)); WT_RET(__wt_cond_alloc( session, "log write", false, &log->log_write_cond)); WT_RET(__wt_log_open(session)); WT_RET(__wt_log_slot_init(session)); return (0); }
/* * __conn_verbose_config -- * Set verbose configuration. */ static int __conn_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CONFIG_ITEM cval, sval; WT_CONNECTION_IMPL *conn; WT_DECL_RET; static struct { const char *name; uint32_t flag; } *ft, verbtypes[] = { { "block", WT_VERB_block }, { "shared_cache",WT_VERB_shared_cache }, { "ckpt", WT_VERB_ckpt }, { "evict", WT_VERB_evict }, { "evictserver",WT_VERB_evictserver }, { "fileops", WT_VERB_fileops }, { "hazard", WT_VERB_hazard }, { "lsm", WT_VERB_lsm }, { "mutex", WT_VERB_mutex }, { "read", WT_VERB_read }, { "reconcile", WT_VERB_reconcile }, { "salvage", WT_VERB_salvage }, { "verify", WT_VERB_verify }, { "write", WT_VERB_write }, { NULL, 0 } }; conn = S2C(session); if ((ret = __wt_config_gets(session, cfg, "verbose", &cval)) != 0) return (ret == WT_NOTFOUND ? 0 : ret); for (ft = verbtypes; ft->name != NULL; ft++) { if ((ret = __wt_config_subgets( session, &cval, ft->name, &sval)) == 0 && sval.val != 0) FLD_SET(conn->verbose, ft->flag); else FLD_CLR(conn->verbose, ft->flag); WT_RET_NOTFOUND_OK(ret); } return (0); }
/* * __wt_txn_recover -- * Run recovery. */ int __wt_txn_recover(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_CURSOR *metac; WT_DECL_RET; WT_RECOVERY r; struct WT_RECOVERY_FILE *metafile; char *config; bool eviction_started, needs_rec, was_backup; conn = S2C(session); WT_CLEAR(r); WT_INIT_LSN(&r.ckpt_lsn); eviction_started = false; was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP); /* We need a real session for recovery. */ WT_RET(__wt_open_internal_session(conn, "txn-recover", false, WT_SESSION_NO_LOGGING, &session)); r.session = session; F_SET(conn, WT_CONN_RECOVERING); WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac)); metafile = &r.files[WT_METAFILE_ID]; metafile->c = metac; /* * If no log was found (including if logging is disabled), or if the * last checkpoint was done with logging disabled, recovery should not * run. Scan the metadata to figure out the largest file ID. */ if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) || WT_IS_MAX_LSN(&metafile->ckpt_lsn)) { WT_ERR(__recovery_file_scan(&r)); conn->next_file_id = r.max_fileid; goto done; } /* * First, do a pass through the log to recover the metadata, and * establish the last checkpoint LSN. Skip this when opening a hot * backup: we already have the correct metadata in that case. */ if (!was_backup) { r.metadata_only = true; /* * If this is a read-only connection, check if the checkpoint * LSN in the metadata file is up to date, indicating a clean * shutdown. */ if (F_ISSET(conn, WT_CONN_READONLY)) { WT_ERR(__wt_log_needs_recovery( session, &metafile->ckpt_lsn, &needs_rec)); if (needs_rec) WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery"); } if (WT_IS_INIT_LSN(&metafile->ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r)); else { /* * Start at the last checkpoint LSN referenced in the * metadata. If we see the end of a checkpoint while * scanning, we will change the full scan to start from * there. */ r.ckpt_lsn = metafile->ckpt_lsn; ret = __wt_log_scan(session, &metafile->ckpt_lsn, 0, __txn_log_recover, &r); if (ret == ENOENT) ret = 0; WT_ERR(ret); } } /* Scan the metadata to find the live files and their IDs. */ WT_ERR(__recovery_file_scan(&r)); /* * We no longer need the metadata cursor: close it to avoid pinning any * resources that could block eviction during recovery. */ r.files[0].c = NULL; WT_ERR(metac->close(metac)); /* * Now, recover all the files apart from the metadata. * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. */ r.metadata_only = false; WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY, "Main recovery loop: starting at %" PRIu32 "/%" PRIu32, r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset)); WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); /* * Check if the database was shut down cleanly. If not * return an error if the user does not want automatic * recovery. */ if (needs_rec && (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) || F_ISSET(conn, WT_CONN_READONLY))) { if (F_ISSET(conn, WT_CONN_READONLY)) WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery"); WT_ERR(WT_RUN_RECOVERY); } if (F_ISSET(conn, WT_CONN_READONLY)) goto done; /* * Recovery can touch more data than fits in cache, so it relies on * regular eviction to manage paging. Start eviction threads for * recovery without LAS cursors. */ WT_ERR(__wt_evict_create(session)); eviction_started = true; /* * Always run recovery even if it was a clean shutdown only if * this is not a read-only connection. * We can consider skipping it in the future. */ if (WT_IS_INIT_LSN(&r.ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); else { ret = __wt_log_scan(session, &r.ckpt_lsn, WT_LOGSCAN_RECOVER, __txn_log_recover, &r); if (ret == ENOENT) ret = 0; WT_ERR(ret); } conn->next_file_id = r.max_fileid; /* * If recovery ran successfully forcibly log a checkpoint so the next * open is fast and keep the metadata up to date with the checkpoint * LSN and archiving. */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); done: FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE); err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); if (ret != 0) __wt_err(session, ret, "Recovery failed"); /* * Destroy the eviction threads that were started in support of * recovery. They will be restarted once the lookaside table is * created. */ if (eviction_started) WT_TRET(__wt_evict_destroy(session)); WT_TRET(session->iface.close(&session->iface, NULL)); F_CLR(conn, WT_CONN_RECOVERING); return (ret); }
/* * __wt_lsm_tree_create -- * Create an LSM tree structure for the given name. */ int __wt_lsm_tree_create(WT_SESSION_IMPL *session, const char *uri, int exclusive, const char *config) { WT_CONFIG_ITEM cval; WT_DECL_ITEM(buf); WT_DECL_RET; WT_LSM_TREE *lsm_tree; const char *cfg[] = API_CONF_DEFAULTS(session, create, config); const char *tmpconfig; /* If the tree is open, it already exists. */ if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) { __wt_lsm_tree_release(session, lsm_tree); return (exclusive ? EEXIST : 0); } WT_RET_NOTFOUND_OK(ret); /* * If the tree has metadata, it already exists. * * !!! * Use a local variable: we don't care what the existing configuration * is, but we don't want to overwrite the real config. */ if (__wt_metadata_read(session, uri, &tmpconfig) == 0) { __wt_free(session, tmpconfig); return (exclusive ? EEXIST : 0); } WT_RET_NOTFOUND_OK(ret); WT_RET(__wt_config_gets(session, cfg, "key_format", &cval)); if (WT_STRING_MATCH("r", cval.str, cval.len)) WT_RET_MSG(session, EINVAL, "LSM trees cannot be configured as column stores"); WT_RET(__wt_calloc_def(session, 1, &lsm_tree)); WT_RET(__lsm_tree_set_name(session, lsm_tree, uri)); WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->key_format)); WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->value_format)); WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom", &cval)); FLD_SET(lsm_tree->bloom, (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED)); WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_newest", &cval)); if (cval.val != 0) FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_NEWEST); WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_oldest", &cval)); if (cval.val != 0) FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST); if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_NEWEST) || FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))) WT_ERR_MSG(session, EINVAL, "Bloom filters can only be created on newest and oldest " "chunks if bloom filters are enabled"); WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_config", &cval)); if (cval.type == ITEM_STRUCT) { cval.str++; cval.len -= 2; } WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->bloom_config)); WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_bit_count", &cval)); lsm_tree->bloom_bit_count = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_hash_count", &cval)); lsm_tree->bloom_hash_count = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm_chunk_size", &cval)); lsm_tree->chunk_size = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm_merge_max", &cval)); lsm_tree->merge_max = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm_merge_threads", &cval)); lsm_tree->merge_threads = (uint32_t)cval.val; /* Sanity check that api_data.py is in sync with lsm.h */ WT_ASSERT(session, lsm_tree->merge_threads <= WT_LSM_MAX_WORKERS); WT_ERR(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_buf_fmt(session, buf, "%s,key_format=u,value_format=u", config)); lsm_tree->file_config = __wt_buf_steal(session, buf, NULL); /* Create the first chunk and flush the metadata. */ WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); /* Discard our partially populated handle. */ ret = __lsm_tree_discard(session, lsm_tree); lsm_tree = NULL; /* * Open our new tree and add it to the handle cache. Don't discard on * error: the returned handle is NULL on error, and the metadata * tracking macros handle cleaning up on failure. */ if (ret == 0) ret = __lsm_tree_open(session, uri, &lsm_tree); if (ret == 0) __wt_lsm_tree_release(session, lsm_tree); if (0) { err: WT_TRET(__lsm_tree_discard(session, lsm_tree)); } __wt_scr_free(&buf); return (ret); }
/* * __logmgr_config -- * Parse and setup the logging server options. */ static int __logmgr_config( WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool reconfig) { WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; bool enabled; conn = S2C(session); WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); enabled = cval.val != 0; /* * If we're reconfiguring, enabled must match the already * existing setting. * * If it is off and the user it turning it on, or it is on * and the user is turning it off, return an error. */ if (reconfig && ((enabled && !FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) || (!enabled && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)))) return (EINVAL); /* Logging is incompatible with in-memory */ if (enabled) { WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval)); if (cval.val != 0) WT_RET_MSG(session, EINVAL, "In memory configuration incompatible with " "log=(enabled=true)"); } *runp = enabled; /* * Setup a log path and compression even if logging is disabled in case * we are going to print a log. Only do this on creation. Once a * compressor or log path are set they cannot be changed. */ if (!reconfig) { conn->log_compressor = NULL; WT_RET(__wt_config_gets_none( session, cfg, "log.compressor", &cval)); WT_RET(__wt_compressor_config( session, &cval, &conn->log_compressor)); WT_RET(__wt_config_gets(session, cfg, "log.path", &cval)); WT_RET(__wt_strndup( session, cval.str, cval.len, &conn->log_path)); } /* We are done if logging isn't enabled. */ if (!*runp) return (0); WT_RET(__wt_config_gets(session, cfg, "log.archive", &cval)); if (cval.val != 0) FLD_SET(conn->log_flags, WT_CONN_LOG_ARCHIVE); if (!reconfig) { /* * Ignore if the user tries to change the file size. The * amount of memory allocated to the log slots may be based * on the log file size at creation and we don't want to * re-allocate that memory while running. */ WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval)); conn->log_file_max = (wt_off_t)cval.val; WT_STAT_FAST_CONN_SET(session, log_max_filesize, conn->log_file_max); } /* * If pre-allocation is configured, set the initial number to a few. * We'll adapt as load dictates. */ WT_RET(__wt_config_gets(session, cfg, "log.prealloc", &cval)); if (cval.val != 0) conn->log_prealloc = 1; /* * Note that it is meaningless to reconfigure this value during * runtime. It only matters on create before recovery runs. */ WT_RET(__wt_config_gets_def(session, cfg, "log.recover", 0, &cval)); if (cval.len != 0 && WT_STRING_MATCH("error", cval.str, cval.len)) FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR); WT_RET(__wt_config_gets(session, cfg, "log.zero_fill", &cval)); if (cval.val != 0) { if (F_ISSET(conn, WT_CONN_READONLY)) WT_RET_MSG(session, EINVAL, "Read-only configuration incompatible with " "zero-filling log files"); FLD_SET(conn->log_flags, WT_CONN_LOG_ZERO_FILL); } WT_RET(__logmgr_sync_cfg(session, cfg)); if (conn->log_cond != NULL) WT_RET(__wt_cond_auto_signal(session, conn->log_cond)); return (0); }
/* * wiredtiger_open -- * Main library entry point: open a new connection to a WiredTiger * database. */ int wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *config, WT_CONNECTION **wt_connp) { static WT_CONNECTION stdc = { __conn_close, __conn_reconfigure, __conn_get_home, __conn_is_new, __conn_open_session, __conn_load_extension, __conn_add_data_source, __conn_add_collator, __conn_add_compressor, __conn_add_extractor }; static struct { const char *name; uint32_t flag; } *ft, directio_types[] = { { "data", WT_DIRECTIO_DATA }, { "log", WT_DIRECTIO_LOG }, { NULL, 0 } }; WT_CONFIG subconfig; WT_CONFIG_ITEM cval, skey, sval; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_ITEM *cbuf, expath, exconfig; WT_SESSION_IMPL *session; const char *cfg[] = { __wt_confdfl_wiredtiger_open, config, NULL, NULL, NULL }; int exist; *wt_connp = NULL; session = NULL; cbuf = NULL; WT_CLEAR(expath); WT_CLEAR(exconfig); WT_RET(__wt_library_init()); WT_RET(__wt_calloc_def(NULL, 1, &conn)); conn->iface = stdc; /* * Immediately link the structure into the connection structure list: * the only thing ever looked at on that list is the database name, * and a NULL value is fine. */ __wt_spin_lock(NULL, &__wt_process.spinlock); TAILQ_INSERT_TAIL(&__wt_process.connqh, conn, q); __wt_spin_unlock(NULL, &__wt_process.spinlock); session = conn->default_session = &conn->dummy_session; session->iface.connection = &conn->iface; session->name = "wiredtiger_open"; __wt_event_handler_set(session, event_handler); /* Remaining basic initialization of the connection structure. */ WT_ERR(__wt_connection_init(conn)); /* Check the configuration strings. */ WT_ERR(__wt_config_check( session, __wt_confchk_wiredtiger_open, config, 0)); /* Get the database home. */ WT_ERR(__conn_home(session, home, cfg)); /* Make sure no other thread of control already owns this database. */ WT_ERR(__conn_single(session, cfg)); /* Read the database-home configuration file. */ WT_ERR(__conn_config_file(session, cfg, &cbuf)); /* Read the environment variable configuration. */ WT_ERR(__conn_config_env(session, cfg)); WT_ERR(__wt_config_gets(session, cfg, "hazard_max", &cval)); conn->hazard_max = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval)); conn->session_size = (uint32_t)cval.val + WT_NUM_INTERNAL_SESSIONS; WT_ERR(__wt_config_gets(session, cfg, "lsm_merge", &cval)); if (cval.val) F_SET(conn, WT_CONN_LSM_MERGE); WT_ERR(__wt_config_gets(session, cfg, "sync", &cval)); if (cval.val) F_SET(conn, WT_CONN_SYNC); WT_ERR(__wt_config_gets(session, cfg, "transactional", &cval)); if (cval.val) F_SET(conn, WT_CONN_TRANSACTIONAL); /* Configure verbose flags. */ WT_ERR(__conn_verbose_config(session, cfg)); WT_ERR(__wt_conn_cache_pool_config(session, cfg)); WT_ERR(__wt_config_gets(session, cfg, "logging", &cval)); if (cval.val != 0) WT_ERR(__wt_open( session, WT_LOG_FILENAME, 1, 0, 0, &conn->log_fh)); /* Configure direct I/O and buffer alignment. */ WT_ERR(__wt_config_gets(session, cfg, "buffer_alignment", &cval)); if (cval.val == -1) conn->buffer_alignment = WT_BUFFER_ALIGNMENT_DEFAULT; else conn->buffer_alignment = (size_t)cval.val; #ifndef HAVE_POSIX_MEMALIGN if (conn->buffer_alignment != 0) WT_ERR_MSG(session, EINVAL, "buffer_alignment requires posix_memalign"); #endif /* * Configuration: direct_io, mmap, statistics. */ WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval)); for (ft = directio_types; ft->name != NULL; ft++) { ret = __wt_config_subgets(session, &cval, ft->name, &sval); if (ret == 0) { if (sval.val) FLD_SET(conn->direct_io, ft->flag); } else if (ret != WT_NOTFOUND) goto err; } WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval)); conn->mmap = cval.val == 0 ? 0 : 1; WT_ERR(__wt_config_gets(session, cfg, "statistics", &cval)); conn->statistics = cval.val == 0 ? 0 : 1; /* Load any extensions referenced in the config. */ WT_ERR(__wt_config_gets(session, cfg, "extensions", &cval)); WT_ERR(__wt_config_subinit(session, &subconfig, &cval)); while ((ret = __wt_config_next(&subconfig, &skey, &sval)) == 0) { WT_ERR(__wt_buf_fmt( session, &expath, "%.*s", (int)skey.len, skey.str)); if (sval.len > 0) WT_ERR(__wt_buf_fmt(session, &exconfig, "entry=%.*s\n", (int)sval.len, sval.str)); WT_ERR(conn->iface.load_extension(&conn->iface, expath.data, (sval.len > 0) ? exconfig.data : NULL)); } WT_ERR_NOTFOUND_OK(ret); /* * Open the connection; if that fails, the connection handle has been * destroyed by the time the open function returns. */ if ((ret = __wt_connection_open(conn, cfg)) != 0) { conn = NULL; WT_ERR(ret); } /* Open the default session. */ WT_ERR(__wt_open_session(conn, 1, NULL, NULL, &conn->default_session)); session = conn->default_session; /* * Check on the turtle and metadata files, creating them if necessary * (which avoids application threads racing to create the metadata file * later). */ WT_ERR(__wt_meta_turtle_init(session, &exist)); if (!exist) { /* * We're single-threaded, but acquire the schema lock * regardless: the lower level code checks that it is * appropriately synchronized. */ WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_create(session, WT_METADATA_URI, NULL)); WT_ERR(ret); } WT_ERR(__wt_metadata_open(session)); /* If there's a hot-backup file, load it. */ WT_ERR(__wt_metadata_load_backup(session)); /* * XXX LSM initialization. * This is structured so that it could be moved to an extension. */ WT_ERR(__wt_lsm_init(&conn->iface, NULL)); STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0); *wt_connp = &conn->iface; /* * Destroying the connection on error will destroy our session handle, * cleanup using the session handle first, then discard the connection. */ err: if (cbuf != NULL) __wt_buf_free(session, cbuf); __wt_buf_free(session, &expath); __wt_buf_free(session, &exconfig); if (ret != 0 && conn != NULL) WT_TRET(__wt_connection_destroy(conn)); /* Let the server threads proceed. */ if (ret == 0) conn->connection_initialized = 1; return (ret); }
/* * __wt_lsm_tree_create -- * Create an LSM tree structure for the given name. */ int __wt_lsm_tree_create(WT_SESSION_IMPL *session, const char *uri, int exclusive, const char *config) { WT_CONFIG_ITEM cval; WT_DECL_ITEM(buf); WT_DECL_RET; WT_LSM_TREE *lsm_tree; const char *cfg[] = { WT_CONFIG_BASE(session, session_create), config, NULL }; char *tmpconfig; /* If the tree is open, it already exists. */ if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) { __wt_lsm_tree_release(session, lsm_tree); return (exclusive ? EEXIST : 0); } WT_RET_NOTFOUND_OK(ret); /* * If the tree has metadata, it already exists. * * !!! * Use a local variable: we don't care what the existing configuration * is, but we don't want to overwrite the real config. */ if (__wt_metadata_search(session, uri, &tmpconfig) == 0) { __wt_free(session, tmpconfig); return (exclusive ? EEXIST : 0); } WT_RET_NOTFOUND_OK(ret); WT_RET(__wt_config_gets(session, cfg, "key_format", &cval)); if (WT_STRING_MATCH("r", cval.str, cval.len)) WT_RET_MSG(session, EINVAL, "LSM trees cannot be configured as column stores"); WT_RET(__wt_calloc_def(session, 1, &lsm_tree)); WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval)); WT_ERR(__wt_strndup( session, cval.str, cval.len, &lsm_tree->key_format)); WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval)); WT_ERR(__wt_strndup( session, cval.str, cval.len, &lsm_tree->value_format)); WT_ERR(__wt_config_gets(session, cfg, "collator", &cval)); WT_ERR(__wt_strndup( session, cval.str, cval.len, &lsm_tree->collator_name)); WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval)); if (cval.val) F_SET(lsm_tree, WT_LSM_TREE_THROTTLE); else F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval)); FLD_SET(lsm_tree->bloom, (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED)); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval)); if (cval.val != 0) FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST); if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST)) WT_ERR_MSG(session, EINVAL, "Bloom filters can only be created on newest and oldest " "chunks if bloom filters are enabled"); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval)); if (cval.type == WT_CONFIG_ITEM_STRUCT) { cval.str++; cval.len -= 2; } WT_ERR(__wt_strndup( session, cval.str, cval.len, &lsm_tree->bloom_config)); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval)); lsm_tree->bloom_bit_count = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval)); lsm_tree->bloom_hash_count = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval)); lsm_tree->chunk_max = (uint64_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval)); lsm_tree->chunk_size = (uint64_t)cval.val; if (lsm_tree->chunk_size > lsm_tree->chunk_max) WT_ERR_MSG(session, EINVAL, "Chunk size (chunk_size) must be smaller than or equal to " "the maximum chunk size (chunk_max)"); WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval)); lsm_tree->merge_max = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_min", &cval)); lsm_tree->merge_min = (uint32_t)cval.val; if (lsm_tree->merge_min > lsm_tree->merge_max) WT_ERR_MSG(session, EINVAL, "LSM merge_min must be less than or equal to merge_max"); /* * Set up the config for each chunk. * * Make the memory_page_max double the chunk size, so application * threads don't immediately try to force evict the chunk when the * worker thread clears the NO_EVICTION flag. */ WT_ERR(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_buf_fmt(session, buf, "%s,key_format=u,value_format=u,memory_page_max=%" PRIu64, config, 2 * lsm_tree->chunk_max)); WT_ERR(__wt_strndup( session, buf->data, buf->size, &lsm_tree->file_config)); /* Create the first chunk and flush the metadata. */ WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); /* Discard our partially populated handle. */ ret = __lsm_tree_discard(session, lsm_tree); lsm_tree = NULL; /* * Open our new tree and add it to the handle cache. Don't discard on * error: the returned handle is NULL on error, and the metadata * tracking macros handle cleaning up on failure. */ if (ret == 0) ret = __lsm_tree_open(session, uri, &lsm_tree); if (ret == 0) __wt_lsm_tree_release(session, lsm_tree); if (0) { err: WT_TRET(__lsm_tree_discard(session, lsm_tree)); } __wt_scr_free(&buf); return (ret); }
/* * __wt_txn_recover -- * Run recovery. */ int __wt_txn_recover(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_CURSOR *metac; WT_DECL_RET; WT_RECOVERY r; WT_RECOVERY_FILE *metafile; char *config; bool do_checkpoint, eviction_started, needs_rec, was_backup; conn = S2C(session); WT_CLEAR(r); WT_INIT_LSN(&r.ckpt_lsn); config = NULL; do_checkpoint = true; eviction_started = false; was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP); /* We need a real session for recovery. */ WT_RET(__wt_open_internal_session(conn, "txn-recover", false, WT_SESSION_NO_LOGGING, &session)); r.session = session; WT_MAX_LSN(&r.max_ckpt_lsn); WT_MAX_LSN(&r.max_rec_lsn); conn->txn_global.recovery_timestamp = conn->txn_global.meta_ckpt_timestamp = 0; F_SET(conn, WT_CONN_RECOVERING); WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac)); metafile = &r.files[WT_METAFILE_ID]; metafile->c = metac; /* * If no log was found (including if logging is disabled), or if the * last checkpoint was done with logging disabled, recovery should not * run. Scan the metadata to figure out the largest file ID. */ if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_EXISTED) || WT_IS_MAX_LSN(&metafile->ckpt_lsn)) { /* * Detect if we're going from logging disabled to enabled. * We need to know this to verify LSNs and start at the correct * log file later. If someone ran with logging, then disabled * it and removed all the log files and then turned logging back * on, we have to start logging in the log file number that is * larger than any checkpoint LSN we have from the earlier time. */ WT_ERR(__recovery_file_scan(&r)); /* * The array can be re-allocated in recovery_file_scan. Reset * our pointer after scanning all the files. */ metafile = &r.files[WT_METAFILE_ID]; conn->next_file_id = r.max_fileid; if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && WT_IS_MAX_LSN(&metafile->ckpt_lsn) && !WT_IS_MAX_LSN(&r.max_ckpt_lsn)) WT_ERR(__wt_log_reset(session, r.max_ckpt_lsn.l.file)); else do_checkpoint = false; goto done; } /* * First, do a pass through the log to recover the metadata, and * establish the last checkpoint LSN. Skip this when opening a hot * backup: we already have the correct metadata in that case. * * If we're running with salvage and we hit an error, we ignore it * and continue. In salvage we want to recover whatever part of the * data we can from the last checkpoint up until whatever problem we * detect in the log file. In salvage, we ignore errors from scanning * the log so recovery can continue. Other errors remain errors. */ if (!was_backup) { r.metadata_only = true; /* * If this is a read-only connection, check if the checkpoint * LSN in the metadata file is up to date, indicating a clean * shutdown. */ if (F_ISSET(conn, WT_CONN_READONLY)) { WT_ERR(__wt_log_needs_recovery( session, &metafile->ckpt_lsn, &needs_rec)); if (needs_rec) WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery"); } if (WT_IS_INIT_LSN(&metafile->ckpt_lsn)) ret = __wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r); else { /* * Start at the last checkpoint LSN referenced in the * metadata. If we see the end of a checkpoint while * scanning, we will change the full scan to start from * there. */ r.ckpt_lsn = metafile->ckpt_lsn; ret = __wt_log_scan(session, &metafile->ckpt_lsn, 0, __txn_log_recover, &r); } if (F_ISSET(conn, WT_CONN_SALVAGE)) ret = 0; /* * If log scan couldn't find a file we expected to be around, * this indicates a corruption of some sort. */ if (ret == ENOENT) { F_SET(conn, WT_CONN_DATA_CORRUPTION); ret = WT_ERROR; } WT_ERR(ret); } /* Scan the metadata to find the live files and their IDs. */ WT_ERR(__recovery_file_scan(&r)); /* * Clear this out. We no longer need it and it could have been * re-allocated when scanning the files. */ WT_NOT_READ(metafile, NULL); /* * We no longer need the metadata cursor: close it to avoid pinning any * resources that could block eviction during recovery. */ r.files[0].c = NULL; WT_ERR(metac->close(metac)); /* * Now, recover all the files apart from the metadata. * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. */ r.metadata_only = false; __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS, "Main recovery loop: starting at %" PRIu32 "/%" PRIu32 " to %" PRIu32 "/%" PRIu32, r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset, r.max_rec_lsn.l.file, r.max_rec_lsn.l.offset); WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); /* * Check if the database was shut down cleanly. If not * return an error if the user does not want automatic * recovery. */ if (needs_rec && (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) || F_ISSET(conn, WT_CONN_READONLY))) { if (F_ISSET(conn, WT_CONN_READONLY)) WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery"); WT_ERR_MSG(session, WT_RUN_RECOVERY, "Database needs recovery"); } if (F_ISSET(conn, WT_CONN_READONLY)) { do_checkpoint = false; goto done; } /* * Recovery can touch more data than fits in cache, so it relies on * regular eviction to manage paging. Start eviction threads for * recovery without LAS cursors. */ WT_ERR(__wt_evict_create(session)); eviction_started = true; /* * Always run recovery even if it was a clean shutdown only if * this is not a read-only connection. * We can consider skipping it in the future. */ if (needs_rec) FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY); if (WT_IS_INIT_LSN(&r.ckpt_lsn)) ret = __wt_log_scan(session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r); else ret = __wt_log_scan(session, &r.ckpt_lsn, WT_LOGSCAN_RECOVER, __txn_log_recover, &r); if (F_ISSET(conn, WT_CONN_SALVAGE)) ret = 0; WT_ERR(ret); conn->next_file_id = r.max_fileid; done: WT_ERR(__recovery_set_checkpoint_timestamp(&r)); if (do_checkpoint) /* * Forcibly log a checkpoint so the next open is fast and keep * the metadata up to date with the checkpoint LSN and * archiving. */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); /* * If we're downgrading and have newer log files, force an archive, * no matter what the archive setting is. */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) WT_ERR(__wt_log_truncate_files(session, NULL, true)); FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE); err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); FLD_CLR(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY); if (ret != 0) { FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_FAILED); __wt_err(session, ret, "Recovery failed"); } /* * Destroy the eviction threads that were started in support of * recovery. They will be restarted once the lookaside table is * created. */ if (eviction_started) WT_TRET(__wt_evict_destroy(session)); WT_TRET(session->iface.close(&session->iface, NULL)); F_CLR(conn, WT_CONN_RECOVERING); return (ret); }
/* * __logmgr_config -- * Parse and setup the logging server options. */ static int __logmgr_config( WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool reconfig) { WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; bool enabled; /* * A note on reconfiguration: the standard "is this configuration string * allowed" checks should fail if reconfiguration has invalid strings, * for example, "log=(enabled)", or "statistics_log=(path=XXX)", because * the connection reconfiguration method doesn't allow those strings. * Additionally, the base configuration values during reconfiguration * are the currently configured values (so we don't revert to default * values when repeatedly reconfiguring), and configuration processing * of a currently set value should not change the currently set value. * * In this code path, log server reconfiguration does not stop/restart * the log server, so there's no point in re-evaluating configuration * strings that cannot be reconfigured, risking bugs in configuration * setup, and depending on evaluation of currently set values to always * result in the currently set value. Skip tests for any configuration * strings which don't make sense during reconfiguration, but don't * worry about error reporting because it should never happen. */ conn = S2C(session); WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); enabled = cval.val != 0; /* * If we're reconfiguring, enabled must match the already * existing setting. * * If it is off and the user it turning it on, or it is on * and the user is turning it off, return an error. * * See above: should never happen. */ if (reconfig && ((enabled && !FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) || (!enabled && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)))) return (EINVAL); /* Logging is incompatible with in-memory */ if (enabled) { WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval)); if (cval.val != 0) WT_RET_MSG(session, EINVAL, "In memory configuration incompatible with " "log=(enabled=true)"); } *runp = enabled; /* * Setup a log path and compression even if logging is disabled in case * we are going to print a log. Only do this on creation. Once a * compressor or log path are set they cannot be changed. * * See above: should never happen. */ if (!reconfig) { conn->log_compressor = NULL; WT_RET(__wt_config_gets_none( session, cfg, "log.compressor", &cval)); WT_RET(__wt_compressor_config( session, &cval, &conn->log_compressor)); WT_RET(__wt_config_gets(session, cfg, "log.path", &cval)); WT_RET(__wt_strndup( session, cval.str, cval.len, &conn->log_path)); } /* We are done if logging isn't enabled. */ if (!*runp) return (0); WT_RET(__wt_config_gets(session, cfg, "log.archive", &cval)); if (cval.val != 0) FLD_SET(conn->log_flags, WT_CONN_LOG_ARCHIVE); /* * The file size cannot be reconfigured. The amount of memory allocated * to the log slots may be based on the log file size at creation and we * don't want to re-allocate that memory while running. * * See above: should never happen. */ if (!reconfig) { WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval)); conn->log_file_max = (wt_off_t)cval.val; WT_STAT_FAST_CONN_SET(session, log_max_filesize, conn->log_file_max); } /* * If pre-allocation is configured, set the initial number to a few. * We'll adapt as load dictates. */ WT_RET(__wt_config_gets(session, cfg, "log.prealloc", &cval)); if (cval.val != 0) conn->log_prealloc = 1; /* * Note it's meaningless to reconfigure this value during runtime, it * only matters on create before recovery runs. * * See above: should never happen. */ if (!reconfig) { WT_RET(__wt_config_gets_def( session, cfg, "log.recover", 0, &cval)); if (WT_STRING_MATCH("error", cval.str, cval.len)) FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR); } WT_RET(__wt_config_gets(session, cfg, "log.zero_fill", &cval)); if (cval.val != 0) { if (F_ISSET(conn, WT_CONN_READONLY)) WT_RET_MSG(session, EINVAL, "Read-only configuration incompatible with " "zero-filling log files"); FLD_SET(conn->log_flags, WT_CONN_LOG_ZERO_FILL); } WT_RET(__logmgr_sync_cfg(session, cfg)); if (conn->log_cond != NULL) WT_RET(__wt_cond_auto_signal(session, conn->log_cond)); return (0); }
/* * __thread_group_resize -- * Resize an array of utility threads already holding the lock. */ static int __thread_group_resize( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, uint32_t new_min, uint32_t new_max, uint32_t flags) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_THREAD *thread; size_t alloc; uint32_t i, session_flags; conn = S2C(session); session_flags = 0; WT_ASSERT(session, group->current_threads <= group->alloc && __wt_rwlock_islocked(session, group->lock)); if (new_min == group->min && new_max == group->max) return (0); /* * Coll shrink to reduce the number of thread structures and running * threads if required by the change in group size. */ WT_RET(__thread_group_shrink(session, group, new_max)); /* * Only reallocate the thread array if it is the largest ever, since * our realloc doesn't support shrinking the allocated size. */ if (group->alloc < new_max) { alloc = group->alloc * sizeof(*group->threads); WT_RET(__wt_realloc(session, &alloc, new_max * sizeof(*group->threads), &group->threads)); group->alloc = new_max; } /* * Initialize the structures based on the previous group size, not * the previous allocated size. */ for (i = group->max; i < new_max; i++) { WT_ERR(__wt_calloc_one(session, &thread)); /* * Threads get their own session and lookaside table cursor * if the lookaside table is open. Note that threads are * started during recovery, before the lookaside table is * created. */ if (LF_ISSET(WT_THREAD_CAN_WAIT)) session_flags = WT_SESSION_CAN_WAIT; if (F_ISSET(conn, WT_CONN_LAS_OPEN)) FLD_SET(session_flags, WT_SESSION_LOOKASIDE_CURSOR); WT_ERR(__wt_open_internal_session(conn, group->name, false, session_flags, &thread->session)); if (LF_ISSET(WT_THREAD_PANIC_FAIL)) F_SET(thread, WT_THREAD_PANIC_FAIL); thread->id = i; thread->run_func = group->run_func; WT_ASSERT(session, group->threads[i] == NULL); group->threads[i] = thread; } if (group->current_threads < new_min) WT_ERR(__thread_group_grow(session, group, new_min)); err: /* * Update the thread group information even on failure to improve our * chances of cleaning up properly. */ group->max = new_max; group->min = new_min; /* * An error resizing a thread array is fatal, it should only happen * in an out of memory situation. */ if (ret != 0) { WT_TRET(__wt_thread_group_destroy(session, group)); WT_PANIC_RET(session, ret, "Error while resizing thread group"); } return (ret); }
/* * __wt_lsm_tree_create -- * Create an LSM tree structure for the given name. */ int __wt_lsm_tree_create(WT_SESSION_IMPL *session, const char *uri, int exclusive, const char *config) { WT_CONFIG_ITEM cval; WT_DECL_ITEM(buf); WT_DECL_RET; WT_LSM_TREE *lsm_tree; const char *cfg[] = { WT_CONFIG_BASE(session, session_create), config, NULL }; const char *tmpconfig; /* If the tree is open, it already exists. */ if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) { __wt_lsm_tree_release(session, lsm_tree); return (exclusive ? EEXIST : 0); } WT_RET_NOTFOUND_OK(ret); /* * If the tree has metadata, it already exists. * * !!! * Use a local variable: we don't care what the existing configuration * is, but we don't want to overwrite the real config. */ if (__wt_metadata_search(session, uri, &tmpconfig) == 0) { __wt_free(session, tmpconfig); return (exclusive ? EEXIST : 0); } WT_RET_NOTFOUND_OK(ret); WT_RET(__wt_config_gets(session, cfg, "key_format", &cval)); if (WT_STRING_MATCH("r", cval.str, cval.len)) WT_RET_MSG(session, EINVAL, "LSM trees cannot be configured as column stores"); WT_RET(__wt_calloc_def(session, 1, &lsm_tree)); WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->key_format)); WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->value_format)); WT_ERR(__wt_config_gets(session, cfg, "collator", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->collator_name)); WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval)); if (cval.val) F_SET(lsm_tree, WT_LSM_TREE_THROTTLE); else F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval)); FLD_SET(lsm_tree->bloom, (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED)); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval)); if (cval.val != 0) FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST); if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST)) WT_ERR_MSG(session, EINVAL, "Bloom filters can only be created on newest and oldest " "chunks if bloom filters are enabled"); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval)); if (cval.type == WT_CONFIG_ITEM_STRUCT) { cval.str++; cval.len -= 2; } WT_ERR(__wt_strndup(session, cval.str, cval.len, &lsm_tree->bloom_config)); WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval)); lsm_tree->bloom_bit_count = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval)); lsm_tree->bloom_hash_count = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval)); lsm_tree->chunk_max = (uint64_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval)); lsm_tree->chunk_size = (uint64_t)cval.val; if (lsm_tree->chunk_size > lsm_tree->chunk_max) WT_ERR_MSG(session, EINVAL, "Chunk size (chunk_size) must be smaller than or equal to " "the maximum chunk size (chunk_max)"); WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval)); lsm_tree->merge_max = (uint32_t)cval.val; lsm_tree->merge_min = lsm_tree->merge_max / 2; WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_threads", &cval)); lsm_tree->merge_threads = (uint32_t)cval.val; /* Sanity check that api_data.py is in sync with lsm.h */ WT_ASSERT(session, lsm_tree->merge_threads <= WT_LSM_MAX_WORKERS); /* * Set up the config for each chunk. If possible, avoid high latencies * from fsync by flushing the cache every 8MB (will be overridden by * any application setting). */ tmpconfig = ""; #ifdef HAVE_SYNC_FILE_RANGE if (!S2C(session)->direct_io) tmpconfig = "os_cache_dirty_max=8MB,"; #endif WT_ERR(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_buf_fmt(session, buf, "%s%s,key_format=u,value_format=u", tmpconfig, config)); lsm_tree->file_config = __wt_buf_steal(session, buf, NULL); /* Create the first chunk and flush the metadata. */ WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); /* Discard our partially populated handle. */ ret = __lsm_tree_discard(session, lsm_tree); lsm_tree = NULL; /* * Open our new tree and add it to the handle cache. Don't discard on * error: the returned handle is NULL on error, and the metadata * tracking macros handle cleaning up on failure. */ if (ret == 0) ret = __lsm_tree_open(session, uri, &lsm_tree); if (ret == 0) __wt_lsm_tree_release(session, lsm_tree); if (0) { err: WT_TRET(__lsm_tree_discard(session, lsm_tree)); } __wt_scr_free(&buf); return (ret); }
/* * __wt_txn_recover -- * Run recovery. */ int __wt_txn_recover(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_CURSOR *metac; WT_DECL_RET; WT_RECOVERY r; struct WT_RECOVERY_FILE *metafile; char *config; int needs_rec, was_backup; conn = S2C(session); WT_CLEAR(r); WT_INIT_LSN(&r.ckpt_lsn); was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0; /* We need a real session for recovery. */ WT_RET(__wt_open_session(conn, NULL, NULL, 1, &session)); F_SET(session, WT_SESSION_NO_LOGGING); r.session = session; WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); WT_ERR(__wt_metadata_cursor(session, NULL, &metac)); metafile = &r.files[WT_METAFILE_ID]; metafile->c = metac; /* * If no log was found (including if logging is disabled), or if the * last checkpoint was done with logging disabled, recovery should not * run. Scan the metadata to figure out the largest file ID. */ if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) || WT_IS_MAX_LSN(&metafile->ckpt_lsn)) { WT_ERR(__recovery_file_scan(&r)); conn->next_file_id = r.max_fileid; goto done; } /* * First, do a pass through the log to recover the metadata, and * establish the last checkpoint LSN. Skip this when opening a hot * backup: we already have the correct metadata in that case. */ if (!was_backup) { r.metadata_only = 1; if (WT_IS_INIT_LSN(&metafile->ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r)); else { /* * Start at the last checkpoint LSN referenced in the * metadata. If we see the end of a checkpoint while * scanning, we will change the full scan to start from * there. */ r.ckpt_lsn = metafile->ckpt_lsn; ret = __wt_log_scan(session, &metafile->ckpt_lsn, 0, __txn_log_recover, &r); if (ret == ENOENT) ret = 0; WT_ERR(ret); } } /* Scan the metadata to find the live files and their IDs. */ WT_ERR(__recovery_file_scan(&r)); /* * We no longer need the metadata cursor: close it to avoid pinning any * resources that could block eviction during recovery. */ r.files[0].c = NULL; WT_ERR(metac->close(metac)); /* * Now, recover all the files apart from the metadata. * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. */ r.metadata_only = 0; WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY, "Main recovery loop: starting at %u/%" PRIuMAX, r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset)); WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); /* * Check if the database was shut down cleanly. If not * return an error if the user does not want automatic * recovery. */ if (needs_rec && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR)) WT_ERR(WT_RUN_RECOVERY); /* * Always run recovery even if it was a clean shutdown. * We can consider skipping it in the future. */ if (WT_IS_INIT_LSN(&r.ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); else { ret = __wt_log_scan(session, &r.ckpt_lsn, WT_LOGSCAN_RECOVER, __txn_log_recover, &r); if (ret == ENOENT) ret = 0; WT_ERR(ret); } conn->next_file_id = r.max_fileid; /* * If recovery ran successfully forcibly log a checkpoint so the next * open is fast and keep the metadata up to date with the checkpoint * LSN and archiving. */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); done: FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE); err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); WT_TRET(session->iface.close(&session->iface, NULL)); return (ret); }