/*
 * __wt_curstat_cache_walk --
 *	Fill in the cache-walk statistics for the session's current btree.
 *
 *	Statistics that need no cache walk (the connection's current eviction
 *	pass generation, the root page's entry count and memory footprint) are
 *	set directly; __evict_stat_walk is then run with the handle list lock
 *	held.
 */
void
__wt_curstat_cache_walk(WT_SESSION_IMPL *session)
{
    WT_BTREE *tree;
    WT_CONNECTION_IMPL *connection;
    WT_PAGE_INDEX *pindex;

    connection = S2C(session);
    tree = S2BT(session);

    /* Values available without walking the cache. */
    WT_STAT_DATA_SET(session,
        cache_state_gen_current, connection->cache->evict_pass_gen);

    /* Root page: number of index entries and in-memory footprint. */
    pindex = WT_INTL_INDEX_GET_SAFE(tree->root.page);
    WT_STAT_DATA_SET(session,
        cache_state_root_size, tree->root.page->memory_footprint);
    WT_STAT_DATA_SET(session, cache_state_root_entries, pindex->entries);

    /* The remaining statistics come from the walk itself. */
    WT_WITH_HANDLE_LIST_LOCK(session, __evict_stat_walk(session));
}
/*
 * __backup_log_append --
 *	Append the log files needed for a backup to the backup cursor's list.
 *
 *	session: calling session.
 *	cb: backup cursor; its maxid field is filled in by
 *	    __wt_log_get_all_files.
 *	active: passed through unchanged to __wt_log_get_all_files.
 *
 *	Returns 0 on success (including when the connection has no log), or
 *	the first error from fetching or appending the file names.
 */
static int
__backup_log_append(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, int active)
{
    WT_DECL_RET;
    u_int count, idx;
    char **names;

    names = NULL;
    count = 0;
    ret = 0;

    /* Nothing to append when the connection has no log. */
    if (!S2C(session)->log)
        return (0);

    WT_ERR(__wt_log_get_all_files(
        session, &names, &count, &cb->maxid, active));
    for (idx = 0; idx < count; ++idx)
        WT_ERR(__backup_list_append(session, cb, names[idx]));

err:
    /* The name list is released on both the success and failure paths. */
    if (names != NULL)
        __wt_log_files_free(session, names, count);
    return (ret);
}
/*
 * __wt_txn_init --
 *	Initialize a session's transaction data.
 *
 *	Resets the transaction ID, allocates the snapshot array (sized by the
 *	connection's session_size), clears the modification lists and sets
 *	both the transaction and session isolation to read-committed.
 *
 *	Returns 0 on success or the allocation error.
 */
int
__wt_txn_init(WT_SESSION_IMPL *session)
{
    session->txn.id = WT_TXN_NONE;

    WT_RET(__wt_calloc_def(
        session, S2C(session)->session_size, &session->txn.snapshot));

    /*
     * Take care to clean these out in case we are reusing the transaction
     * for eviction.
     */
    session->txn.mod = NULL;
    session->txn.modref = NULL;

    /* The default isolation level is read-committed. */
    session->isolation = TXN_ISO_READ_COMMITTED;
    session->txn.isolation = session->isolation;

    return (0);
}
/*
 * __wt_cache_config --
 *	Read the cache configuration and apply it to the connection's cache.
 *
 *	session: session whose connection is being configured.
 *	reconfigure: non-zero when this is a reconfiguration call.
 *	cfg: configuration stack to read.
 *
 *	Returns 0 on success, or the first non-zero value returned by a
 *	helper.
 */
int
__wt_cache_config(WT_SESSION_IMPL *session, int reconfigure, const char *cfg[])
{
    WT_CONFIG_ITEM name;
    WT_CONNECTION_IMPL *conn;
    int shared_after, shared_before;

    conn = S2C(session);
    WT_ASSERT(session, conn->cache != NULL);

    /* The cache is shared when a shared_cache.name value is present. */
    WT_RET(__wt_config_gets_none(session, cfg, "shared_cache.name", &name));
    shared_after = name.len != 0;
    shared_before = F_ISSET(conn, WT_CONN_CACHE_POOL);

    /* Cleanup if reconfiguring. */
    if (reconfigure) {
        if (shared_before && !shared_after) {
            /*
             * The connection cache was managed by the cache pool
             * and is now configured as stand-alone: remove
             * ourselves from the pool.
             */
            WT_RET(__wt_conn_cache_pool_destroy(session));
        } else if (!shared_before && shared_after) {
            /*
             * Cache size will now be managed by the cache pool -
             * the start size always needs to be zero to allow the
             * pool to manage how much memory is in-use.
             */
            conn->cache_size = 0;
        }
    }

    /* Configure the connection's own cache. */
    WT_RET(__cache_config_local(session, shared_after, cfg));
    if (!shared_after)
        return (0);

    /* Update the cache pool configuration. */
    WT_RET(__wt_cache_pool_config(session, cfg));
    WT_ASSERT(session, F_ISSET(conn, WT_CONN_CACHE_POOL));

    /* A newly shared connection cache is added to the pool. */
    if (!shared_before)
        WT_RET(__wt_conn_cache_pool_open(session));

    return (0);
}
/*
 * __wt_spin_lock_register_caller --
 *	Register a spin-lock caller's location information in the blocking
 *	matrix.
 *
 *	session: calling session.
 *	name: lock name, stored by reference.
 *	file: source path of the caller; only the part after the last '/' is
 *	    recorded (the whole string when there is no '/').
 *	line: source line of the caller.
 *	idp: location ID; allocated by __spin_lock_next_id if needed.
 *
 *	Returns 0 on success or the error from allocating the ID.
 */
int
__wt_spin_lock_register_caller(WT_SESSION_IMPL *session,
    const char *name, const char *file, int line, int *idp)
{
    WT_CONNECTION_STATS_SPINLOCK *entry;
    const char *slash;

    /*
     * The caller's location ID is a static offset into a per-connection
     * structure, and that has problems: first, if there are multiple
     * connections, we'll need to hold some kind of lock to avoid racing
     * when setting that value, and second, if/when there are multiple
     * connections and/or a single connection is closed and re-opened, the
     * variable may be initialized and underlying connection information
     * may not.
     *
     * First, allocate a location ID if needed.
     */
    WT_RET(__spin_lock_next_id(session, idp));

    /*
     * Add the caller's information to the blocking matrix.  We could race
     * here (if two threads of control register the same lock at the same
     * time), but we don't care as both threads are setting the identical
     * information.
     */
    entry = &S2C(session)->spinlock_block[*idp];
    entry->name = name;

    /* Record the file's base name only. */
    slash = strrchr(file, '/');
    entry->file = slash == NULL ? file : slash + 1;

    entry->line = line;
    return (0);
}
/*
 * __wt_las_create --
 *	Initialize the database's lookaside store.
 *
 *	Opens the connection's internal lookaside session, force-drops any
 *	existing lookaside file, re-creates it and opens the shared lookaside
 *	cursor.
 *
 *	Returns 0 on success or the first error encountered.
 */
int
__wt_las_create(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_SESSION_IMPL *las_session;
    const char *drop_cfg[] = {
        WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL };

    conn = S2C(session);

    /*
     * Done at startup: we cannot do it on demand because we require the
     * schema lock to create and drop the file, and it may not always be
     * available.
     *
     * Open an internal session, used for the shared lookaside cursor.
     */
    WT_RET(__wt_open_internal_session(
        conn, "lookaside table", 1, 1, &conn->las_session));
    las_session = conn->las_session;

    /*
     * Sessions associated with a lookaside cursor should never be tapped
     * for eviction.
     */
    F_SET(las_session,
        WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION);

    /* Discard any previous incarnation of the file, then re-create it. */
    WT_RET(__wt_session_drop(las_session, WT_LAS_URI, drop_cfg));
    WT_RET(__wt_session_create(las_session, WT_LAS_URI, WT_LAS_FORMAT));

    /* Open the shared cursor. */
    WT_WITHOUT_DHANDLE(las_session,
        ret = __las_cursor_create(las_session, &conn->las_cursor));

    return (ret);
}
/*
 * __wt_optrack_record_funcid --
 *	Allocate and record optrack function ID.
 *
 *	session: calling session.
 *	func: function name to record.
 *	func_idp: in/out ID; when it is 0 on entry (checked with the map
 *	    spinlock held) a new process-wide ID is assigned and an
 *	    "<id> <name>\n" line is appended to the connection's optrack map
 *	    file.
 *
 *	Any failure is reported through WT_PANIC_MSG; nothing is returned.
 */
void
__wt_optrack_record_funcid(
    WT_SESSION_IMPL *session, const char *func, uint16_t *func_idp)
{
    static uint16_t optrack_uid = 0; /* Unique for the process lifetime. */
    WT_CONNECTION_IMPL *conn;
    WT_DECL_ITEM(line);
    WT_DECL_RET;
    wt_off_t map_size;
    bool have_lock;

    conn = S2C(session);
    have_lock = false;

    /* Room for the name plus the formatted ID, separator and newline. */
    WT_ERR(__wt_scr_alloc(session, strlen(func) + 32, &line));

    __wt_spin_lock(session, &conn->optrack_map_spinlock);
    have_lock = true;

    /* Only functions without an ID yet are assigned and recorded. */
    if (*func_idp == 0) {
        *func_idp = ++optrack_uid;

        WT_ERR(__wt_buf_fmt(
            session, line, "%" PRIu16 " %s\n", *func_idp, func));

        /* Append: write at the current end of the map file. */
        WT_ERR(__wt_filesize(session, conn->optrack_map_fh, &map_size));
        WT_ERR(__wt_write(session,
            conn->optrack_map_fh, map_size, line->size, line->data));
    }
    goto cleanup;

err:
    WT_PANIC_MSG(session, ret, "operation tracking initialization failure");

cleanup:
    if (have_lock)
        __wt_spin_unlock(session, &conn->optrack_map_spinlock);
    __wt_scr_free(session, &line);
}
/*
 * __wt_checkpoint_server_destroy --
 *	Destroy the checkpoint server thread.
 *
 *	Clears the checkpoint-server flag, signals and joins the thread if one
 *	was started, destroys its condition variable, frees its configuration
 *	string and closes its session.  Every step is attempted even after an
 *	earlier failure.
 *
 *	Returns the accumulated WT_TRET result (0 when every step succeeded).
 */
int
__wt_checkpoint_server_destroy(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_SESSION *server_iface;

    conn = S2C(session);

    /* Clear the run flag, then wake the thread and wait for it. */
    F_CLR(conn, WT_CONN_SERVER_CHECKPOINT);
    if (conn->ckpt_tid_set) {
        WT_TRET(__wt_cond_signal(session, conn->ckpt_cond));
        WT_TRET(__wt_thread_join(session, conn->ckpt_tid));
        conn->ckpt_tid_set = 0;
    }
    WT_TRET(__wt_cond_destroy(session, &conn->ckpt_cond));

    __wt_free(session, conn->ckpt_config);

    /* Close the server thread's session. */
    if (conn->ckpt_session != NULL) {
        server_iface = &conn->ckpt_session->iface;
        WT_TRET(server_iface->close(server_iface, NULL));
    }

    /*
     * Ensure checkpoint settings are cleared - so that reconfigure doesn't
     * get confused.
     */
    conn->ckpt_usecs = 0;
    conn->ckpt_config = NULL;
    conn->ckpt_cond = NULL;
    conn->ckpt_tid_set = 0;
    conn->ckpt_session = NULL;

    return (ret);
}
/*
 * __wt_session_fotxn_add --
 *	Add a new entry into the session's free-on-transaction generation
 *	list.
 *
 *	session: session owning the list.
 *	p: memory to stash.
 *	len: size recorded alongside p.
 *
 *	Returns 0 on success or the error from growing the list.
 */
int
__wt_session_fotxn_add(WT_SESSION_IMPL *session, void *p, size_t len)
{
    WT_FOTXN *slot;
    size_t idx, nslots;

    /*
     * Make sure the current thread has a transaction pinned so that
     * we don't immediately free the memory we are stashing.
     */
    WT_ASSERT(session,
        WT_SESSION_TXN_STATE(session)->snap_min != WT_TXN_NONE);

    /* Grow the list as necessary. */
    WT_RET(__wt_realloc_def(session,
        &session->fotxn_size, session->fotxn_cnt + 1, &session->fotxn));

    /* Find an empty slot: fotxn_size is in bytes, convert to entries. */
    nslots = session->fotxn_size / sizeof(session->fotxn[0]);
    for (idx = 0; idx < nslots; ++idx) {
        slot = &session->fotxn[idx];
        if (slot->p != NULL)
            continue;

        /* Tag the entry with the ID after the current global ID. */
        slot->txnid = S2C(session)->txn_global.current + 1;
        WT_ASSERT(session,
            !__wt_txn_visible_all(session, slot->txnid));
        slot->p = p;
        slot->len = len;
        break;
    }

    ++session->fotxn_cnt;

    /* See if we can free any previous entries. */
    if (session->fotxn_cnt > 1)
        __wt_session_fotxn_discard(session, session, 0);

    return (0);
}
/*
 * __wt_block_close --
 *	Close a block handle.
 *
 *	Drops one reference under the connection's block lock and destroys the
 *	handle when the count reaches (or already is) zero.  A NULL handle is
 *	accepted and ignored.
 *
 *	Returns the accumulated result of the verbose message and, when it
 *	runs, the destroy call.
 */
int
__wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    const char *label;

    /* Safety check. */
    if (block == NULL)
        return (0);

    conn = S2C(session);

    label = block->name == NULL ? "" : block->name;
    WT_TRET(__wt_verbose(session, WT_VERB_BLOCK, "close: %s", label));

    __wt_spin_lock(session, &conn->block_lock);

    /* Reference count is initialized to 1. */
    if (block->ref != 0)
        --block->ref;
    if (block->ref == 0)
        WT_TRET(__block_destroy(session, block));

    __wt_spin_unlock(session, &conn->block_lock);

    return (ret);
}
/*
 * __ckpt_server_config --
 *	Parse and setup the checkpoint server options.
 *
 *	session: calling session.
 *	cfg: configuration stack to read.
 *	startp: set to 1 when a non-zero checkpoint.wait is configured, else 0.
 *
 *	When the server is to run, checkpoint.wait is stored in the
 *	connection's ckpt_usecs scaled by 1000000, and a checkpoint.name that
 *	differs from WT_CHECKPOINT is stored as a "name=<value>" string in the
 *	connection's ckpt_config.
 *
 *	Returns 0 on success or the first error encountered.
 */
static int
__ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, int *startp)
{
    WT_CONFIG_ITEM item;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    char *buf;

    conn = S2C(session);

    /*
     * The checkpoint configuration requires a wait time -- if it's not set,
     * we're not running at all.
     */
    WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &item));
    if (item.val == 0) {
        *startp = 0;
        return (0);
    }
    conn->ckpt_usecs = (long)item.val * 1000000;
    *startp = 1;

    /* The default checkpoint name needs no explicit configuration. */
    WT_RET(__wt_config_gets(session, cfg, "checkpoint.name", &item));
    if (WT_STRING_MATCH(WT_CHECKPOINT, item.str, item.len))
        return (0);

    /* Build "name=<value>" in scratch space, then keep a private copy. */
    WT_RET(__wt_scr_alloc(session, item.len + 20, &tmp));
    buf = (char *)tmp->data;
    strcpy(buf, "name=");
    strncat(buf, item.str, item.len);
    ret = __wt_strndup(session,
        tmp->data, strlen("name=") + item.len, &conn->ckpt_config);

    /*
     * NOTE(review): other functions in this file call __wt_scr_free with
     * the session as a first argument; confirm which signature applies.
     */
    __wt_scr_free(&tmp);

    return (ret);
}
/*
 * __wt_log_slot_close --
 *	Close a slot and do not allow any other threads to join this slot.
 *	Remove this from the active slot array and move a new slot from
 *	the pool into its place.  Set up the size of this group.
 *	Must be called with the logging spinlock held.
 *
 *	Returns 0 on success or the error from finding a free slot.
 */
int
__wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
    WT_LOG *log;
    WT_LOGSLOT *replacement;
    int64_t prior_state;

    log = S2C(session)->log;

    /* Find an unused slot in the pool. */
    WT_RET(__log_slot_find_free(session, &replacement));

    /*
     * Swap out the slot we're going to use and put a free one in the
     * slot array in its place so that threads can use it right away.
     */
    WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
    replacement->slot_state = WT_LOG_SLOT_READY;
    replacement->slot_index = slot->slot_index;
    log->slot_array[replacement->slot_index] = replacement;

    /* Mark the slot pending; the prior state gives the group size. */
    prior_state = WT_ATOMIC_STORE8(slot->slot_state, WT_LOG_SLOT_PENDING);
    slot->slot_group_size = (uint64_t)(prior_state - WT_LOG_SLOT_READY);

    /*
     * Note that this statistic may be much bigger than in reality,
     * especially when compared with the total bytes written in
     * __log_fill.  The reason is that this size reflects any
     * rounding up that is needed and the total bytes in __log_fill
     * is the amount of user bytes.
     */
    WT_STAT_FAST_CONN_INCRV(session,
        log_slot_consolidated, (uint64_t)slot->slot_group_size);

    return (0);
}
/*
 * __wt_free_int --
 *	ANSI free function.
 *
 *	session: calling session; may be NULL.
 *	p_arg: address of the pointer to free.  The referenced pointer is set
 *	    to NULL before the memory is released; a NULL pointer is a no-op
 *	    apart from the statistic.
 */
void
__wt_free_int(WT_SESSION_IMPL *session, void *p_arg)
{
    void **addr, *mem;

    addr = p_arg;

    /*
     * !!!
     * This function MUST handle a NULL WT_SESSION_IMPL handle.
     */
    if (session != NULL && S2C(session)->stats != NULL)
        WT_CSTAT_INCR(session, memfree);

    /*
     * If there's a serialization bug we might race with another thread.
     * We can't avoid the race (and we aren't willing to flush memory),
     * but we minimize the window by clearing the free address right away,
     * hoping a racing thread will see, and won't free, a NULL pointer.
     */
    mem = *addr;
    *addr = NULL;

    /* ANSI C free semantics. */
    if (mem == NULL)
        return;
    free(mem);
}
/*
 * __metadata_load_hot_backup --
 *	Load the contents of any hot backup file.
 *
 *	If WT_METADATA_BACKUP exists it is read as pairs of lines (key, then
 *	value), each pair is applied with __wt_metadata_update, and the
 *	connection is flagged WT_CONN_WAS_BACKUP.  An empty key line ends the
 *	file; an empty value line is reported as an illegal value.
 *
 *	Returns 0 when there is no backup file or it loads cleanly, otherwise
 *	the first error (combined with any error closing the stream).
 */
static int
__metadata_load_hot_backup(WT_SESSION_IMPL *session)
{
    WT_DECL_ITEM(k);
    WT_DECL_ITEM(v);
    WT_DECL_RET;
    WT_FSTREAM *stream;
    bool found;

    /* Look for a hot backup file: if we find it, load it. */
    WT_RET(__wt_fs_exist(session, WT_METADATA_BACKUP, &found));
    if (!found)
        return (0);
    WT_RET(__wt_fopen(session,
        WT_METADATA_BACKUP, 0, WT_STREAM_READ, &stream));

    /* Read line pairs and load them into the metadata file. */
    WT_ERR(__wt_scr_alloc(session, 512, &k));
    WT_ERR(__wt_scr_alloc(session, 512, &v));

    WT_ERR(__wt_getline(session, stream, k));
    while (k->size != 0) {
        WT_ERR(__wt_getline(session, stream, v));
        if (v->size == 0)
            WT_ERR(__wt_illegal_value(session, WT_METADATA_BACKUP));
        WT_ERR(__wt_metadata_update(session, k->data, v->data));

        WT_ERR(__wt_getline(session, stream, k));
    }

    F_SET(S2C(session), WT_CONN_WAS_BACKUP);

err:
    WT_TRET(__wt_fclose(session, &stream));
    __wt_scr_free(session, &k);
    __wt_scr_free(session, &v);
    return (ret);
}
/*
 * __log_slot_dump --
 *	Dump the entire slot state.
 *
 *	Prints, for every slot in the log's slot pool, its state, flags, LSNs,
 *	offsets, unbuffered count and error value, followed by the index of
 *	the slot with the smallest release LSN.
 */
static void
__log_slot_dump(WT_SESSION_IMPL *session)
{
    WT_LOG *log;
    WT_LOGSLOT *cur;
    int idx, oldest;

    log = S2C(session)->log;
    oldest = 0;

    idx = 0;
    while (idx < WT_SLOT_POOL) {
        cur = &log->slot_pool[idx];

        /* Track the slot with the smallest release LSN seen so far. */
        if (__wt_log_cmp(&cur->slot_release_lsn,
            &log->slot_pool[oldest].slot_release_lsn) < 0)
            oldest = idx;

        __wt_errx(session, "Slot %d:", idx);
        __wt_errx(session,
            " State: %" PRIx64 " Flags: %" PRIx32,
            (uint64_t)cur->slot_state, cur->flags);
        __wt_errx(session,
            " Start LSN: %" PRIu32 "/%" PRIu32,
            cur->slot_start_lsn.l.file, cur->slot_start_lsn.l.offset);
        __wt_errx(session,
            " End LSN: %" PRIu32 "/%" PRIu32,
            cur->slot_end_lsn.l.file, cur->slot_end_lsn.l.offset);
        __wt_errx(session,
            " Release LSN: %" PRIu32 "/%" PRIu32,
            cur->slot_release_lsn.l.file,
            cur->slot_release_lsn.l.offset);
        __wt_errx(session,
            " Offset: start: %" PRIuMAX " last:%" PRIuMAX,
            (uintmax_t)cur->slot_start_offset,
            (uintmax_t)cur->slot_last_offset);
        __wt_errx(session,
            " Unbuffered: %" PRId64 " error: %" PRId32,
            cur->slot_unbuffered, cur->slot_error);

        ++idx;
    }

    __wt_errx(session, "Earliest slot: %d", oldest);
}
/*
 * __wt_las_cursor --
 *	Return a lookaside cursor.
 *
 *	session: calling session; caching and eviction are turned off on it
 *	    before returning.
 *	cursorp: receives the session's own lookaside cursor when the session
 *	    has one, otherwise the shared cursor, in which case the
 *	    connection's lookaside lock has been acquired.
 *	session_flags: receives the session's prior no-cache/no-eviction flag
 *	    bits.
 */
void
__wt_las_cursor(
    WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags)
{
    *cursorp = NULL;

    /*
     * We don't want to get tapped for eviction after we start using the
     * lookaside cursor; save a copy of the current eviction state, we'll
     * turn eviction off before we return.
     *
     * Don't cache lookaside table pages, we're here because of eviction
     * problems and there's no reason to believe lookaside pages will be
     * useful more than once.
     */
    *session_flags =
        F_MASK(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);

    /*
     * Some threads have their own lookaside table cursors, else lock the
     * shared lookaside cursor.
     */
    if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) {
        __wt_spin_lock(session, &S2C(session)->las_lock);
        *cursorp = S2C(session)->las_session->las_cursor;
    } else
        *cursorp = session->las_cursor;

    /* Turn caching and eviction off. */
    F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
}
/*
 * __wt_schema_rename --
 *	Rename an object, dispatching on the URI's type prefix.
 *
 *	session: calling session.
 *	uri: existing object URI (for example "file:", "lsm:" or "table:").
 *	newuri: new URI; it must carry the same type prefix as uri.
 *	cfg: configuration stack passed to the type-specific rename.
 *
 *	Returns 0 on success; EINVAL when the two URIs do not share a type
 *	prefix terminated by ':'; ENOENT in place of WT_NOTFOUND; otherwise the
 *	error from the underlying rename or from turning metadata tracking
 *	on or off.
 */
int
__wt_schema_rename(WT_SESSION_IMPL *session,
    const char *uri, const char *newuri, const char *cfg[])
{
    WT_DATA_SOURCE *dsrc;
    WT_DECL_RET;
    const char *p, *t;

    /*
     * The target type must match the source type: walk the common prefix
     * (such as "file:" or "table:") up to the ':'.
     *
     * Stop at the end of the string as well: without that check two
     * identical URIs lacking a ':' would be compared past their
     * terminating NUL bytes.
     */
    for (p = uri, t = newuri;
        *p != '\0' && *p == *t && *p != ':'; ++p, ++t)
        ;
    if (*p != ':' || *t != ':')
        WT_RET_MSG(session, EINVAL,
            "rename target type must match URI: %s to %s",
            uri, newuri);

    /*
     * We track rename operations, if we fail in the middle, we want to
     * back it all out.
     */
    WT_RET(__wt_meta_track_on(session));

    if (WT_PREFIX_MATCH(uri, "file:"))
        ret = __rename_file(session, uri, newuri);
    else if (WT_PREFIX_MATCH(uri, "lsm:"))
        ret = __wt_lsm_tree_rename(session, uri, newuri, cfg);
    else if (WT_PREFIX_MATCH(uri, "table:"))
        ret = __rename_table(session, uri, newuri, cfg);
    else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
        /* A data source without a rename method is unsupported. */
        ret = dsrc->rename == NULL ?
            __wt_object_unsupported(session, uri) :
            dsrc->rename(dsrc,
            &session->iface, uri, newuri, (WT_CONFIG_ARG *)cfg);
    else
        ret = __wt_bad_object_type(session, uri);

    /* Bump the schema generation so that stale data is ignored. */
    ++S2C(session)->schema_gen;

    /* Stop tracking; the last argument is non-zero when the rename failed. */
    WT_TRET(__wt_meta_track_off(session, 1, ret != 0));

    /* Callers see a missing object as ENOENT rather than WT_NOTFOUND. */
    return (ret == WT_NOTFOUND ? ENOENT : ret);
}
/*
 * __wt_spin_lock_unregister_lock --
 *	Remove a lock from the connection's list.
 *
 *	Every entry of the connection's spinlock list that points at t is
 *	cleared, followed by a full barrier.
 */
void
__wt_spin_lock_unregister_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
{
    WT_CONNECTION_IMPL *conn;
    u_int slot;

    conn = S2C(session);

    slot = 0;
    while (slot < WT_SPINLOCK_MAX) {
        if (conn->spinlock_list[slot] == t)
            conn->spinlock_list[slot] = NULL;
        ++slot;
    }

    /*
     * XXX
     * The statistics thread reads through this array, there's a possible
     * race: if that thread reads the pointer then goes to sleep, then we
     * free the spinlock, then the statistics thread wakes up, it can read
     * free'd memory.
     *
     * This is performance debugging code, so we're not fixing the race for
     * now, minimize the window.
     */
    WT_FULL_BARRIER();
}
/*
 * __wt_cache_stats_update --
 *	Update the connection's cache statistics from the current cache state.
 *
 *	The leaf-byte statistic is derived as bytes in use minus internal and
 *	overflow bytes.  It was previously derived from the configured maximum
 *	cache size, which reported unused cache space as leaf data.
 */
void
__wt_cache_stats_update(WT_SESSION_IMPL *session)
{
    WT_CACHE *cache;
    WT_CONNECTION_IMPL *conn;
    WT_CONNECTION_STATS *stats;
    uint64_t inuse, nonleaf;

    conn = S2C(session);
    cache = conn->cache;
    stats = &conn->stats;

    /* Read the in-use total once so related statistics agree. */
    inuse = (uint64_t)__wt_cache_bytes_inuse(cache);

    WT_STAT_SET(stats, cache_bytes_max, conn->cache_size);
    WT_STAT_SET(stats, cache_bytes_inuse, inuse);
    WT_STAT_SET(stats, cache_overhead, cache->overhead_pct);
    WT_STAT_SET(stats, cache_pages_inuse, __wt_cache_pages_inuse(cache));
    WT_STAT_SET(stats, cache_bytes_dirty, __wt_cache_dirty_inuse(cache));
    WT_STAT_SET(stats,
        cache_eviction_maximum_page_size, cache->evict_max_page_size);
    WT_STAT_SET(stats, cache_pages_dirty, cache->pages_dirty);

    /*
     * Figure out internal, leaf and overflow stats.
     *
     * The counters are read without synchronization here, so guard the
     * subtraction against wrapping below zero.
     *
     * NOTE(review): if __wt_cache_bytes_inuse includes an allocation
     * overhead adjustment, the leaf value carries that adjustment too;
     * confirm against its definition.
     */
    nonleaf = (uint64_t)cache->bytes_internal +
        (uint64_t)cache->bytes_overflow;
    WT_STAT_SET(stats, cache_bytes_internal, cache->bytes_internal);
    WT_STAT_SET(stats,
        cache_bytes_leaf, inuse > nonleaf ? inuse - nonleaf : 0);
    WT_STAT_SET(stats, cache_bytes_overflow, cache->bytes_overflow);
}
/*
 * __logmgr_sync_cfg --
 *	Interpret the transaction_sync config.
 *
 *	transaction_sync.enabled sets or clears WT_LOG_FLUSH in the
 *	connection's txn_logsync field; transaction_sync.method selects
 *	WT_LOG_DSYNC ("dsync"), WT_LOG_FSYNC ("fsync") or neither.
 *
 *	Returns 0 on success or the error from reading the configuration.
 */
static int
__logmgr_sync_cfg(WT_SESSION_IMPL *session, const char **cfg)
{
    WT_CONFIG_ITEM item;
    WT_CONNECTION_IMPL *connection;

    connection = S2C(session);

    WT_RET(__wt_config_gets(
        session, cfg, "transaction_sync.enabled", &item));
    if (item.val == 0)
        FLD_CLR(connection->txn_logsync, WT_LOG_FLUSH);
    else
        FLD_SET(connection->txn_logsync, WT_LOG_FLUSH);

    WT_RET(__wt_config_gets(
        session, cfg, "transaction_sync.method", &item));

    /* Start from neither method, then set the configured one, if any. */
    FLD_CLR(connection->txn_logsync, WT_LOG_DSYNC | WT_LOG_FSYNC);
    if (WT_STRING_MATCH("dsync", item.str, item.len)) {
        FLD_SET(connection->txn_logsync, WT_LOG_DSYNC);
    } else if (WT_STRING_MATCH("fsync", item.str, item.len)) {
        FLD_SET(connection->txn_logsync, WT_LOG_FSYNC);
    }

    return (0);
}
/*
 * __wt_logmgr_destroy --
 *	Destroy the log archiving server thread and logging subsystem.
 *
 *	When logging is not enabled only the log path is freed.  Otherwise the
 *	log server, log file server and write-LSN server threads are signalled
 *	and joined, their sessions closed, the slots and log closed, and the
 *	condition variables, locks and log structures released.  Every step is
 *	attempted even after an earlier failure.
 *
 *	Returns the accumulated WT_TRET result (0 when every step succeeded).
 */
int
__wt_logmgr_destroy(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_SESSION *wt_session;

    conn = S2C(session);

    if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) {
        /*
         * We always set up the log_path so printlog can work without
         * recovery. Therefore, always free it, even if logging isn't
         * on.
         */
        __wt_free(session, conn->log_path);
        return (0);
    }

    /* Stop the log server thread, if it was started. */
    if (conn->log_tid_set) {
        WT_TRET(__wt_cond_signal(session, conn->log_cond));
        WT_TRET(__wt_thread_join(session, conn->log_tid));
        conn->log_tid_set = 0;
    }

    /* Stop the log file server thread and close its session. */
    if (conn->log_file_tid_set) {
        WT_TRET(__wt_cond_signal(session, conn->log_file_cond));
        WT_TRET(__wt_thread_join(session, conn->log_file_tid));
        conn->log_file_tid_set = 0;
    }
    if (conn->log_file_session != NULL) {
        wt_session = &conn->log_file_session->iface;
        WT_TRET(wt_session->close(wt_session, NULL));
        conn->log_file_session = NULL;
    }

    /* Stop the write-LSN server thread and close its session. */
    if (conn->log_wrlsn_tid_set) {
        WT_TRET(__wt_cond_signal(session, conn->log_wrlsn_cond));
        WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid));
        conn->log_wrlsn_tid_set = 0;
    }
    if (conn->log_wrlsn_session != NULL) {
        wt_session = &conn->log_wrlsn_session->iface;
        WT_TRET(wt_session->close(wt_session, NULL));
        conn->log_wrlsn_session = NULL;
    }

    /* With the threads joined, tear down the slots and close the log. */
    WT_TRET(__wt_log_slot_destroy(session));
    WT_TRET(__wt_log_close(session));

    /* Close the server thread's session. */
    if (conn->log_session != NULL) {
        wt_session = &conn->log_session->iface;
        WT_TRET(wt_session->close(wt_session, NULL));
        conn->log_session = NULL;
    }

    /* Destroy the condition variables now that all threads are stopped */
    WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
    WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
    WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));

    /* Release the synchronization objects owned by the log structure. */
    WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
    WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond));
    WT_TRET(__wt_rwlock_destroy(session, &conn->log->log_archive_lock));
    __wt_spin_destroy(session, &conn->log->log_lock);
    __wt_spin_destroy(session, &conn->log->log_slot_lock);
    __wt_spin_destroy(session, &conn->log->log_sync_lock);
    __wt_spin_destroy(session, &conn->log->log_writelsn_lock);

    /* Finally free the path and the log structure itself. */
    __wt_free(session, conn->log_path);
    __wt_free(session, conn->log);

    return (ret);
}
/*
 * __wt_logmgr_open --
 *	Start the log service threads.
 *
 *	When logging is enabled the log file (close) server and the write-LSN
 *	server are always started, each with its own internal session and
 *	condition variable.  The log server is started, or signalled if its
 *	session already exists, only when archiving or pre-allocation is
 *	configured.
 *
 *	Returns 0 on success.  On failure the function returns at once;
 *	anything created earlier in the call is not released here (presumably
 *	left for __wt_logmgr_destroy -- confirm against the caller).
 */
int
__wt_logmgr_open(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;

    conn = S2C(session);

    /* If no log thread services are configured, we're done. */
    if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
        return (0);

    /*
     * Start the log close thread.  It is not configurable.
     * If logging is enabled, this thread runs.
     */
    WT_RET(__wt_open_internal_session(
        conn, "log-close-server", 0, 0, &conn->log_file_session));
    WT_RET(__wt_cond_alloc(conn->log_file_session,
        "log close server", 0, &conn->log_file_cond));

    /*
     * Start the log file close thread.
     */
    WT_RET(__wt_thread_create(conn->log_file_session,
        &conn->log_file_tid, __log_file_server, conn->log_file_session));
    conn->log_file_tid_set = 1;

    /*
     * Start the log write LSN thread.  It is not configurable.
     * If logging is enabled, this thread runs.
     */
    WT_RET(__wt_open_internal_session(
        conn, "log-wrlsn-server", 0, 0, &conn->log_wrlsn_session));
    WT_RET(__wt_cond_alloc(conn->log_wrlsn_session,
        "log write lsn server", 0, &conn->log_wrlsn_cond));
    WT_RET(__wt_thread_create(conn->log_wrlsn_session,
        &conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session));
    conn->log_wrlsn_tid_set = 1;

    /* If no log thread services are configured, we're done. */
    if (!FLD_ISSET(conn->log_flags,
        (WT_CONN_LOG_ARCHIVE | WT_CONN_LOG_PREALLOC)))
        return (0);

    /*
     * If a log server thread exists, the user may have reconfigured
     * archiving or pre-allocation.  Signal the thread.  Otherwise the
     * user wants archiving and/or allocation and we need to start up
     * the thread.
     */
    if (conn->log_session != NULL) {
        WT_ASSERT(session, conn->log_cond != NULL);
        WT_ASSERT(session, conn->log_tid_set != 0);
        WT_RET(__wt_cond_signal(session, conn->log_cond));
    } else {
        /* The log server gets its own session. */
        WT_RET(__wt_open_internal_session(
            conn, "log-server", 0, 0, &conn->log_session));
        WT_RET(__wt_cond_alloc(conn->log_session,
            "log server", 0, &conn->log_cond));

        /*
         * Start the thread.
         */
        WT_RET(__wt_thread_create(conn->log_session,
            &conn->log_tid, __log_server, conn->log_session));
        conn->log_tid_set = 1;
    }

    return (0);
}
/*
 * __log_server --
 *	The log server thread.
 *
 *	arg: the thread's own WT_SESSION_IMPL.
 *
 *	Loops while WT_CONN_LOG_SERVER_RUN is set on the connection.  Each
 *	pass forces out buffered log writes; every WT_FORCE_PER_SECOND passes,
 *	or whenever the previous wait was signalled, it also runs
 *	pre-allocation and archiving when those are configured.  Any error
 *	other than a tolerated busy return ends the loop and is reported
 *	through __wt_err.
 *
 *	Returns WT_THREAD_RET_VALUE.
 */
static WT_THREAD_RET
__log_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_SESSION_IMPL *session;
    int freq_per_sec, signalled;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    signalled = 0;

    /*
     * Set this to the number of times per second we want to force out the
     * log slot buffer.
     */
#define WT_FORCE_PER_SECOND 20
    freq_per_sec = WT_FORCE_PER_SECOND;

    /*
     * The log server thread does a variety of work.  It forces out any
     * buffered log writes.  It pre-allocates log files and it performs
     * log archiving.  The reason the wrlsn thread does not force out
     * the buffered writes is because we want to process and move the
     * write_lsn forward as quickly as possible.  The same reason applies
     * to why the log file server thread does not force out the writes.
     * That thread does fsync calls which can take a long time and we
     * don't want log records sitting in the buffer over the time it
     * takes to sync out an earlier file.
     */
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * Slots depend on future activity.  Force out buffered
         * writes in case we are idle.  This cannot be part of the
         * wrlsn thread because of interaction advancing the write_lsn
         * and a buffer may need to wait for the write_lsn to advance
         * in the case of a synchronous buffer.  We end up with a hang.
         */
        WT_ERR_BUSY_OK(__wt_log_force_write(session, 0));

        /*
         * We don't want to archive or pre-allocate files as often as
         * we want to force out log buffers.  Only do it once per second
         * or if the condition was signalled.
         */
        if (--freq_per_sec <= 0 || signalled != 0) {
            freq_per_sec = WT_FORCE_PER_SECOND;

            /*
             * Perform log pre-allocation.
             */
            if (conn->log_prealloc > 0)
                WT_ERR(__log_prealloc_once(session));

            /*
             * Perform the archive.  The archive lock is only
             * tried, never waited for; the unlock result is
             * folded into ret before the error check so the lock
             * is always released.
             */
            if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) {
                if (__wt_try_writelock(
                    session, log->log_archive_lock) == 0) {
                    ret = __log_archive_once(session, 0);
                    WT_TRET(__wt_writeunlock(
                        session, log->log_archive_lock));
                    WT_ERR(ret);
                } else
                    WT_ERR(
                        __wt_verbose(session, WT_VERB_LOG,
                        "log_archive: Blocked due to open "
                        "log cursor holding archive lock"));
            }
        }

        /* Wait until the next event. */
        WT_ERR(__wt_cond_wait_signal(session, conn->log_cond,
            WT_MILLION / WT_FORCE_PER_SECOND, &signalled));
    }

    /* Reached only through the err label. */
    if (0) {
err:
        __wt_err(session, ret, "log server error");
    }
    return (WT_THREAD_RET_VALUE);
}
/*
 * __wt_log_wrlsn --
 *	Process written log slots and attempt to coalesce them if the LSNs
 *	are contiguous.  The purpose of this function is to advance the
 *	write_lsn in LSN order after the buffer is written to the log file.
 *
 *	The whole pass runs with the log's log_writelsn_lock held; the lock is
 *	released on every exit path.
 *
 *	Returns 0 on success or the error from signalling a condition
 *	variable.
 */
int
__wt_log_wrlsn(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL];
    WT_LOGSLOT *coalescing, *slot;
    WT_LSN save_lsn;
    size_t written_i;
    uint32_t i, save_i;

    conn = S2C(session);
    log = conn->log;
    __wt_spin_lock(session, &log->log_writelsn_lock);

    /* Each restart rebuilds the list of written slots from scratch. */
restart:
    coalescing = NULL;
    WT_INIT_LSN(&save_lsn);
    written_i = 0;
    i = 0;

    /*
     * Walk the array once saving any slots that are in the
     * WT_LOG_SLOT_WRITTEN state.
     */
    while (i < WT_SLOT_POOL) {
        save_i = i;
        slot = &log->slot_pool[i++];
        /*
         * XXX - During debugging I saw slot 0 become orphaned.
         * I believe it is fixed, but check for now.
         * This assertion should catch that.
         */
        if (slot->slot_state == 0)
            WT_ASSERT(session,
                slot->slot_release_lsn.file >= log->write_lsn.file);
        if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
            continue;
        written[written_i].slot_index = save_i;
        written[written_i++].lsn = slot->slot_release_lsn;
    }

    /*
     * If we found any written slots process them.  We sort them
     * based on the release LSN, and then look for them in order.
     */
    if (written_i > 0) {
        WT_INSERTION_SORT(written, written_i,
            WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);

        /*
         * We know the written array is sorted by LSN.  Go
         * through them either advancing write_lsn or coalesce
         * contiguous ranges of written slots.
         */
        for (i = 0; i < written_i; i++) {
            slot = &log->slot_pool[written[i].slot_index];

            /*
             * The log server thread pushes out slots periodically.
             * Sometimes they are empty slots.  If we find an
             * empty slot, where empty means the start and end LSN
             * are the same, free it and continue.
             */
            if (__wt_log_cmp(&slot->slot_start_lsn,
                &slot->slot_release_lsn) == 0 &&
                __wt_log_cmp(&slot->slot_start_lsn,
                &slot->slot_end_lsn) == 0) {
                __wt_log_slot_free(session, slot);
                continue;
            }
            if (coalescing != NULL) {
                /*
                 * If the write_lsn changed, we may be able to
                 * process slots.  Try again.
                 */
                if (__wt_log_cmp(
                    &log->write_lsn, &save_lsn) != 0)
                    goto restart;

                /*
                 * Not contiguous with the slot being coalesced:
                 * this slot becomes the new coalescing target
                 * and is left in place (not freed).
                 */
                if (__wt_log_cmp(&coalescing->slot_end_lsn,
                    &written[i].lsn) != 0) {
                    coalescing = slot;
                    continue;
                }

                /*
                 * If we get here we have a slot to coalesce
                 * and free.
                 */
                coalescing->slot_last_offset =
                    slot->slot_last_offset;
                coalescing->slot_end_lsn = slot->slot_end_lsn;
                WT_STAT_FAST_CONN_INCR(
                    session, log_slot_coalesced);

                /*
                 * Copy the flag for later closing.
                 */
                if (F_ISSET(slot, WT_SLOT_CLOSEFH))
                    F_SET(coalescing, WT_SLOT_CLOSEFH);
            } else {
                /*
                 * If this written slot is not the next LSN,
                 * try to start coalescing with later slots.
                 * A synchronous write may update write_lsn
                 * so save the last one we saw to check when
                 * coalescing slots.
                 */
                save_lsn = log->write_lsn;
                if (__wt_log_cmp(
                    &log->write_lsn, &written[i].lsn) != 0) {
                    coalescing = slot;
                    continue;
                }

                /*
                 * If we get here we have a slot to process.
                 * Advance the LSN and process the slot.
                 */
                WT_ASSERT(session, __wt_log_cmp(&written[i].lsn,
                    &slot->slot_release_lsn) == 0);
                if (slot->slot_start_lsn.offset !=
                    slot->slot_last_offset)
                    slot->slot_start_lsn.offset =
                        slot->slot_last_offset;
                log->write_start_lsn = slot->slot_start_lsn;
                log->write_lsn = slot->slot_end_lsn;
                WT_ERR(__wt_cond_signal(
                    session, log->log_write_cond));
                WT_STAT_FAST_CONN_INCR(session, log_write_lsn);

                /*
                 * Signal the close thread if needed.
                 */
                if (F_ISSET(slot, WT_SLOT_CLOSEFH))
                    WT_ERR(__wt_cond_signal(
                        session, conn->log_file_cond));
            }

            /* Processed or coalesced slots go back to the pool. */
            __wt_log_slot_free(session, slot);
        }
    }

err:
    __wt_spin_unlock(session, &log->log_writelsn_lock);
    return (ret);
}
/*
 * __log_file_server --
 *	The log file server thread.  This worker thread manages
 *	log file operations such as closing and syncing.
 *
 *	arg: the thread's own WT_SESSION_IMPL.
 *
 *	Loops while WT_CONN_LOG_SERVER_RUN is set on the connection.  Each
 *	pass (1) syncs and closes a pending log file handle once the write LSN
 *	has reached its close LSN, advancing sync_lsn to the start of the next
 *	file, (2) services a background sync request when bg_sync_lsn is ahead
 *	of sync_lsn, and (3) waits on the log file condition variable.
 *
 *	The "locked" flag tracks whether log_sync_lock is held so that an
 *	error raised inside a locked region still releases it.
 *
 *	Returns WT_THREAD_RET_VALUE.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *close_fh;
    WT_LOG *log;
    WT_LSN close_end_lsn, min_lsn;
    WT_SESSION_IMPL *session;
    uint32_t filenum;
    int locked;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    locked = 0;
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * If there is a log file to close, make sure any outstanding
         * write operations have completed, then fsync and close it.
         */
        if ((close_fh = log->log_close_fh) != NULL) {
            WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
                &filenum));
            /*
             * We update the close file handle before updating the
             * close LSN when changing files.  It is possible we
             * could see mismatched settings.  If we do, yield
             * until it is set.  This should rarely happen.
             */
            while (log->log_close_lsn.file < filenum)
                __wt_yield();

            if (__wt_log_cmp(
                &log->write_lsn, &log->log_close_lsn) >= 0) {
                /*
                 * We've copied the file handle, clear out the
                 * one in the log structure to allow it to be
                 * set again.  Copy the LSN before clearing
                 * the file handle.
                 * Use a barrier to make sure the compiler does
                 * not reorder the following two statements.
                 */
                close_end_lsn = log->log_close_lsn;
                WT_FULL_BARRIER();
                log->log_close_fh = NULL;
                /*
                 * Set the close_end_lsn to the LSN immediately
                 * after ours.  That is, the beginning of the
                 * next log file.  We need to know the LSN
                 * file number of our own close in case earlier
                 * calls are still in progress and the next one
                 * to move the sync_lsn into the next file for
                 * later syncs.
                 */
                close_end_lsn.file++;
                close_end_lsn.offset = 0;

                /* Sync outside the lock; close and publish under it. */
                WT_ERR(__wt_fsync(session, close_fh));
                __wt_spin_lock(session, &log->log_sync_lock);
                locked = 1;
                WT_ERR(__wt_close(session, &close_fh));
                WT_ASSERT(session, __wt_log_cmp(
                    &close_end_lsn, &log->sync_lsn) >= 0);
                log->sync_lsn = close_end_lsn;
                WT_ERR(__wt_cond_signal(
                    session, log->log_sync_cond));
                locked = 0;
                __wt_spin_unlock(session, &log->log_sync_lock);
            }
        }
        /*
         * If a later thread asked for a background sync, do it now.
         */
        if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
            /*
             * Save the latest write LSN which is the minimum
             * we will have written to disk.
             */
            min_lsn = log->write_lsn;
            /*
             * We have to wait until the LSN we asked for is
             * written.  If it isn't signal the wrlsn thread
             * to get it written.
             */
            if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
                WT_ERR(__wt_fsync(session, log->log_fh));
                __wt_spin_lock(session, &log->log_sync_lock);
                locked = 1;
                /*
                 * The sync LSN could have advanced while we
                 * were writing to disk.
                 */
                if (__wt_log_cmp(
                    &log->sync_lsn, &min_lsn) <= 0) {
                    log->sync_lsn = min_lsn;
                    WT_ERR(__wt_cond_signal(
                        session, log->log_sync_cond));
                }
                locked = 0;
                __wt_spin_unlock(session, &log->log_sync_lock);
            } else {
                WT_ERR(__wt_cond_signal(
                    session, conn->log_wrlsn_cond));
                /*
                 * We do not want to wait potentially a second
                 * to process this.  Yield to give the wrlsn
                 * thread a chance to run and try again in
                 * this case.
                 */
                __wt_yield();
                continue;
            }
        }
        /* Wait until the next event. */
        WT_ERR(__wt_cond_wait(
            session, conn->log_file_cond, WT_MILLION));
    }

    /* Reached only through the err label. */
    if (0) {
err:
        __wt_err(session, ret, "log close server error");
    }

    /* Release the sync lock if an error left it held. */
    if (locked)
        __wt_spin_unlock(session, &log->log_sync_lock);
    return (WT_THREAD_RET_VALUE);
}
/*
 * __log_archive_once --
 *     Perform one iteration of log archiving. Must be called with the log
 *     archive lock held.
 *
 *     backup_file is non-zero when the caller is a backup cursor, in which
 *     case it is the last full log file copied by the backup. Log files
 *     numbered below the computed minimum are removed, and log->first_lsn is
 *     then set to the start of the minimum file. Returns 0 on success or the
 *     first error encountered.
 */
static int
__log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    uint32_t lognum, min_lognum;
    u_int i, locked, logcount;
    char **logfiles;

    conn = S2C(session);
    log = conn->log;
    /* The error path reads "locked", so it must start out cleared. */
    locked = 0;
    logcount = 0;
    logfiles = NULL;

    /*
     * If we're coming from a backup cursor we want the smaller of
     * the last full log file copied in backup or the checkpoint LSN.
     * Otherwise we want the minimum of the last log file written to
     * disk and the checkpoint LSN.
     */
    if (backup_file != 0)
        min_lognum = WT_MIN(log->ckpt_lsn.file, backup_file);
    else
        min_lognum = WT_MIN(log->ckpt_lsn.file, log->sync_lsn.file);
    WT_RET(__wt_verbose(session, WT_VERB_LOG,
        "log_archive: archive to log number %" PRIu32, min_lognum));

    /*
     * Main archive code. Get the list of all log files and
     * remove any earlier than the minimum log number.
     */
    WT_RET(__wt_dirlist(session, conn->log_path,
        WT_LOG_FILENAME, WT_DIRLIST_INCLUDE, &logfiles, &logcount));

    /*
     * We can only archive files if a hot backup is not in progress or
     * if we are the backup.
     *
     * From here on the file list is allocated, so every failure must go
     * through the error label to free it (including a failure to acquire
     * the lock itself).
     */
    WT_ERR(__wt_readlock(session, conn->hot_backup_lock));
    locked = 1;
    if (conn->hot_backup == 0 || backup_file != 0) {
        for (i = 0; i < logcount; i++) {
            WT_ERR(__wt_log_extract_lognum(
                session, logfiles[i], &lognum));
            if (lognum < min_lognum)
                WT_ERR(__wt_log_remove(
                    session, WT_LOG_FILENAME, lognum));
        }
    }
    WT_ERR(__wt_readunlock(session, conn->hot_backup_lock));
    locked = 0;
    __wt_log_files_free(session, logfiles, logcount);
    logfiles = NULL;
    logcount = 0;

    /*
     * Indicate what is our new earliest LSN. It is the start
     * of the log file containing the last checkpoint.
     */
    log->first_lsn.file = min_lognum;
    log->first_lsn.offset = 0;

    /* The error report is only reachable through the err label. */
    if (0) {
err:        __wt_err(session, ret, "log archive server error");
    }
    if (locked)
        WT_TRET(__wt_readunlock(session, conn->hot_backup_lock));
    if (logfiles != NULL)
        __wt_log_files_free(session, logfiles, logcount);
    return (ret);
}
/*
 * __wt_open --
 *     Open a file handle.
 *
 *     name is the file name, used both to build the path and as the key in
 *     the connection's handle hash. ok_create and exclusive select the
 *     creation disposition passed to CreateFileA. dio_type is the file type
 *     (it selects direct I/O, write-through and random-access flags; a
 *     directory type gets a handle structure with invalid OS handles).
 *
 *     On success *fhp is set to the handle, which is either an already-open
 *     handle with its reference count incremented or a newly created one
 *     linked into the connection's list. Returns 0 on success or an error
 *     code; on error every resource acquired here is released.
 */
int
__wt_open(WT_SESSION_IMPL *session,
    const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
{
    DWORD dwCreationDisposition;
    HANDLE filehandle, filehandle_secondary;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *fh, *tfh;
    uint64_t bucket, hash;
    int direct_io, f, matched, share_mode;
    char *path;

    conn = S2C(session);
    fh = NULL;
    path = NULL;
    filehandle = INVALID_HANDLE_VALUE;
    filehandle_secondary = INVALID_HANDLE_VALUE;
    direct_io = 0;
    hash = __wt_hash_city64(name, strlen(name));
    bucket = hash % WT_HASH_ARRAY_SIZE;

    WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name));

    /* Increment the reference count if we already have the file open. */
    matched = 0;
    __wt_spin_lock(session, &conn->fh_lock);
    SLIST_FOREACH(tfh, &conn->fhhash[bucket], l)
        if (strcmp(name, tfh->name) == 0) {
            ++tfh->ref;
            *fhp = tfh;
            matched = 1;
            break;
        }
    __wt_spin_unlock(session, &conn->fh_lock);
    if (matched)
        return (0);

    /* For directories, create empty file handles with invalid handles */
    if (dio_type == WT_FILE_TYPE_DIRECTORY) {
        goto setupfh;
    }

    WT_RET(__wt_filename(session, name, &path));

    share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
    /*
     * Security:
     * The application may spawn a new process, and we don't want another
     * process to have access to our file handles.
     *
     * TODO: Set tighter file permissions but set bInheritHandle to false
     * to prevent inheritance
     */

    f = FILE_ATTRIBUTE_NORMAL;

    dwCreationDisposition = 0;
    if (ok_create) {
        dwCreationDisposition = CREATE_NEW;
        if (exclusive)
            dwCreationDisposition = CREATE_ALWAYS;
    } else
        dwCreationDisposition = OPEN_EXISTING;

    if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
        f |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
        direct_io = 1;
    }

    if (dio_type == WT_FILE_TYPE_LOG &&
        FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
        f |= FILE_FLAG_WRITE_THROUGH;
    }

    /* Disable read-ahead on trees: it slows down random read workloads. */
    if (dio_type == WT_FILE_TYPE_DATA ||
        dio_type == WT_FILE_TYPE_CHECKPOINT)
        f |= FILE_FLAG_RANDOM_ACCESS;

    filehandle = CreateFileA(path,
        (GENERIC_READ | GENERIC_WRITE),
        share_mode,
        NULL,
        dwCreationDisposition,
        f,
        NULL);
    if (filehandle == INVALID_HANDLE_VALUE) {
        /*
         * A create of a file that already exists is retried as a plain
         * open of the existing file.
         */
        if (GetLastError() == ERROR_FILE_EXISTS && ok_create)
            filehandle = CreateFileA(path,
                (GENERIC_READ | GENERIC_WRITE),
                share_mode,
                NULL,
                OPEN_EXISTING,
                f,
                NULL);

        if (filehandle == INVALID_HANDLE_VALUE)
            WT_ERR_MSG(session, __wt_errno(),
                direct_io ?
                "%s: open failed with direct I/O configured, some "
                "filesystem types do not support direct I/O" : "%s",
                path);
    }

    /*
     * Open a second handle to file to support allocation/truncation
     * concurrently with reads on the file. Writes would also move the file
     * pointer.
     */
    filehandle_secondary = CreateFileA(path,
        (GENERIC_READ | GENERIC_WRITE),
        share_mode,
        NULL,
        OPEN_EXISTING,
        f,
        NULL);
    /*
     * Check the handle that was just opened: testing the primary handle
     * here would let a failed secondary open go unnoticed and store an
     * invalid handle in the file structure.
     */
    if (filehandle_secondary == INVALID_HANDLE_VALUE)
        WT_ERR_MSG(session, __wt_errno(),
            "open failed for secondary handle: %s", path);

setupfh:
    WT_ERR(__wt_calloc_one(session, &fh));
    WT_ERR(__wt_strdup(session, name, &fh->name));
    fh->name_hash = hash;
    fh->filehandle = filehandle;
    fh->filehandle_secondary = filehandle_secondary;
    fh->ref = 1;
    fh->direct_io = direct_io;

    /* Set the file's size. */
    if (dio_type != WT_FILE_TYPE_DIRECTORY)
        WT_ERR(__wt_filesize(session, fh, &fh->size));

    /* Configure file extension. */
    if (dio_type == WT_FILE_TYPE_DATA ||
        dio_type == WT_FILE_TYPE_CHECKPOINT)
        fh->extend_len = conn->data_extend_len;

    /* Configure fallocate/posix_fallocate calls. */
    __wt_fallocate_config(session, fh);

    /*
     * Repeat the check for a match, but then link onto the database's list
     * of files.
     */
    matched = 0;
    __wt_spin_lock(session, &conn->fh_lock);
    SLIST_FOREACH(tfh, &conn->fhhash[bucket], l)
        if (strcmp(name, tfh->name) == 0) {
            ++tfh->ref;
            *fhp = tfh;
            matched = 1;
            break;
        }
    if (!matched) {
        WT_CONN_FILE_INSERT(conn, fh, bucket);
        WT_STAT_FAST_CONN_INCR(session, file_open);
        *fhp = fh;
    }
    __wt_spin_unlock(session, &conn->fh_lock);

    /*
     * If the handle showed up in the list while we were opening it, discard
     * ours; the same cleanup serves as the error path.
     */
    if (matched) {
err:        if (fh != NULL) {
            __wt_free(session, fh->name);
            __wt_free(session, fh);
        }
        if (filehandle != INVALID_HANDLE_VALUE)
            (void)CloseHandle(filehandle);
        if (filehandle_secondary != INVALID_HANDLE_VALUE)
            (void)CloseHandle(filehandle_secondary);
    }

    __wt_free(session, path);
    return (ret);
}
/*
 * __wt_lsm_free_chunks --
 *     Try to drop chunks from the tree that are no longer required.
 *
 *     Returns 0 immediately when there are no old chunks or another thread
 *     is already freeing them. Otherwise returns WT_NOTFOUND when nothing
 *     was dropped (no metadata change), 0 when at least one file was
 *     dropped, or an error code. The freeing_old_chunks flag claimed here is
 *     always released before returning.
 */
int
__wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_WORKER_COOKIE cookie;
    u_int i, skipped;
    int drop_ret;
    bool flush_metadata;

    flush_metadata = false;

    if (lsm_tree->nold_chunks == 0)
        return (0);

    /*
     * Make sure only a single thread is freeing the old chunk array
     * at any time.
     */
    if (!__wt_atomic_cas32(&lsm_tree->freeing_old_chunks, 0, 1))
        return (0);

    /*
     * Take a copy of the current state of the LSM tree and look for chunks
     * to drop.  We do it this way to avoid holding the LSM tree lock while
     * doing I/O or waiting on the schema lock.
     *
     * This is safe because only one thread will be in this function at a
     * time.  Merges may complete concurrently, and the old_chunks array
     * may be extended, but we shuffle down the pointers each time we free
     * one to keep the non-NULL slots at the beginning of the array.
     *
     * A failure here must go through the error label: returning directly
     * would leave freeing_old_chunks set and block every later caller at
     * the compare-and-swap above.
     */
    WT_CLEAR(cookie);
    WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, true));
    for (i = skipped = 0; i < cookie.nchunks; i++) {
        chunk = cookie.chunk_array[i];
        WT_ASSERT(session, chunk != NULL);
        /* Skip the chunk if another worker is using it. */
        if (chunk->refcnt > 1) {
            ++skipped;
            continue;
        }

        /*
         * Don't remove files if a hot backup is in progress.
         *
         * The schema lock protects the set of live files, this check
         * prevents us from removing a file that hot backup already
         * knows about.
         */
        if (S2C(session)->hot_backup)
            break;

        /*
         * Drop any bloom filters and chunks we can. Don't try to drop
         * a chunk if the bloom filter drop fails.
         *  An EBUSY return indicates that a cursor is still open in
         *       the tree - move to the next chunk in that case.
         * An ENOENT return indicates that the LSM tree metadata was
         *       out of sync with the on disk state. Update the
         *       metadata to match in that case.
         */
        if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
            drop_ret = __lsm_drop_file(session, chunk->bloom_uri);
            if (drop_ret == EBUSY) {
                ++skipped;
                continue;
            } else if (drop_ret != ENOENT)
                WT_ERR(drop_ret);

            flush_metadata = true;
            F_CLR(chunk, WT_LSM_CHUNK_BLOOM);
        }
        if (chunk->uri != NULL) {
            drop_ret = __lsm_drop_file(session, chunk->uri);
            if (drop_ret == EBUSY) {
                ++skipped;
                continue;
            } else if (drop_ret != ENOENT)
                WT_ERR(drop_ret);
            flush_metadata = true;
        }

        /* Lock the tree to clear out the old chunk information. */
        WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));

        /*
         * The chunk we are looking at should be the first one in the
         * tree that we haven't already skipped over.
         */
        WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk);
        __wt_free(session, chunk->bloom_uri);
        __wt_free(session, chunk->uri);
        __wt_free(session, lsm_tree->old_chunks[skipped]);

        /* Shuffle down to keep all occupied slots at the beginning. */
        if (--lsm_tree->nold_chunks > skipped) {
            memmove(lsm_tree->old_chunks + skipped,
                lsm_tree->old_chunks + skipped + 1,
                (lsm_tree->nold_chunks - skipped) *
                sizeof(WT_LSM_CHUNK *));
            lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL;
        }

        WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));

        /*
         * Clear the chunk in the cookie so we don't attempt to
         * decrement the reference count.
         */
        cookie.chunk_array[i] = NULL;
    }

err:
    /* Flush the metadata unless the system is in panic */
    if (flush_metadata && ret != WT_PANIC) {
        WT_TRET(__wt_lsm_tree_writelock(session, lsm_tree));
        WT_TRET(__wt_lsm_meta_write(session, lsm_tree));
        WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
    }
    __lsm_unpin_chunks(session, &cookie);
    __wt_free(session, cookie.chunk_array);
    /* Release the single-thread claim taken by the compare-and-swap. */
    lsm_tree->freeing_old_chunks = 0;

    /* Returning non-zero means there is no work to do. */
    if (!flush_metadata)
        WT_TRET(WT_NOTFOUND);

    return (ret);
}
/*
 * __wt_txn_recover --
 *     Run recovery.
 *
 *     Opens an internal session (replacing the caller's session for the
 *     rest of the function), sets WT_CONN_RECOVERING for the duration, and
 *     runs up to two log scans: a metadata-only pass (skipped when the
 *     connection was a backup) and a full pass with WT_LOGSCAN_RECOVER. On
 *     success a forced checkpoint is taken. Returns 0 on success, or an
 *     error code (WT_RUN_RECOVERY when recovery is needed but not allowed).
 */
int
__wt_txn_recover(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_CURSOR *metac;
    WT_DECL_RET;
    WT_RECOVERY r;
    struct WT_RECOVERY_FILE *metafile;
    char *config;
    bool eviction_started, needs_rec, was_backup;

    conn = S2C(session);
    WT_CLEAR(r);
    WT_INIT_LSN(&r.ckpt_lsn);
    /*
     * The error path frees config unconditionally, so it must hold a
     * defined value even if the metadata search fails before storing one.
     */
    config = NULL;
    eviction_started = false;
    was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP);

    /* We need a real session for recovery. */
    WT_RET(__wt_open_internal_session(conn, "txn-recover",
        false, WT_SESSION_NO_LOGGING, &session));
    r.session = session;

    F_SET(conn, WT_CONN_RECOVERING);
    WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
    WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
    WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac));
    metafile = &r.files[WT_METAFILE_ID];
    metafile->c = metac;

    /*
     * If no log was found (including if logging is disabled), or if the
     * last checkpoint was done with logging disabled, recovery should not
     * run.  Scan the metadata to figure out the largest file ID.
     */
    if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) ||
        WT_IS_MAX_LSN(&metafile->ckpt_lsn)) {
        WT_ERR(__recovery_file_scan(&r));
        conn->next_file_id = r.max_fileid;
        goto done;
    }

    /*
     * First, do a pass through the log to recover the metadata, and
     * establish the last checkpoint LSN.  Skip this when opening a hot
     * backup: we already have the correct metadata in that case.
     */
    if (!was_backup) {
        r.metadata_only = true;
        /*
         * If this is a read-only connection, check if the checkpoint
         * LSN in the metadata file is up to date, indicating a clean
         * shutdown.
         */
        if (F_ISSET(conn, WT_CONN_READONLY)) {
            WT_ERR(__wt_log_needs_recovery(
                session, &metafile->ckpt_lsn, &needs_rec));
            if (needs_rec)
                WT_ERR_MSG(session, WT_RUN_RECOVERY,
                    "Read-only database needs recovery");
        }
        if (WT_IS_INIT_LSN(&metafile->ckpt_lsn))
            WT_ERR(__wt_log_scan(session,
                NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r));
        else {
            /*
             * Start at the last checkpoint LSN referenced in the
             * metadata.  If we see the end of a checkpoint while
             * scanning, we will change the full scan to start from
             * there.
             */
            r.ckpt_lsn = metafile->ckpt_lsn;
            ret = __wt_log_scan(session,
                &metafile->ckpt_lsn, 0, __txn_log_recover, &r);
            /* A missing log at that LSN is not treated as an error. */
            if (ret == ENOENT)
                ret = 0;
            WT_ERR(ret);
        }
    }

    /* Scan the metadata to find the live files and their IDs. */
    WT_ERR(__recovery_file_scan(&r));

    /*
     * We no longer need the metadata cursor: close it to avoid pinning any
     * resources that could block eviction during recovery.
     */
    r.files[0].c = NULL;
    WT_ERR(metac->close(metac));

    /*
     * Now, recover all the files apart from the metadata.
     * Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
     */
    r.metadata_only = false;
    WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY,
        "Main recovery loop: starting at %" PRIu32 "/%" PRIu32,
        r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset));
    WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec));
    /*
     * Check if the database was shut down cleanly.  If not
     * return an error if the user does not want automatic
     * recovery.
     */
    if (needs_rec &&
        (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) ||
        F_ISSET(conn, WT_CONN_READONLY))) {
        if (F_ISSET(conn, WT_CONN_READONLY))
            WT_ERR_MSG(session, WT_RUN_RECOVERY,
                "Read-only database needs recovery");
        WT_ERR(WT_RUN_RECOVERY);
    }

    if (F_ISSET(conn, WT_CONN_READONLY))
        goto done;

    /*
     * Recovery can touch more data than fits in cache, so it relies on
     * regular eviction to manage paging.  Start eviction threads for
     * recovery without LAS cursors.
     */
    WT_ERR(__wt_evict_create(session));
    eviction_started = true;

    /*
     * Always run recovery even if it was a clean shutdown only if
     * this is not a read-only connection.
     * We can consider skipping it in the future.
     */
    if (WT_IS_INIT_LSN(&r.ckpt_lsn))
        WT_ERR(__wt_log_scan(session, NULL,
            WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
            __txn_log_recover, &r));
    else {
        ret = __wt_log_scan(session, &r.ckpt_lsn,
            WT_LOGSCAN_RECOVER, __txn_log_recover, &r);
        if (ret == ENOENT)
            ret = 0;
        WT_ERR(ret);
    }

    conn->next_file_id = r.max_fileid;

    /*
     * If recovery ran successfully forcibly log a checkpoint so the next
     * open is fast and keep the metadata up to date with the checkpoint
     * LSN and archiving.
     */
    WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));

done:
    FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE);
err:
    WT_TRET(__recovery_free(&r));
    __wt_free(session, config);

    if (ret != 0)
        __wt_err(session, ret, "Recovery failed");

    /*
     * Destroy the eviction threads that were started in support of
     * recovery.  They will be restarted once the lookaside table is
     * created.
     */
    if (eviction_started)
        WT_TRET(__wt_evict_destroy(session));

    /* Close the internal session opened at the top of the function. */
    WT_TRET(session->iface.close(&session->iface, NULL));
    F_CLR(conn, WT_CONN_RECOVERING);

    return (ret);
}
/*
 * __backup_start --
 *     Start a backup.
 *
 *     Resets the cursor's list, marks the connection as having a hot backup
 *     in progress, creates the backup file and builds the list of files to
 *     copy from cfg. Returns EINVAL if a backup cursor is already open. On
 *     any later error the handles are cleaned up and the backup is stopped
 *     before the error is returned.
 */
static int
__backup_start(
    WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[])
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    int exist, log_only, target_list;

    conn = S2C(session);

    cb->next = 0;
    cb->list = NULL;

    /*
     * Single thread hot backups: we're holding the schema lock, so we
     * know we'll serialize with other attempts to start a hot backup.
     */
    if (conn->hot_backup)
        WT_RET_MSG(
            session, EINVAL, "there is already a backup cursor open");

    /*
     * The hot backup copy is done outside of WiredTiger, which means file
     * blocks can't be freed and re-allocated until the backup completes.
     * The checkpoint code checks the backup flag, and if a backup cursor
     * is open checkpoints aren't discarded. We release the lock as soon
     * as we've set the flag, we don't want to block checkpoints, we just
     * want to make sure no checkpoints are deleted.  The checkpoint code
     * holds the lock until it's finished the checkpoint, otherwise we
     * could start a hot backup that would race with an already-started
     * checkpoint.
     */
    __wt_spin_lock(session, &conn->hot_backup_lock);
    conn->hot_backup = 1;
    __wt_spin_unlock(session, &conn->hot_backup_lock);

    /* Create the hot backup file. */
    WT_ERR(__backup_file_create(session, cb, 0));

    /*
     * If a list of targets was specified, work our way through them.
     * Else, generate a list of all database objects.
     *
     * Include log files if doing a full backup, and copy them before
     * copying data files to avoid rolling the metadata forward across
     * a checkpoint that completes during the backup.
     */
    target_list = 0;
    WT_ERR(__backup_uri(session, cb, cfg, &target_list, &log_only));

    if (!target_list) {
        WT_ERR(__backup_log_append(session, cb, 1));
        WT_ERR(__backup_all(session, cb));
    }

    /* Add the hot backup and standard WiredTiger files to the list. */
    if (log_only) {
        /*
         * Close any hot backup file.
         * We're about to open the incremental backup file.
         *
         * A failed close is a real error: bail out right away, because
         * the next checked call would otherwise overwrite the saved
         * return value and the failure would be lost.
         */
        if (cb->bfp != NULL) {
            ret = fclose(cb->bfp) == 0 ? 0 : __wt_errno();
            cb->bfp = NULL;
            WT_ERR(ret);
        }
        WT_ERR(__backup_file_create(session, cb, log_only));
        WT_ERR(__backup_list_append(
            session, cb, WT_INCREMENTAL_BACKUP));
    } else {
        WT_ERR(__backup_list_append(
            session, cb, WT_METADATA_BACKUP));
        /* The configuration files are optional: list them if present. */
        WT_ERR(__wt_exist(session, WT_BASECONFIG, &exist));
        if (exist)
            WT_ERR(__backup_list_append(
                session, cb, WT_BASECONFIG));
        WT_ERR(__wt_exist(session, WT_USERCONFIG, &exist));
        if (exist)
            WT_ERR(__backup_list_append(
                session, cb, WT_USERCONFIG));
        WT_ERR(__backup_list_append(session, cb, WT_WIREDTIGER));
    }

err:
    /* Close the hot backup file. */
    if (cb->bfp != NULL) {
        WT_TRET(fclose(cb->bfp) == 0 ? 0 : __wt_errno());
        cb->bfp = NULL;
    }
    /* On failure, undo the backup state set up above. */
    if (ret != 0) {
        WT_TRET(__backup_cleanup_handles(session, cb));
        WT_TRET(__backup_stop(session));
    }

    return (ret);
}