/*
 * __wt_clsm_open_bulk --
 *     WT_SESSION->open_cursor method for LSM bulk cursors: configure the
 *     cursor for insert/close only and open a bulk cursor on the tree's
 *     first chunk.
 */
int
__wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[])
{
    WT_CURSOR *cursor, *bulk_cursor;
    WT_DECL_RET;
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;

    bulk_cursor = NULL;
    cursor = &clsm->iface;
    lsm_tree = clsm->lsm_tree;
    session = (WT_SESSION_IMPL *)clsm->iface.session;

    /* Mark the LSM cursor as a bulk cursor. */
    F_SET(clsm, WT_CLSM_BULK);

    /* Bulk cursors are limited to insert and close. */
    __wt_cursor_set_notsup(cursor);
    cursor->insert = __clsm_insert_bulk;
    cursor->close = __clsm_close_bulk;

    /*
     * Setup the first chunk in the tree. This is the only time we switch
     * without using the LSM worker threads, it's safe to do here since
     * we have an exclusive lock on the LSM tree. We need to do this
     * switch inline, since switch needs a schema lock and online index
     * creation opens a bulk cursor while holding the schema lock.
     */
    WT_WITH_SCHEMA_LOCK(session, ret,
        ret = __wt_lsm_tree_switch(session, lsm_tree));
    WT_RET(ret);

    /*
     * Open a bulk cursor on the first chunk, it's not a regular LSM chunk
     * cursor, but use the standard storage locations. Allocate the space
     * for a bloom filter - it makes cleanup simpler. Cleaned up by
     * cursor close on error.
     */
    WT_RET(__wt_calloc_one(session, &clsm->blooms));
    clsm->bloom_alloc = 1;
    WT_RET(__wt_calloc_one(session, &clsm->cursors));
    clsm->cursor_alloc = 1;
    clsm->nchunks = 1;

    /*
     * Open a bulk cursor on the first chunk in the tree. Pass through the
     * application config to ensure the tree is open for bulk access.
     */
    WT_RET(__wt_open_cursor(session,
        lsm_tree->chunk[0]->uri, &clsm->iface, cfg, &bulk_cursor));
    clsm->cursors[0] = bulk_cursor;

    /* LSM cursors are always raw */
    F_SET(bulk_cursor, WT_CURSTD_RAW);

    return (0);
}
/*
 * __wt_clsm_open_bulk --
 *     WT_SESSION->open_cursor method for LSM bulk cursors.
 *
 * NOTE(review): this duplicates the symbol defined earlier in this file
 * (different chunk-switch strategy); confirm only one copy is compiled in.
 */
int
__wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[])
{
    WT_CURSOR *cursor, *bulk_cursor;
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;

    bulk_cursor = NULL;
    cursor = &clsm->iface;
    lsm_tree = clsm->lsm_tree;
    session = (WT_SESSION_IMPL *)clsm->iface.session;

    /* Mark the LSM cursor as a bulk cursor. */
    F_SET(clsm, WT_CLSM_BULK);

    /* Bulk cursors are limited to insert and close. */
    __wt_cursor_set_notsup(cursor);
    cursor->insert = __clsm_insert_bulk;
    cursor->close = __clsm_close_bulk;

    /* Setup the first chunk in the tree: ask for a switch and wait for it. */
    WT_RET(__wt_clsm_request_switch(clsm));
    WT_RET(__wt_clsm_await_switch(clsm));

    /*
     * Grab and release the LSM tree lock to ensure that the first chunk
     * has been fully created before proceeding. We have the LSM tree
     * open exclusive, so that saves us from needing the lock generally.
     */
    WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
    WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree));

    /*
     * Open a bulk cursor on the first chunk, it's not a regular LSM chunk
     * cursor, but use the standard storage locations. Allocate the space
     * for a bloom filter - it makes cleanup simpler. Cleaned up by
     * cursor close on error.
     */
    WT_RET(__wt_calloc_one(session, &clsm->blooms));
    clsm->bloom_alloc = 1;
    WT_RET(__wt_calloc_one(session, &clsm->cursors));
    clsm->cursor_alloc = 1;
    clsm->nchunks = 1;

    /*
     * Open a bulk cursor on the first chunk in the tree. Pass through the
     * application config to ensure the tree is open for bulk access.
     */
    WT_RET(__wt_open_cursor(session,
        lsm_tree->chunk[0]->uri, &clsm->iface, cfg, &bulk_cursor));
    clsm->cursors[0] = bulk_cursor;

    /* LSM cursors are always raw */
    F_SET(bulk_cursor, WT_CURSTD_RAW);

    return (0);
}
/*
 * __wt_cache_create --
 *     Create a connection's eviction cache.
 */
int
__wt_cache_create(WT_SESSION_IMPL* session, const char* cfg[])
{
    WT_CACHE *cache;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    conn = S2C(session);

    WT_RET(__wt_calloc_one(session, &conn->cache));
    cache = conn->cache;

    /*
     * Configure the cache.
     * NOTE(review): a failure here returns without freeing conn->cache;
     * WT_ERR would route through the cleanup path — confirm intent.
     */
    WT_RET(__wt_cache_config(session, 0, cfg));

    /* The target must be below the trigger or eviction never catches up. */
    if (cache->eviction_target >= cache->eviction_trigger)
        WT_ERR_MSG(session, EINVAL,
            "eviction target must be lower than the eviction trigger");

    /* Allocate the eviction server/waiter condition variables and locks. */
    WT_ERR(__wt_cond_alloc(session,
        "cache eviction server", 0, &cache->evict_cond));
    WT_ERR(__wt_cond_alloc(session,
        "eviction waiters", 0, &cache->evict_waiter_cond));
    WT_ERR(__wt_spin_init(session, &cache->evict_lock, "cache eviction"));
    WT_ERR(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk"));

    /* Allocate the LRU eviction queue. */
    cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
    WT_ERR(__wt_calloc_def(session, cache->evict_slots, &cache->evict));

    /* Initialize the cache statistics. */
    __wt_cache_stats_update(session);

    return 0;

err:
    WT_RET(__wt_cache_destroy(session));
    return ret;
}
/*
 * __wt_os_posix --
 *     Initialize a POSIX configuration: build the file-system jump table
 *     and install it in the connection.
 */
int
__wt_os_posix(WT_SESSION_IMPL *session)
{
    WT_FILE_SYSTEM *fs;

    /* Allocate a zeroed jump table. */
    WT_RET(__wt_calloc_one(session, &fs));

    /* Fill in the POSIX entry points. */
    fs->fs_directory_list = __wt_posix_directory_list;
    fs->fs_directory_list_free = __wt_posix_directory_list_free;
    fs->fs_exist = __posix_fs_exist;
    fs->fs_open_file = __posix_open_file;
    fs->fs_remove = __posix_fs_remove;
    fs->fs_rename = __posix_fs_rename;
    fs->fs_size = __posix_fs_size;
    fs->terminate = __posix_terminate;

    /* Switch the new file system into place. */
    S2C(session)->file_system = fs;
    return (0);
}
/*
 * __wt_cond_alloc --
 *     Allocate and initialize a condition variable.
 */
int
__wt_cond_alloc(WT_SESSION_IMPL *session,
    const char *name, int is_signalled, WT_CONDVAR **condp)
{
    WT_CONDVAR *cvar;
    WT_DECL_RET;

    /*
     * !!!
     * This function MUST handle a NULL session handle.
     */
    WT_RET(__wt_calloc_one(session, &cvar));

    WT_ERR(pthread_mutex_init(&cvar->mtx, NULL));

    /* Initialize the condition variable to permit self-blocking. */
    WT_ERR(pthread_cond_init(&cvar->cond, NULL));

    cvar->name = name;
    /* A negative waiters count marks the condition as already signalled. */
    cvar->waiters = is_signalled ? -1 : 0;

    *condp = cvar;
    return (0);

err:
    __wt_free(session, cvar);
    return (ret);
}
/*
 * __bloom_init --
 *     Allocate a WT_BLOOM handle: copy the URI and build the backing-table
 *     configuration string.
 */
static int
__bloom_init(WT_SESSION_IMPL *session,
    const char *uri, const char *config, WT_BLOOM **bloomp)
{
    WT_BLOOM *bloom;
    WT_DECL_RET;
    size_t len;

    *bloomp = NULL;

    WT_RET(__wt_calloc_one(session, &bloom));
    WT_ERR(__wt_strdup(session, uri, &bloom->uri));

    /* Room for "<config>,<WT_BLOOM_TABLE_CONFIG>": separator plus NUL. */
    len = strlen(WT_BLOOM_TABLE_CONFIG) + 2;
    if (config != NULL)
        len += strlen(config);
    WT_ERR(__wt_calloc_def(session, len, &bloom->config));
    /* Add the standard config at the end, so it overrides user settings. */
    (void)snprintf(bloom->config, len, "%s,%s",
        config == NULL ? "" : config, WT_BLOOM_TABLE_CONFIG);

    bloom->session = session;

    *bloomp = bloom;
    return (0);

    /* Unwind all partially-built fields; __wt_free ignores NULL. */
err:
    __wt_free(session, bloom->uri);
    __wt_free(session, bloom->config);
    __wt_free(session, bloom->bitstring);
    __wt_free(session, bloom);
    return (ret);
}
/*
 * __wt_dlopen --
 *     Open a dynamic library (Windows).
 *
 * A NULL path means "the current binary". On success *dlhp owns the
 * allocated WT_DLH; on error it is freed here.
 */
int
__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
{
    WT_DECL_RET;
    WT_DLH *dlh;

    WT_RET(__wt_calloc_one(session, &dlh));
    WT_ERR(__wt_strdup(session, path, &dlh->name));

    /* NULL means load from the current binary */
    if (path == NULL) {
        ret = GetModuleHandleExA(0, NULL, &dlh->handle);
        if (ret == FALSE)
            /*
             * Fixed: the previous message passed the integer 0 for a
             * %s conversion and formatted the (NULL) path with %s,
             * both undefined behavior. In this branch the path is
             * known to be NULL, so say so explicitly.
             */
            WT_ERR_MSG(session, __wt_errno(),
                "GetModuleHandleEx(NULL) failed");
    } else {
        // TODO: load dll here
        DebugBreak();
    }

    /* Windows returns 0 on failure, WT expects 0 on success */
    ret = !ret;

    *dlhp = dlh;

    if (0) {
err:
        __wt_free(session, dlh->name);
        __wt_free(session, dlh);
    }
    return (ret);
}
/*
 * wiredtiger_config_parser_open --
 *     Create a configuration parser over the caller's string.
 */
int
wiredtiger_config_parser_open(WT_SESSION *wt_session,
    const char *config, size_t len, WT_CONFIG_PARSER **config_parserp)
{
    static const WT_CONFIG_PARSER stds = {
        __config_parser_close,
        __config_parser_next,
        __config_parser_get
    };
    WT_CONFIG_ITEM config_item =
        { config, len, 0, WT_CONFIG_ITEM_STRING };
    WT_CONFIG_PARSER_IMPL *parser;
    WT_SESSION_IMPL *session;

    *config_parserp = NULL;

    session = (WT_SESSION_IMPL *)wt_session;

    WT_RET(__wt_calloc_one(session, &parser));
    parser->iface = stds;
    parser->session = session;

    /*
     * Keep a WT_CONFIG_ITEM for get calls and a WT_CONFIG structure for
     * iterations through the configuration string.
     */
    parser->config_item = config_item;
    __wt_config_initn(session, &parser->config, config, len);

    *config_parserp = (WT_CONFIG_PARSER *)parser;
    return (0);
}
/*
 * __wt_dlopen --
 *     Open a dynamic library (Windows).
 *
 * A NULL path means "the current binary"; the handle is then named
 * "local". On success *dlhp owns the allocated WT_DLH; on error it is
 * freed here.
 */
int
__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
{
    DWORD windows_error;
    WT_DECL_RET;
    WT_DLH *dlh;

    WT_RET(__wt_calloc_one(session, &dlh));
    /*
     * Fixed: the name was previously duplicated twice in a row, leaking
     * the first copy. Keep the single strdup that substitutes "local"
     * for a NULL path.
     */
    WT_ERR(__wt_strdup(session,
        path == NULL ? "local" : path, &dlh->name));

    /* NULL means load from the current binary */
    if (path == NULL) {
        if (GetModuleHandleExA(
            0, NULL, (HMODULE *)&dlh->handle) == FALSE) {
            windows_error = __wt_getlasterror();
            /*
             * Fixed: path is NULL in this branch, so print the
             * handle's name ("local") rather than the NULL path.
             */
            __wt_errx(session,
                "GetModuleHandleEx: %s: %s", dlh->name,
                __wt_formatmessage(session, windows_error));
            WT_ERR(__wt_map_windows_error(windows_error));
        }
    } else {
        // TODO: load dll here
        DebugBreak();
    }

    *dlhp = dlh;

    if (0) {
err:
        __wt_free(session, dlh->name);
        __wt_free(session, dlh);
    }
    return (ret);
}
/*
 * __wt_page_modify_alloc --
 *     Allocate a page's modification structure and try to install it on
 *     the page; if another thread wins the race, discard ours.
 */
int
__wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_CONNECTION_IMPL *conn;
    WT_PAGE_MODIFY *modify;

    conn = S2C(session);

    WT_RET(__wt_calloc_one(session, &modify));

    /*
     * Select a spinlock for the page; let the barrier immediately below
     * keep things from racing too badly.
     */
    modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS;

    /*
     * Multiple threads of control may be searching and deciding to modify
     * a page. If our modify structure is used (the CAS on page->modify
     * succeeds), update the page's memory footprint, else discard the
     * modify structure, another thread did the work.
     */
    if (__wt_atomic_cas_ptr(&page->modify, NULL, modify))
        __wt_cache_page_inmem_incr(session, page, sizeof(*modify));
    else
        __wt_free(session, modify);
    return (0);
}
/*
 * __wt_curbackup_open --
 *     WT_SESSION->open_cursor method for the backup cursor type: the
 *     cursor iterates the list of files a backup must copy.
 */
int
__wt_curbackup_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
    /* Only next/reset/close are supported; everything else is notsup. */
    WT_CURSOR_STATIC_INIT(iface,
        __wt_cursor_get_key,            /* get-key */
        __wt_cursor_get_value_notsup,   /* get-value */
        __wt_cursor_set_key_notsup,     /* set-key */
        __wt_cursor_set_value_notsup,   /* set-value */
        __wt_cursor_compare_notsup,     /* compare */
        __wt_cursor_equals_notsup,      /* equals */
        __curbackup_next,               /* next */
        __wt_cursor_notsup,             /* prev */
        __curbackup_reset,              /* reset */
        __wt_cursor_notsup,             /* search */
        __wt_cursor_search_near_notsup, /* search-near */
        __wt_cursor_notsup,             /* insert */
        __wt_cursor_modify_notsup,      /* modify */
        __wt_cursor_notsup,             /* update */
        __wt_cursor_notsup,             /* remove */
        __wt_cursor_notsup,             /* reserve */
        __wt_cursor_reconfigure_notsup, /* reconfigure */
        __curbackup_close);             /* close */
    WT_CURSOR *cursor;
    WT_CURSOR_BACKUP *cb;
    WT_DECL_RET;

    /* The iface must be the first field so cursor casts work. */
    WT_STATIC_ASSERT(offsetof(WT_CURSOR_BACKUP, iface) == 0);

    cb = NULL;

    WT_RET(__wt_calloc_one(session, &cb));
    cursor = &cb->iface;
    *cursor = iface;
    cursor->session = &session->iface;
    session->bkp_cursor = cb;

    cursor->key_format = "S";   /* Return the file names as the key. */
    cursor->value_format = "";  /* No value. */

    /*
     * Start the backup and fill in the cursor's list. Acquire the schema
     * lock, we need a consistent view when creating a copy.
     */
    WT_WITH_CHECKPOINT_LOCK(session,
        WT_WITH_SCHEMA_LOCK(session,
            ret = __backup_start(session, cb, cfg)));
    WT_ERR(ret);

    WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));

    if (0) {
err:
        WT_TRET(__curbackup_close(cursor));
        *cursorp = NULL;
    }

    return (ret);
}
/*
 * __wt_logmgr_create --
 *     Initialize the log subsystem (before running recovery): allocate
 *     the WT_LOG structure, its locks and condition variables, then open
 *     the log file and initialize log slots.
 */
int
__wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    bool run;

    conn = S2C(session);

    /* Handle configuration. */
    WT_RET(__logmgr_config(session, cfg, &run, false));

    /* If logging is not configured, we're done. */
    if (!run)
        return (0);

    FLD_SET(conn->log_flags, WT_CONN_LOG_ENABLED);

    /*
     * Logging is on, allocate the WT_LOG structure and open the log file.
     */
    WT_RET(__wt_calloc_one(session, &conn->log));
    log = conn->log;
    WT_RET(__wt_spin_init(session, &log->log_lock, "log"));
    WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot"));
    WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync"));
    WT_RET(__wt_spin_init(session,
        &log->log_writelsn_lock, "log write LSN"));
    WT_RET(__wt_rwlock_alloc(session,
        &log->log_archive_lock, "log archive lock"));

    /* Direct I/O needs allocation aligned to the buffer alignment. */
    if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG))
        log->allocsize =
            WT_MAX((uint32_t)conn->buffer_alignment, WT_LOG_ALIGN);
    else
        log->allocsize = WT_LOG_ALIGN;

    /* Start all LSNs at the initial value. */
    WT_INIT_LSN(&log->alloc_lsn);
    WT_INIT_LSN(&log->ckpt_lsn);
    WT_INIT_LSN(&log->first_lsn);
    WT_INIT_LSN(&log->sync_lsn);
    /*
     * We only use file numbers for directory sync, so this needs to
     * initialized to zero.
     */
    WT_ZERO_LSN(&log->sync_dir_lsn);
    WT_INIT_LSN(&log->trunc_lsn);
    WT_INIT_LSN(&log->write_lsn);
    WT_INIT_LSN(&log->write_start_lsn);
    log->fileid = 0;

    WT_RET(__wt_cond_alloc(
        session, "log sync", false, &log->log_sync_cond));
    WT_RET(__wt_cond_alloc(
        session, "log write", false, &log->log_write_cond));
    WT_RET(__wt_log_open(session));
    WT_RET(__wt_log_slot_init(session));

    return (0);
}
/*
 * __wt_cache_create --
 *     Create the underlying cache: configuration, condition variables,
 *     locks and the LRU eviction queue.
 */
int
__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CACHE *cache;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    conn = S2C(session);
    WT_ASSERT(session, conn->cache == NULL);

    WT_RET(__wt_calloc_one(session, &conn->cache));

    cache = conn->cache;

    /* Use a common routine for run-time configuration options. */
    WT_RET(__wt_cache_config(session, false, cfg));

    /*
     * The lowest possible page read-generation has a special meaning, it
     * marks a page for forcible eviction; don't let it happen by accident.
     */
    cache->read_gen = WT_READGEN_START_VALUE;

    /*
     * The target size must be lower than the trigger size or we will never
     * get any work done.
     */
    if (cache->eviction_target >= cache->eviction_trigger)
        WT_ERR_MSG(session, EINVAL,
            "eviction target must be lower than the eviction trigger");

    /* Eviction server wakes adaptively between 10ms and 1s. */
    WT_ERR(__wt_cond_auto_alloc(session, "cache eviction server",
        false, 10000, WT_MILLION, &cache->evict_cond));
    WT_ERR(__wt_cond_alloc(session,
        "eviction waiters", false, &cache->evict_waiter_cond));
    WT_ERR(__wt_spin_init(session, &cache->evict_lock, "cache eviction"));
    WT_ERR(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk"));

    /* Allocate the LRU eviction queue. */
    cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
    WT_ERR(__wt_calloc_def(session,
        cache->evict_slots, &cache->evict_queue));

    /*
     * We get/set some values in the cache statistics (rather than have
     * two copies), configure them.
     */
    __wt_cache_stats_update(session);
    return (0);

err:
    WT_RET(__wt_cache_destroy(session));
    return (ret);
}
/*
 * __curjoin_entry_iter_init --
 *     Initialize an iteration for the index managed by a join entry:
 *     duplicate the entry's cursor position onto a fresh cursor and wrap
 *     it in a WT_CURSOR_JOIN_ITER.
 */
static int
__curjoin_entry_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_CURSOR_JOIN_ENTRY *entry, WT_CURSOR_JOIN_ITER **iterp)
{
    WT_CURSOR *newcur;
    WT_CURSOR *to_dup;
    WT_DECL_RET;
    /* Open the new cursor raw if the join cursor itself is raw. */
    const char *raw_cfg[] = { WT_CONFIG_BASE(
        session, WT_SESSION_open_cursor), "raw", NULL };
    const char *def_cfg[] = { WT_CONFIG_BASE(
        session, WT_SESSION_open_cursor), NULL };
    const char *uri, **config;
    char *uribuf;
    WT_CURSOR_JOIN_ITER *iter;
    size_t size;

    iter = NULL;
    uribuf = NULL;
    to_dup = entry->ends[0].cursor;

    uri = to_dup->uri;
    if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
        config = &raw_cfg[0];
    else
        config = &def_cfg[0];

    /* Append the projection to the URI, if there is one. */
    if (cjoin->projection != NULL) {
        size = strlen(uri) + strlen(cjoin->projection) + 1;
        WT_ERR(__wt_calloc(session, size, 1, &uribuf));
        snprintf(uribuf, size, "%s%s", uri, cjoin->projection);
        uri = uribuf;
    }
    WT_ERR(__wt_open_cursor(session, uri, (WT_CURSOR *)cjoin, config,
        &newcur));
    WT_ERR(__wt_cursor_dup_position(to_dup, newcur));
    /*
     * NOTE(review): if either of the following calls fails, newcur is
     * never closed — looks like a cursor leak on the error path; confirm
     * whether session cleanup covers it.
     */
    WT_ERR(__wt_calloc_one(session, &iter));
    iter->cjoin = cjoin;
    iter->session = session;
    iter->entry = entry;
    iter->cursor = newcur;
    iter->advance = false;
    *iterp = iter;

    if (0) {
err:
        __wt_free(session, iter);
    }
    __wt_free(session, uribuf);
    return (ret);
}
/*
 * __wt_rwlock_alloc --
 *     Allocate and initialize a read/write lock.
 */
int
__wt_rwlock_alloc(
    WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name)
{
    WT_RWLOCK *lock;

    WT_RET(__wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name));

    /* The lock structure is zeroed; only the name needs filling in. */
    WT_RET(__wt_calloc_one(session, &lock));
    lock->name = name;

    *rwlockp = lock;
    return (0);
}
/*
 * __curjoin_iter_init --
 *     Initialize an iteration for the index managed by a join entry.
 */
static int
__curjoin_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_CURSOR_JOIN_ITER **iterp)
{
    WT_CURSOR_JOIN_ITER *it;

    *iterp = NULL;
    WT_RET(__wt_calloc_one(session, iterp));
    it = *iterp;

    /* Link the iterator and the join cursor to each other. */
    it->session = session;
    it->cjoin = cjoin;
    cjoin->iter = it;

    /* Position on the first join entry. */
    WT_RET(__curjoin_iter_set_entry(it, 0));
    return (0);
}
/*
 * __wt_cache_create --
 *     Create the underlying cache (older variant of the function defined
 *     above: int flags, no read-generation seeding).
 */
int
__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CACHE *cache;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    conn = S2C(session);
    WT_ASSERT(session, conn->cache == NULL);

    WT_RET(__wt_calloc_one(session, &conn->cache));

    cache = conn->cache;

    /* Use a common routine for run-time configuration options. */
    WT_RET(__wt_cache_config(session, 0, cfg));

    /*
     * The target size must be lower than the trigger size or we will never
     * get any work done.
     */
    if (cache->eviction_target >= cache->eviction_trigger)
        WT_ERR_MSG(session, EINVAL,
            "eviction target must be lower than the eviction trigger");

    /* Allocate the eviction server/waiter condition variables and locks. */
    WT_ERR(__wt_cond_alloc(session,
        "cache eviction server", 0, &cache->evict_cond));
    WT_ERR(__wt_cond_alloc(session,
        "eviction waiters", 0, &cache->evict_waiter_cond));
    WT_ERR(__wt_spin_init(session, &cache->evict_lock, "cache eviction"));
    WT_ERR(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk"));

    /* Allocate the LRU eviction queue. */
    cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
    WT_ERR(__wt_calloc_def(session, cache->evict_slots, &cache->evict));

    /*
     * We get/set some values in the cache statistics (rather than have
     * two copies), configure them.
     */
    __wt_cache_stats_update(session);
    return (0);

err:
    WT_RET(__wt_cache_destroy(session));
    return (ret);
}
/*
 * __wt_cond_alloc --
 *     Allocate and initialize a condition variable (Windows primitives).
 */
int
__wt_cond_alloc(WT_SESSION_IMPL *session,
    const char *name, WT_CONDVAR **condp)
{
    WT_CONDVAR *cvar;

    WT_RET(__wt_calloc_one(session, &cvar));

    InitializeCriticalSection(&cvar->mtx);

    /* Initialize the condition variable to permit self-blocking. */
    InitializeConditionVariable(&cvar->cond);

    cvar->name = name;
    cvar->waiters = 0;

    *condp = cvar;
    return (0);
}
/*
 * __wt_curconfig_open --
 *     WT_SESSION->open_cursor method for config cursors: both key and
 *     value are strings, and only get/set/reset/close are supported.
 */
int
__wt_curconfig_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
    WT_CURSOR_STATIC_INIT(iface,
        __wt_cursor_get_key,            /* get-key */
        __wt_cursor_get_value,          /* get-value */
        __wt_cursor_set_key,            /* set-key */
        __wt_cursor_set_value,          /* set-value */
        __wt_cursor_compare_notsup,     /* compare */
        __wt_cursor_equals_notsup,      /* equals */
        __wt_cursor_notsup,             /* next */
        __wt_cursor_notsup,             /* prev */
        __wt_cursor_noop,               /* reset */
        __wt_cursor_notsup,             /* search */
        __wt_cursor_search_near_notsup, /* search-near */
        __wt_cursor_notsup,             /* insert */
        __wt_cursor_notsup,             /* update */
        __wt_cursor_notsup,             /* remove */
        __wt_cursor_reconfigure_notsup, /* reconfigure */
        __curconfig_close);             /* close */
    WT_CURSOR_CONFIG *cconfig;
    WT_CURSOR *cursor;
    WT_DECL_RET;

    /* The iface must be the first field so cursor casts work. */
    WT_STATIC_ASSERT(offsetof(WT_CURSOR_CONFIG, iface) == 0);

    WT_UNUSED(uri);

    WT_RET(__wt_calloc_one(session, &cconfig));

    cursor = &cconfig->iface;
    *cursor = iface;
    cursor->session = &session->iface;
    cursor->key_format = cursor->value_format = "S";

    /* __wt_cursor_init is last so we don't have to clean up on error. */
    WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));

    if (0) {
err:
        __wt_free(session, cconfig);
    }
    return (ret);
}
/*
 * wiredtiger_pack_start --
 *     Open a stream for packing into the caller's buffer.
 */
int
wiredtiger_pack_start(WT_SESSION *wt_session,
    const char *format, void *buffer, size_t len, WT_PACK_STREAM **psp)
{
    WT_DECL_RET;
    WT_PACK_STREAM *stream;
    WT_SESSION_IMPL *session;

    session = (WT_SESSION_IMPL *)wt_session;

    WT_RET(__wt_calloc_one(session, &stream));
    WT_ERR(__pack_init(session, &stream->pack, format));

    /* The stream writes into [buffer, buffer + len). */
    stream->p = stream->start = buffer;
    stream->end = stream->p + len;
    *psp = stream;

    if (0) {
err:
        /* Cleanup is handled by the close routine. */
        (void)wiredtiger_pack_close(stream, NULL);
    }
    return (ret);
}
/*
 * __wt_dlopen --
 *     Open a dynamic library (POSIX).
 */
int
__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
{
    WT_DECL_RET;
    WT_DLH *dlh;

    WT_RET(__wt_calloc_one(session, &dlh));
    WT_ERR(__wt_strdup(session, path, &dlh->name));

    /* A NULL path loads symbols from the running executable. */
    dlh->handle = dlopen(path, RTLD_LAZY);
    if (dlh->handle == NULL)
        WT_ERR_MSG(
            session, __wt_errno(), "dlopen(%s): %s", path, dlerror());

    *dlhp = dlh;

    if (0) {
err:
        __wt_free(session, dlh->name);
        __wt_free(session, dlh);
    }
    return (ret);
}
/*
 * __wt_fopen --
 *     Open a stream handle on top of a file handle, wiring the stream
 *     operations for either write/append or read access.
 */
int
__wt_fopen(WT_SESSION_IMPL *session,
    const char *name, uint32_t open_flags, uint32_t flags, WT_FSTREAM **fstrp)
{
    WT_DECL_RET;
    WT_FH *fh;
    WT_FSTREAM *fstr;

    *fstrp = NULL;

    fstr = NULL;

    WT_RET(__wt_open(
        session, name, WT_OPEN_FILE_TYPE_REGULAR, open_flags, &fh));

    WT_ERR(__wt_calloc_one(session, &fstr));
    fstr->fh = fh;
    fstr->name = fh->name;
    fstr->flags = flags;
    fstr->close = __fstream_close;

    WT_ERR(__wt_filesize(session, fh, &fstr->size));
    /* Appending streams start positioned at the end of the file. */
    if (LF_ISSET(WT_STREAM_APPEND))
        fstr->off = fstr->size;
    /* Write-side streams flush/printf; read-side streams getline. */
    if (LF_ISSET(WT_STREAM_APPEND | WT_STREAM_WRITE)) {
        fstr->fstr_flush = __fstream_flush;
        fstr->fstr_getline = __fstream_getline_notsup;
        fstr->fstr_printf = __fstream_printf;
    } else {
        WT_ASSERT(session, LF_ISSET(WT_STREAM_READ));
        fstr->fstr_flush = __fstream_flush_notsup;
        fstr->fstr_getline = __fstream_getline;
        fstr->fstr_printf = __fstream_printf_notsup;
    }
    *fstrp = fstr;
    return (0);

err:
    /* On error, close the underlying handle and free the stream. */
    WT_TRET(__wt_close(session, &fh));
    __wt_free(session, fstr);
    return (ret);
}
/*
 * __wt_block_manager_open --
 *     Open a file through the block manager.
 */
int
__wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename,
    const char *cfg[], int forced_salvage, int readonly,
    uint32_t allocsize, WT_BM **bmp)
{
    WT_BM *mgr;
    WT_DECL_RET;

    *bmp = NULL;

    /* Allocate the manager and install its method table. */
    WT_RET(__wt_calloc_one(session, &mgr));
    __bm_method_set(mgr, 0);

    /* Open the underlying block file. */
    WT_ERR(__wt_block_open(session, filename, cfg,
        forced_salvage, readonly, allocsize, &mgr->block));

    *bmp = mgr;
    return (0);

err:
    WT_TRET(mgr->close(mgr, session));
    return (ret);
}
/*
 * __wt_cond_alloc --
 *     Allocate and initialize a condition variable (Windows primitives).
 */
int
__wt_cond_alloc(WT_SESSION_IMPL *session,
    const char *name, bool is_signalled, WT_CONDVAR **condp)
{
    WT_CONDVAR *cvar;

    /*
     * !!!
     * This function MUST handle a NULL session handle.
     */
    WT_RET(__wt_calloc_one(session, &cvar));

    InitializeCriticalSection(&cvar->mtx);

    /* Initialize the condition variable to permit self-blocking. */
    InitializeConditionVariable(&cvar->cond);

    cvar->name = name;
    /* A negative waiters count marks the condition as already signalled. */
    cvar->waiters = is_signalled ? -1 : 0;

    *condp = cvar;
    return (0);
}
/*
 * __wt_open --
 *     Open a file handle (Windows).
 *
 * Looks the name up in the connection's handle cache first; otherwise
 * opens primary and secondary Windows handles, builds a WT_FH and links
 * it into the cache, re-checking for a race with another opener.
 */
int
__wt_open(WT_SESSION_IMPL *session,
    const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
{
    DWORD dwCreationDisposition;
    HANDLE filehandle, filehandle_secondary;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *fh, *tfh;
    uint64_t bucket, hash;
    int direct_io, f, matched, share_mode;
    char *path;

    conn = S2C(session);
    fh = NULL;
    path = NULL;
    filehandle = INVALID_HANDLE_VALUE;
    filehandle_secondary = INVALID_HANDLE_VALUE;
    direct_io = 0;
    hash = __wt_hash_city64(name, strlen(name));
    bucket = hash % WT_HASH_ARRAY_SIZE;

    WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name));

    /* Increment the reference count if we already have the file open. */
    matched = 0;
    __wt_spin_lock(session, &conn->fh_lock);
    SLIST_FOREACH(tfh, &conn->fhhash[bucket], l)
        if (strcmp(name, tfh->name) == 0) {
            ++tfh->ref;
            *fhp = tfh;
            matched = 1;
            break;
        }
    __wt_spin_unlock(session, &conn->fh_lock);
    if (matched)
        return (0);

    /* For directories, create empty file handles with invalid handles */
    if (dio_type == WT_FILE_TYPE_DIRECTORY)
        goto setupfh;

    WT_RET(__wt_filename(session, name, &path));

    share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;

    /*
     * Security:
     * The application may spawn a new process, and we don't want another
     * process to have access to our file handles.
     *
     * TODO: Set tighter file permissions but set bInheritHandle to false
     * to prevent inheritance
     */
    f = FILE_ATTRIBUTE_NORMAL;

    dwCreationDisposition = 0;
    if (ok_create) {
        dwCreationDisposition = CREATE_NEW;
        if (exclusive)
            dwCreationDisposition = CREATE_ALWAYS;
    } else
        dwCreationDisposition = OPEN_EXISTING;

    if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
        f |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
        direct_io = 1;
    }

    if (dio_type == WT_FILE_TYPE_LOG &&
        FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
        f |= FILE_FLAG_WRITE_THROUGH;
    }

    /* Disable read-ahead on trees: it slows down random read workloads. */
    if (dio_type == WT_FILE_TYPE_DATA ||
        dio_type == WT_FILE_TYPE_CHECKPOINT)
        f |= FILE_FLAG_RANDOM_ACCESS;

    filehandle = CreateFileA(path, (GENERIC_READ | GENERIC_WRITE),
        share_mode, NULL, dwCreationDisposition, f, NULL);
    if (filehandle == INVALID_HANDLE_VALUE) {
        /* Lost a create race with another thread: open the winner's file. */
        if (GetLastError() == ERROR_FILE_EXISTS && ok_create)
            filehandle = CreateFileA(path,
                (GENERIC_READ | GENERIC_WRITE),
                share_mode, NULL, OPEN_EXISTING, f, NULL);
        if (filehandle == INVALID_HANDLE_VALUE)
            WT_ERR_MSG(session, __wt_errno(),
                direct_io ?
                "%s: open failed with direct I/O configured, some "
                "filesystem types do not support direct I/O" :
                "%s", path);
    }

    /*
     * Open a second handle to file to support allocation/truncation
     * concurrently with reads on the file. Writes would also move the file
     * pointer.
     */
    filehandle_secondary = CreateFileA(path,
        (GENERIC_READ | GENERIC_WRITE),
        share_mode, NULL, OPEN_EXISTING, f, NULL);
    /*
     * BUG FIX: this previously re-tested the primary handle, so a failed
     * secondary open was silently ignored and an invalid handle stored.
     */
    if (filehandle_secondary == INVALID_HANDLE_VALUE)
        WT_ERR_MSG(session, __wt_errno(),
            "open failed for secondary handle: %s", path);

setupfh:
    WT_ERR(__wt_calloc_one(session, &fh));
    WT_ERR(__wt_strdup(session, name, &fh->name));
    fh->name_hash = hash;
    fh->filehandle = filehandle;
    fh->filehandle_secondary = filehandle_secondary;
    fh->ref = 1;
    fh->direct_io = direct_io;

    /* Set the file's size. */
    if (dio_type != WT_FILE_TYPE_DIRECTORY)
        WT_ERR(__wt_filesize(session, fh, &fh->size));

    /* Configure file extension. */
    if (dio_type == WT_FILE_TYPE_DATA ||
        dio_type == WT_FILE_TYPE_CHECKPOINT)
        fh->extend_len = conn->data_extend_len;

    /* Configure fallocate/posix_fallocate calls. */
    __wt_fallocate_config(session, fh);

    /*
     * Repeat the check for a match, but then link onto the database's list
     * of files.
     */
    matched = 0;
    __wt_spin_lock(session, &conn->fh_lock);
    SLIST_FOREACH(tfh, &conn->fhhash[bucket], l)
        if (strcmp(name, tfh->name) == 0) {
            ++tfh->ref;
            *fhp = tfh;
            matched = 1;
            break;
        }
    if (!matched) {
        WT_CONN_FILE_INSERT(conn, fh, bucket);
        WT_STAT_FAST_CONN_INCR(session, file_open);

        *fhp = fh;
    }
    __wt_spin_unlock(session, &conn->fh_lock);

    /* Losing the insert race discards our handle, as does any error. */
    if (matched) {
err:
        if (fh != NULL) {
            __wt_free(session, fh->name);
            __wt_free(session, fh);
        }
        if (filehandle != INVALID_HANDLE_VALUE)
            (void)CloseHandle(filehandle);
        if (filehandle_secondary != INVALID_HANDLE_VALUE)
            (void)CloseHandle(filehandle_secondary);
    }

    __wt_free(session, path);
    return (ret);
}
/* * __schema_open_table -- * Open a named table (internal version). */ static int __schema_open_table(WT_SESSION_IMPL *session, const char *name, size_t namelen, bool ok_incomplete, WT_TABLE **tablep) { WT_CONFIG cparser; WT_CONFIG_ITEM ckey, cval; WT_CURSOR *cursor; WT_DECL_ITEM(buf); WT_DECL_RET; WT_TABLE *table; const char *tconfig; char *tablename; cursor = NULL; table = NULL; tablename = NULL; WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE)); WT_ERR(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_buf_fmt(session, buf, "table:%.*s", (int)namelen, name)); WT_ERR(__wt_strndup(session, buf->data, buf->size, &tablename)); WT_ERR(__wt_metadata_cursor(session, &cursor)); cursor->set_key(cursor, tablename); WT_ERR(cursor->search(cursor)); WT_ERR(cursor->get_value(cursor, &tconfig)); WT_ERR(__wt_calloc_one(session, &table)); table->name = tablename; tablename = NULL; table->name_hash = __wt_hash_city64(name, namelen); WT_ERR(__wt_config_getones(session, tconfig, "columns", &cval)); WT_ERR(__wt_config_getones(session, tconfig, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->key_format)); WT_ERR(__wt_config_getones(session, tconfig, "value_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->value_format)); WT_ERR(__wt_strdup(session, tconfig, &table->config)); /* Point to some items in the copy to save re-parsing. */ WT_ERR(__wt_config_getones(session, table->config, "columns", &table->colconf)); /* * Count the number of columns: tables are "simple" if the columns * are not named. */ WT_ERR(__wt_config_subinit(session, &cparser, &table->colconf)); table->is_simple = true; while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0) table->is_simple = false; if (ret != WT_NOTFOUND) goto err; /* Check that the columns match the key and value formats. 
*/ if (!table->is_simple) WT_ERR(__wt_schema_colcheck(session, table->key_format, table->value_format, &table->colconf, &table->nkey_columns, NULL)); WT_ERR(__wt_config_getones(session, table->config, "colgroups", &table->cgconf)); /* Count the number of column groups. */ WT_ERR(__wt_config_subinit(session, &cparser, &table->cgconf)); table->ncolgroups = 0; while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0) ++table->ncolgroups; if (ret != WT_NOTFOUND) goto err; if (table->ncolgroups > 0 && table->is_simple) WT_ERR_MSG(session, EINVAL, "%s requires a table with named columns", tablename); WT_ERR(__wt_calloc_def(session, WT_COLGROUPS(table), &table->cgroups)); WT_ERR(__wt_schema_open_colgroups(session, table)); if (!ok_incomplete && !table->cg_complete) WT_ERR_MSG(session, EINVAL, "'%s' cannot be used " "until all column groups are created", table->name); /* Copy the schema generation into the new table. */ table->schema_gen = S2C(session)->schema_gen; *tablep = table; if (0) { err: WT_TRET(__wt_schema_destroy_table(session, &table)); } WT_TRET(__wt_metadata_cursor_release(session, &cursor)); __wt_free(session, tablename); __wt_scr_free(session, &buf); return (ret); }
/*
 * __wt_schema_open_colgroups --
 *     Open the column groups for a table: read each colgroup's metadata,
 *     build its in-memory structure, and create the table's plan once all
 *     groups are present.
 */
int
__wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table)
{
    WT_COLGROUP *colgroup;
    WT_CONFIG cparser;
    WT_CONFIG_ITEM ckey, cval;
    WT_DECL_RET;
    WT_DECL_ITEM(buf);
    char *cgconfig;
    u_int i;

    WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE));

    /* Nothing to do if all column groups were opened earlier. */
    if (table->cg_complete)
        return (0);

    colgroup = NULL;
    cgconfig = NULL;

    WT_RET(__wt_scr_alloc(session, 0, &buf));

    WT_ERR(__wt_config_subinit(session, &cparser, &table->cgconf));

    /* Open each column group. */
    for (i = 0; i < WT_COLGROUPS(table); i++) {
        /* Unnamed (simple) tables have a single anonymous group. */
        if (table->ncolgroups > 0)
            WT_ERR(__wt_config_next(&cparser, &ckey, &cval));
        else
            WT_CLEAR(ckey);

        /*
         * Always open from scratch: we may have failed part of the way
         * through opening a table, or column groups may have changed.
         */
        __wt_schema_destroy_colgroup(session, &table->cgroups[i]);

        WT_ERR(__wt_buf_init(session, buf, 0));
        WT_ERR(__wt_schema_colgroup_name(session, table,
            ckey.str, ckey.len, buf));
        if ((ret = __wt_metadata_search(
            session, buf->data, &cgconfig)) != 0) {
            /* It is okay if the table is incomplete. */
            if (ret == WT_NOTFOUND)
                ret = 0;
            goto err;
        }

        WT_ERR(__wt_calloc_one(session, &colgroup));
        WT_ERR(__wt_strndup(
            session, buf->data, buf->size, &colgroup->name));
        /* The colgroup takes ownership of the metadata string. */
        colgroup->config = cgconfig;
        cgconfig = NULL;
        WT_ERR(__wt_config_getones(session,
            colgroup->config, "columns", &colgroup->colconf));
        WT_ERR(__wt_config_getones(
            session, colgroup->config, "source", &cval));
        WT_ERR(__wt_strndup(
            session, cval.str, cval.len, &colgroup->source));
        table->cgroups[i] = colgroup;
        colgroup = NULL;
    }

    /* Named columns need a projection plan mapping columns to groups. */
    if (!table->is_simple) {
        WT_ERR(__wt_table_check(session, table));

        WT_ERR(__wt_buf_init(session, buf, 0));
        WT_ERR(__wt_struct_plan(session,
            table, table->colconf.str, table->colconf.len, true, buf));
        WT_ERR(__wt_strndup(
            session, buf->data, buf->size, &table->plan));
    }

    table->cg_complete = true;

err:
    __wt_scr_free(session, &buf);
    __wt_schema_destroy_colgroup(session, &colgroup);
    if (cgconfig != NULL)
        __wt_free(session, cgconfig);
    return (ret);
}
/*
 * __schema_open_index --
 *	Open one or more indices for a table (internal version).
 *	If idxname is NULL, open every index for the table; otherwise open
 *	only the index whose name matches the first "len" bytes of idxname,
 *	optionally returning it through indexp.  Keeps the table's in-memory
 *	index list synchronized with the metadata while scanning.
 */
static int
__schema_open_index(WT_SESSION_IMPL *session, WT_TABLE *table,
    const char *idxname, size_t len, WT_INDEX **indexp)
{
	WT_CURSOR *cursor;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_INDEX *idx;
	u_int i;
	int cmp;
	bool match;
	const char *idxconf, *name, *tablename, *uri;

	/* Check if we've already done the work. */
	if (idxname == NULL && table->idx_complete)
		return (0);

	cursor = NULL;
	idx = NULL;
	match = false;

	/* Build a search key: all index metadata keys share this prefix. */
	tablename = table->name;
	(void)WT_PREFIX_SKIP(tablename, "table:");
	WT_ERR(__wt_scr_alloc(session, 512, &tmp));
	WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename));

	/*
	 * Find matching indices: position the metadata cursor at (or just
	 * before) the prefix, stepping forward if we landed before it.
	 */
	WT_ERR(__wt_metadata_cursor(session, &cursor));
	cursor->set_key(cursor, tmp->data);
	if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
		ret = cursor->next(cursor);
	for (i = 0; ret == 0; i++, ret = cursor->next(cursor)) {
		WT_ERR(cursor->get_key(cursor, &uri));
		name = uri;
		/* Stop once we leave this table's index namespace. */
		if (!WT_PREFIX_SKIP(name, tmp->data))
			break;

		/* Is this the index we are looking for? */
		match = idxname == NULL || WT_STRING_MATCH(name, idxname, len);

		/*
		 * Ensure there is space, including if we have to make room for
		 * a new entry in the middle of the list.
		 */
		WT_ERR(__wt_realloc_def(session, &table->idx_alloc,
		    WT_MAX(i, table->nindices) + 1, &table->indices));

		/*
		 * Keep the in-memory list in sync with the metadata: the
		 * metadata scan is in sorted order, so any in-memory entry
		 * sorting before the current URI no longer exists.
		 */
		cmp = 0;
		while (table->indices[i] != NULL &&
		    (cmp = strcmp(uri, table->indices[i]->name)) > 0) {
			/* Index no longer exists, remove it. */
			__wt_free(session, table->indices[i]);
			memmove(&table->indices[i], &table->indices[i + 1],
			    (table->nindices - i) * sizeof(WT_INDEX *));
			table->indices[--table->nindices] = NULL;
		}
		if (cmp < 0) {
			/* Make room for a new index. */
			memmove(&table->indices[i + 1], &table->indices[i],
			    (table->nindices - i) * sizeof(WT_INDEX *));
			table->indices[i] = NULL;
			++table->nindices;
		}

		if (!match)
			continue;

		/* Only build the in-memory handle if the slot is empty. */
		if (table->indices[i] == NULL) {
			WT_ERR(cursor->get_value(cursor, &idxconf));
			WT_ERR(__wt_calloc_one(session, &idx));
			WT_ERR(__wt_strdup(session, uri, &idx->name));
			WT_ERR(__wt_strdup(session, idxconf, &idx->config));
			WT_ERR(__open_index(session, table, idx));

			/*
			 * If we're checking the creation of an index before a
			 * table is fully created, don't save the index: it
			 * will need to be reopened once the table is complete.
			 */
			if (!table->cg_complete) {
				WT_ERR(
				    __wt_schema_destroy_index(session, &idx));
				if (idxname != NULL)
					break;
				continue;
			}

			table->indices[i] = idx;
			idx = NULL;

			/*
			 * If the slot is bigger than anything else we've seen,
			 * bump the number of indices.
			 */
			if (i >= table->nindices)
				table->nindices = i + 1;
		}

		/* If we were looking for a single index, we're done. */
		if (indexp != NULL)
			*indexp = table->indices[i];
		if (idxname != NULL)
			break;
	}
	/* Running off the end of the metadata is a normal termination. */
	WT_ERR_NOTFOUND_OK(ret);
	if (idxname != NULL && !match)
		ret = WT_NOTFOUND;

	/* If we did a full pass, we won't need to do it again. */
	if (idxname == NULL) {
		table->nindices = i;
		table->idx_complete = true;
	}

err:	WT_TRET(__wt_metadata_cursor_release(session, &cursor));
	WT_TRET(__wt_schema_destroy_index(session, &idx));
	__wt_scr_free(session, &tmp);
	return (ret);
}
/*
 * __wt_delete_page --
 *	If deleting a range, try to delete the page without instantiating it.
 *	On success sets *skipp to true and marks the ref WT_REF_DELETED; on
 *	failure (or when the optimization doesn't apply) leaves the page for
 *	the caller to process normally.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_DECL_RET;
	WT_PAGE *parent;

	*skipp = false;

	/* If we have a clean page in memory, attempt to evict it. */
	if (ref->state == WT_REF_MEM &&
	    __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
		/* A dirty page can't be fast-deleted; unlock and give up. */
		if (__wt_page_is_modified(ref->page)) {
			WT_PUBLISH(ref->state, WT_REF_MEM);
			return (0);
		}

		/*
		 * Bracket the eviction with the tree's busy count; EBUSY
		 * from eviction is not an error here.
		 */
		(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
		ret = __wt_evict_page(session, ref);
		(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
		WT_RET_BUSY_OK(ret);
	}

	/*
	 * Atomically switch the page's state to lock it.  If the page is not
	 * on-disk, other threads may be using it, no fast delete.
	 *
	 * Possible optimization: if the page is already deleted and the delete
	 * is visible to us (the delete has been committed), we could skip the
	 * page instead of instantiating it and figuring out there are no rows
	 * in the page.  While that's a huge amount of work to no purpose, it's
	 * unclear optimizing for overlapping range deletes is worth the effort.
	 */
	if (ref->state != WT_REF_DISK ||
	    !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * We cannot fast-delete pages that have overflow key/value items as
	 * the overflow blocks have to be discarded.  The way we figure that
	 * out is to check the on-page cell type for the page, cells for leaf
	 * pages that have no overflow items are special.
	 *
	 * In some cases, the reference address may not reference an on-page
	 * cell (for example, some combination of page splits), in which case
	 * we can't check the original cell value and we fail.
	 *
	 * To look at an on-page cell, we need to look at the parent page, and
	 * that's dangerous, our parent page could change without warning if
	 * the parent page were to split, deepening the tree.  It's safe: the
	 * page's reference will always point to some valid page, and if we
	 * find any problems we simply fail the fast-delete optimization.
	 *
	 * !!!
	 * I doubt it's worth the effort, but we could copy the cell's type
	 * into the reference structure, and then we wouldn't need an on-page
	 * cell.
	 */
	parent = ref->home;
	if (__wt_off_page(parent, ref->addr) ||
	    __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
		goto err;

	/*
	 * This action dirties the parent page: mark it dirty now, there's no
	 * future reconciliation of the child leaf page that will dirty it as
	 * we write the tree.
	 */
	WT_ERR(__wt_page_parent_modify_set(session, ref, false));

	/*
	 * Record the change in the transaction structure and set the change's
	 * transaction ID.
	 */
	WT_ERR(__wt_calloc_one(session, &ref->page_del));
	ref->page_del->txnid = session->txn.id;

	WT_ERR(__wt_txn_modify_ref(session, ref));

	*skipp = true;
	/* Publish the deleted state only after all the bookkeeping is done. */
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (0);

err:	__wt_free(session, ref->page_del);

	/*
	 * Restore the page to on-disk status, we'll have to instantiate it.
	 */
	WT_PUBLISH(ref->state, WT_REF_DISK);
	return (ret);
}
/*
 * __wt_delete_page_instantiate --
 *	Instantiate an entirely deleted row-store leaf page, making it look
 *	as if every entry was individually removed.  On error the caller is
 *	expected to discard the page.
 */
int
__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_DELETED *page_del;
	WT_UPDATE **upd_array, *upd;
	size_t size;
	uint32_t i;

	btree = S2BT(session);
	page = ref->page;
	page_del = ref->page_del;

	/*
	 * Give the page a modify structure.
	 *
	 * If the tree is already dirty and so will be written, mark the page
	 * dirty.  (We'd like to free the deleted pages, but if the handle is
	 * read-only or if the application never modifies the tree, we're not
	 * able to do so.)
	 */
	if (btree->modified) {
		WT_RET(__wt_page_modify_init(session, page));
		__wt_page_modify_set(session, page);
	}

	/*
	 * An operation is accessing a "deleted" page, and we're building an
	 * in-memory version of the page (making it look like all entries in
	 * the page were individually updated by a remove operation).  There
	 * are two cases where we end up here:
	 *
	 * First, a running transaction used a truncate call to delete the page
	 * without reading it, in which case the page reference includes a
	 * structure with a transaction ID; the page we're building might split
	 * in the future, so we update that structure to include references to
	 * all of the update structures we create, so the transaction can
	 * abort.
	 *
	 * Second, a truncate call deleted a page and the truncate committed,
	 * but an older transaction in the system forced us to keep the old
	 * version of the page around, then we crashed and recovered, and now
	 * we're being forced to read that page.
	 *
	 * In the first case, we have a page reference structure, in the second
	 * second, we don't.
	 *
	 * Allocate the per-reference update array; in the case of
	 * instantiating a page, deleted by a running transaction that might
	 * eventually abort, we need a list of the update structures so we can
	 * do that abort.  The hard case is if a page splits: the update
	 * structures might be moved to different pages, and we still have to
	 * find them all for an abort.
	 */
	/* NULL-terminated list: one slot per row plus a terminator. */
	if (page_del != NULL)
		WT_RET(__wt_calloc_def(
		    session, page->pg_row_entries + 1,
		    &page_del->update_list));

	/* Allocate the per-page update array. */
	WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &upd_array));
	page->pg_row_upd = upd_array;

	/*
	 * Fill in the per-reference update array with references to update
	 * structures, fill in the per-page update array with references to
	 * deleted items.
	 */
	for (i = 0, size = 0; i < page->pg_row_entries; ++i) {
		WT_ERR(__wt_calloc_one(session, &upd));
		WT_UPDATE_DELETED_SET(upd);

		if (page_del == NULL)
			upd->txnid = WT_TXN_NONE;	/* Globally visible */
		else {
			upd->txnid = page_del->txnid;
			page_del->update_list[i] = upd;
		}

		upd->next = upd_array[i];
		upd_array[i] = upd;

		/* Track the memory added so the cache accounting is right. */
		size += sizeof(WT_UPDATE *) + WT_UPDATE_MEMSIZE(upd);
	}

	__wt_cache_page_inmem_incr(session, page, size);

	return (0);

err:	/*
	 * There's no need to free the page update structures on error, our
	 * caller will discard the page and do that work for us.  We could
	 * similarly leave the per-reference update array alone because it
	 * won't ever be used by any page that's not in-memory, but cleaning
	 * it up makes sense, especially if we come back in to this function
	 * attempting to instantiate this page again.
	 */
	if (page_del != NULL)
		__wt_free(session, page_del->update_list);
	return (ret);
}