/*
 * __wt_connection_destroy --
 *     Destroy the connection's underlying WT_CONNECTION_IMPL structure.
 */
int
__wt_connection_destroy(WT_CONNECTION_IMPL *conn)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    u_int i;

    /* Check there's something to destroy. */
    if (conn == NULL)
        return (0);

    session = conn->default_session;

    /*
     * Close remaining open files (before discarding the mutex: the
     * underlying file-close code uses the mutex to guard lists of
     * open files).
     */
    WT_TRET(__wt_close(session, &conn->lock_fh));

    /* Remove from the list of connections. */
    __wt_spin_lock(session, &__wt_process.spinlock);
    TAILQ_REMOVE(&__wt_process.connqh, conn, q);
    __wt_spin_unlock(session, &__wt_process.spinlock);

    /* Configuration */
    __wt_conn_config_discard(session);        /* configuration */
    __wt_conn_foc_discard(session);           /* free-on-close */

    __wt_spin_destroy(session, &conn->api_lock);
    __wt_spin_destroy(session, &conn->block_lock);
    __wt_spin_destroy(session, &conn->checkpoint_lock);
    __wt_spin_destroy(session, &conn->dhandle_lock);
    __wt_spin_destroy(session, &conn->encryptor_lock);
    __wt_spin_destroy(session, &conn->fh_lock);
    WT_TRET(__wt_rwlock_destroy(session, &conn->hot_backup_lock));
    __wt_spin_destroy(session, &conn->las_lock);
    __wt_spin_destroy(session, &conn->metadata_lock);
    __wt_spin_destroy(session, &conn->reconfig_lock);
    __wt_spin_destroy(session, &conn->schema_lock);
    __wt_spin_destroy(session, &conn->table_lock);
    __wt_spin_destroy(session, &conn->turtle_lock);
    for (i = 0; i < WT_PAGE_LOCKS; ++i)
        __wt_spin_destroy(session, &conn->page_lock[i]);
    __wt_free(session, conn->page_lock);

    /* Free allocated memory. */
    __wt_free(session, conn->cfg);
    __wt_free(session, conn->home);
    __wt_free(session, conn->error_prefix);
    __wt_free(session, conn->sessions);

    __wt_free(NULL, conn);
    return (ret);
}
/*
 * __wt_curfile_open --
 *     WT_SESSION->open_cursor method for the btree cursor type.
 */
int
__wt_curfile_open(WT_SESSION_IMPL *session, const char *uri,
    WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
    WT_CONFIG_ITEM cval;
    WT_DECL_RET;
    uint32_t flags;
    bool bitmap, bulk;

    bitmap = bulk = false;
    flags = 0;

    WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
    if (cval.type == WT_CONFIG_ITEM_BOOL ||
        (cval.type == WT_CONFIG_ITEM_NUM &&
        (cval.val == 0 || cval.val == 1))) {
        bitmap = false;
        bulk = cval.val != 0;
    } else if (WT_STRING_MATCH("bitmap", cval.str, cval.len))
        bitmap = bulk = true;
    else
        WT_RET_MSG(session, EINVAL,
            "Value for 'bulk' must be a boolean or 'bitmap'");

    /* Bulk handles require exclusive access. */
    if (bulk)
        LF_SET(WT_BTREE_BULK | WT_DHANDLE_EXCLUSIVE);

    /* Get the handle and lock it while the cursor is using it. */
    if (WT_PREFIX_MATCH(uri, "file:")) {
        /*
         * If we are opening a bulk cursor, get the handle while
         * holding the checkpoint lock.  This prevents a bulk cursor
         * open failing with EBUSY due to a database-wide checkpoint.
         */
        if (bulk)
            __wt_spin_lock(
                session, &S2C(session)->checkpoint_lock);
        ret = __wt_session_get_btree_ckpt(session, uri, cfg, flags);
        if (bulk)
            __wt_spin_unlock(
                session, &S2C(session)->checkpoint_lock);
        WT_RET(ret);
    } else
        WT_RET(__wt_bad_object_type(session, uri));

    WT_ERR(__wt_curfile_create(session, owner, cfg, bulk, bitmap, cursorp));

    /* Increment the data-source's in-use counter. */
    __wt_cursor_dhandle_incr_use(session);
    return (0);

err:    /* If the cursor could not be opened, release the handle. */
    WT_TRET(__wt_session_release_btree(session));
    return (ret);
}
/*
 * __wt_conn_btree_sync_and_close --
 *     Sync and close the underlying btree handle.
 */
int
__wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, int force)
{
    WT_BTREE *btree;
    WT_DATA_HANDLE *dhandle;
    WT_DECL_RET;
    int no_schema_lock;

    dhandle = session->dhandle;
    btree = S2BT(session);

    if (!F_ISSET(dhandle, WT_DHANDLE_OPEN))
        return (0);

    /*
     * If we don't already have the schema lock, make it an error to try
     * to acquire it.  The problem is that we are holding an exclusive
     * lock on the handle, and if we attempt to acquire the schema lock
     * we might deadlock with a thread that has the schema lock and wants
     * a handle lock (specifically, checkpoint).
     */
    no_schema_lock = 0;
    if (!F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) {
        no_schema_lock = 1;
        F_SET(session, WT_SESSION_NO_SCHEMA_LOCK);
    }

    /*
     * We may not be holding the schema lock, and threads may be walking
     * the list of open handles (for example, checkpoint).  Acquire the
     * handle's close lock.
     */
    __wt_spin_lock(session, &dhandle->close_lock);

    /*
     * The close can fail if an update cannot be written, return the EBUSY
     * error to our caller for eventual retry.
     */
    if (!F_ISSET(btree,
        WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY))
        WT_ERR(__wt_checkpoint_close(session, force));

    if (dhandle->checkpoint == NULL)
        --S2C(session)->open_btree_count;

    WT_TRET(__wt_btree_close(session));
    F_CLR(dhandle, WT_DHANDLE_OPEN);
    F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);

err:    __wt_spin_unlock(session, &dhandle->close_lock);
    if (no_schema_lock)
        F_CLR(session, WT_SESSION_NO_SCHEMA_LOCK);

    return (ret);
}
/*
 * __wt_block_compact_skip --
 *     Return if compaction will shrink the file.
 */
int
__wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp)
{
    WT_DECL_RET;
    WT_EXT *ext;
    WT_EXTLIST *el;
    WT_FH *fh;
    off_t avail, ninety;

    *skipp = 1;                /* Return a default skip. */

    fh = block->fh;

    /*
     * We do compaction by copying blocks from the end of the file to the
     * beginning of the file, and we need some metrics to decide if it's
     * worth doing.  Ignore small files, and files where we are unlikely
     * to recover 10% of the file.
     */
    if (fh->size <= 10 * 1024)
        return (0);

    __wt_spin_lock(session, &block->live_lock);

    if (WT_VERBOSE_ISSET(session, compact))
        WT_ERR(__block_dump_avail(session, block));

    /* Sum the number of available bytes in the first 90% of the file. */
    avail = 0;
    ninety = fh->size - fh->size / 10;

    el = &block->live.avail;
    WT_EXT_FOREACH(ext, el->off)
        if (ext->off < ninety)
            avail += ext->size;

    /*
     * If at least 10% of the total file is available and in the first 90%
     * of the file, we'll try compaction.
     */
    if (avail >= fh->size / 10)
        *skipp = 0;

    WT_VERBOSE_ERR(session, compact,
        "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
        "90%% of the file, require 10%% or %" PRIuMAX "MB (%" PRIuMAX
        ") to perform compaction, compaction %s",
        block->name,
        (uintmax_t)avail / WT_MEGABYTE, (uintmax_t)avail,
        (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10,
        *skipp ? "skipped" : "proceeding");

err:    __wt_spin_unlock(session, &block->live_lock);
    return (ret);
}
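/*
 * Example (not WiredTiger source): the 90%/10% heuristic above is simple
 * enough to check in isolation.  The standalone sketch below applies the
 * same arithmetic to a caller-supplied list of free extents; the "extent"
 * type, the "compact_useful" name and the sample numbers are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* One free range in the file: offset and length in bytes. */
struct extent {
    int64_t off;
    int64_t size;
};

/*
 * compact_useful --
 *     Sum the free bytes falling in the first 90% of the file and
 *     require they amount to at least 10% of the total file size.
 */
static bool
compact_useful(int64_t file_size, const struct extent *avail_list, int n)
{
    int64_t avail, ninety;
    int i;

    if (file_size <= 10 * 1024)        /* Ignore small files. */
        return (false);

    ninety = file_size - file_size / 10;
    avail = 0;
    for (i = 0; i < n; ++i)
        if (avail_list[i].off < ninety)
            avail += avail_list[i].size;

    return (avail >= file_size / 10);
}

int
main(void)
{
    /* A 1MB file with 200KB free near the start: worth compacting. */
    struct extent avail[] = { { 4096, 200 * 1024 } };

    printf("compaction: %s\n",
        compact_useful(1024 * 1024, avail, 1) ? "proceed" : "skip");
    return (0);
}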
/*
 * __wt_connection_destroy --
 *     Destroy the connection's underlying WT_CONNECTION_IMPL structure.
 */
int
__wt_connection_destroy(WT_CONNECTION_IMPL *conn)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    u_int i;

    /* Check there's something to destroy. */
    if (conn == NULL)
        return (0);

    session = conn->default_session;

    /* Remove from the list of connections. */
    __wt_spin_lock(session, &__wt_process.spinlock);
    TAILQ_REMOVE(&__wt_process.connqh, conn, q);
    __wt_spin_unlock(session, &__wt_process.spinlock);

    /* Configuration */
    __wt_conn_config_discard(session);        /* configuration */
    __wt_conn_foc_discard(session);           /* free-on-close */

    __wt_spin_destroy(session, &conn->api_lock);
    __wt_spin_destroy(session, &conn->block_lock);
    __wt_spin_destroy(session, &conn->checkpoint_lock);
    __wt_spin_destroy(session, &conn->dhandle_lock);
    __wt_spin_destroy(session, &conn->encryptor_lock);
    __wt_spin_destroy(session, &conn->fh_lock);
    __wt_rwlock_destroy(session, &conn->hot_backup_lock);
    __wt_spin_destroy(session, &conn->las_lock);
    __wt_spin_destroy(session, &conn->metadata_lock);
    __wt_spin_destroy(session, &conn->reconfig_lock);
    __wt_spin_destroy(session, &conn->schema_lock);
    __wt_spin_destroy(session, &conn->table_lock);
    __wt_spin_destroy(session, &conn->turtle_lock);
    for (i = 0; i < WT_PAGE_LOCKS; ++i)
        __wt_spin_destroy(session, &conn->page_lock[i]);
    __wt_free(session, conn->page_lock);

    /* Destroy the file-system configuration. */
    if (conn->file_system != NULL && conn->file_system->terminate != NULL)
        WT_TRET(conn->file_system->terminate(
            conn->file_system, (WT_SESSION *)session));

    /* Free allocated memory. */
    __wt_free(session, conn->cfg);
    __wt_free(session, conn->home);
    __wt_free(session, conn->error_prefix);
    __wt_free(session, conn->sessions);
    __wt_stat_connection_discard(session, conn);

    __wt_free(NULL, conn);
    return (ret);
}
/*
 * __wt_log_slot_grow_buffers --
 *     Increase the buffer size of all available slots in the buffer pool.
 *     Go to some lengths to include active (but unused) slots to handle
 *     the case where all log write record sizes exceed the size of the
 *     active buffer.
 */
int
__wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_LOGSLOT *slot;
    int64_t orig_state;
    uint64_t old_size, total_growth;
    int i;

    conn = S2C(session);
    log = conn->log;
    total_growth = 0;
    WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);

    /*
     * Take the log slot lock to prevent other threads growing buffers
     * at the same time.  Could tighten the scope of this lock, or have
     * a separate lock if there is contention.
     */
    __wt_spin_lock(session, &log->log_slot_lock);
    for (i = 0; i < SLOT_POOL; i++) {
        slot = &log->slot_pool[i];
        /* Avoid atomic operations if they won't succeed. */
        if (slot->slot_state != WT_LOG_SLOT_FREE &&
            slot->slot_state != WT_LOG_SLOT_READY)
            continue;
        /* Don't keep growing unrelated buffers. */
        if (slot->slot_buf.memsize > (10 * newsize) &&
            !F_ISSET(slot, SLOT_BUF_GROW))
            continue;
        orig_state = WT_ATOMIC_CAS_VAL8(
            slot->slot_state, WT_LOG_SLOT_FREE, WT_LOG_SLOT_PENDING);
        if (orig_state != WT_LOG_SLOT_FREE) {
            orig_state = WT_ATOMIC_CAS_VAL8(slot->slot_state,
                WT_LOG_SLOT_READY, WT_LOG_SLOT_PENDING);
            if (orig_state != WT_LOG_SLOT_READY)
                continue;
        }

        /* We have a slot - now go ahead and grow the buffer. */
        old_size = slot->slot_buf.memsize;
        F_CLR(slot, SLOT_BUF_GROW);
        WT_ERR(__wt_buf_grow(session, &slot->slot_buf,
            WT_MAX(slot->slot_buf.memsize * 2, newsize)));
        slot->slot_state = orig_state;
        total_growth += slot->slot_buf.memsize - old_size;
    }

err:    __wt_spin_unlock(session, &log->log_slot_lock);
    WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, total_growth);
    return (ret);
}
/*
 * __conn_load_extension --
 *     WT_CONNECTION->load_extension method.
 */
static int
__conn_load_extension(
    WT_CONNECTION *wt_conn, const char *path, const char *config)
{
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_DLH *dlh;
    WT_SESSION_IMPL *session;
    int (*load)(WT_CONNECTION *, WT_CONFIG_ARG *);
    const char *init_name, *terminate_name;

    dlh = NULL;
    init_name = terminate_name = NULL;

    conn = (WT_CONNECTION_IMPL *)wt_conn;
    CONNECTION_API_CALL(conn, session, load_extension, config, cfg);

    WT_ERR(__wt_config_gets(session, cfg, "entry", &cval));
    WT_ERR(__wt_strndup(session, cval.str, cval.len, &init_name));

    /*
     * This assumes the underlying shared libraries are reference counted,
     * that is, that re-opening a shared library simply increments a ref
     * count, and closing it simply decrements the ref count, and the last
     * close discards the reference entirely -- in other words, we do not
     * check to see if we've already opened this shared library.
     *
     * Fill in the extension structure and call the load function.
     */
    WT_ERR(__wt_dlopen(session, path, &dlh));
    WT_ERR(__wt_dlsym(session, dlh, init_name, 1, &load));
    WT_ERR(load(wt_conn, (WT_CONFIG_ARG *)cfg));

    /* Remember the unload function for when we close. */
    WT_ERR(__wt_config_gets(session, cfg, "terminate", &cval));
    WT_ERR(__wt_strndup(session, cval.str, cval.len, &terminate_name));
    WT_ERR(__wt_dlsym(session, dlh, terminate_name, 0, &dlh->terminate));

    /* Link onto the environment's list of open libraries. */
    __wt_spin_lock(session, &conn->api_lock);
    TAILQ_INSERT_TAIL(&conn->dlhqh, dlh, q);
    __wt_spin_unlock(session, &conn->api_lock);
    dlh = NULL;

err:    if (dlh != NULL)
        WT_TRET(__wt_dlclose(session, dlh));
    __wt_free(session, init_name);
    __wt_free(session, terminate_name);

    API_END_NOTFOUND_MAP(session, ret);
}
/*
 * __wt_handle_search --
 *     Search for a matching handle.
 */
bool
__wt_handle_search(WT_SESSION_IMPL *session,
    const char *name, bool increment_ref, WT_FH *newfh, WT_FH **fhp)
{
    WT_CONNECTION_IMPL *conn;
    WT_FH *fh;
    uint64_t bucket, hash;
    bool found;

    if (fhp != NULL)
        *fhp = NULL;

    conn = S2C(session);
    found = false;

    hash = __wt_hash_city64(name, strlen(name));
    bucket = hash % WT_HASH_ARRAY_SIZE;

    __wt_spin_lock(session, &conn->fh_lock);

    /*
     * If we already have the file open, optionally increment the reference
     * count and return a pointer.
     */
    TAILQ_FOREACH(fh, &conn->fhhash[bucket], hashq)
        if (strcmp(name, fh->name) == 0) {
            if (increment_ref)
                ++fh->ref;
            if (fhp != NULL)
                *fhp = fh;
            found = true;
            break;
        }

    /* If we don't find a match, optionally add a new entry. */
    if (!found && newfh != NULL) {
        newfh->name_hash = hash;
        WT_CONN_FILE_INSERT(conn, newfh, bucket);
        (void)__wt_atomic_add32(&conn->open_file_count, 1);

        if (increment_ref)
            ++newfh->ref;
        if (fhp != NULL)
            *fhp = newfh;
    }

    __wt_spin_unlock(session, &conn->fh_lock);

    return (found);
}
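/*
 * Example (not WiredTiger source): __wt_handle_search hashes the name,
 * walks a single bucket under a lock, bumps the reference count on a hit
 * and otherwise inserts the caller-supplied handle.  The standalone
 * sketch below shows the same shape with a toy FNV-1a hash standing in
 * for __wt_hash_city64 and no locking; the types and names here are
 * hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NBUCKETS 64

struct handle {
    const char *name;
    uint64_t name_hash;
    int ref;
    struct handle *next;        /* Bucket chain. */
};

static struct handle *buckets[NBUCKETS];

/* Toy FNV-1a hash of a NUL-terminated name. */
static uint64_t
hash_name(const char *name)
{
    uint64_t h;

    for (h = 14695981039346656037ULL; *name != '\0'; ++name)
        h = (h ^ (uint64_t)(unsigned char)*name) * 1099511628211ULL;
    return (h);
}

/*
 * handle_search --
 *     Return an existing handle (incrementing its reference count), or
 *     insert the caller-supplied one.  Returns true on a match.
 */
static bool
handle_search(const char *name, struct handle *newh, struct handle **hp)
{
    struct handle *h;
    uint64_t bucket, hash;

    hash = hash_name(name);
    bucket = hash % NBUCKETS;

    for (h = buckets[bucket]; h != NULL; h = h->next)
        if (strcmp(name, h->name) == 0) {
            ++h->ref;
            *hp = h;
            return (true);
        }

    /* No match: optionally add the new entry to the bucket. */
    *hp = NULL;
    if (newh != NULL) {
        newh->name_hash = hash;
        newh->ref = 1;
        newh->next = buckets[bucket];
        buckets[bucket] = newh;
        *hp = newh;
    }
    return (false);
}

int
main(void)
{
    struct handle a = { "file:a.wt", 0, 0, NULL }, *found;

    printf("found: %d\n", handle_search("file:a.wt", &a, &found));
    printf("found: %d, ref %d\n",
        handle_search("file:a.wt", NULL, &found), found->ref);
    return (0);
}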
/*
 * __wt_block_compact_skip --
 *     Return if compaction will shrink the file.
 */
int
__wt_block_compact_skip(
    WT_SESSION_IMPL *session, WT_BLOCK *block, int trigger, int *skipp)
{
    WT_EXT *ext;
    WT_EXTLIST *el;
    WT_FH *fh;
    off_t avail, half;
    int pct;

    fh = block->fh;
    *skipp = 1;

    /*
     * We do compaction by copying blocks from the end of the file to the
     * beginning of the file, and we need some metrics to decide if it's
     * worth doing.  Ignore small files, and files where we are unlikely
     * to recover the specified percentage of the file.  (The calculation
     * is if at least N % of the file appears in the available list, and
     * in the first half of the file.  In other words, don't bother with
     * compaction unless we have an expectation of moving N % of the file
     * from the last half of the file to the first half of the file.)
     */
    if (fh->size <= 10 * 1024)
        return (0);

    __wt_spin_lock(session, &block->live_lock);
    avail = 0;
    half = fh->size / 2;

    el = &block->live.avail;
    WT_EXT_FOREACH(ext, el->off)
        if (ext->off < half)
            avail += ext->size;
    pct = (int)((avail * 100) / fh->size);
    __wt_spin_unlock(session, &block->live_lock);

    if (pct >= trigger)
        *skipp = 0;

    WT_VERBOSE_RET(session, block,
        "%s: compaction %s, %d%% of the free space in the available "
        "list appears in the first half of the file",
        block->name, pct < trigger ? "skipped" : "proceeding", pct);

    return (0);
}
/*
 * __wt_connection_destroy --
 *     Destroy the connection's underlying WT_CONNECTION_IMPL structure.
 */
int
__wt_connection_destroy(WT_CONNECTION_IMPL *conn)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    /* Check there's something to destroy. */
    if (conn == NULL)
        return (0);

    session = conn->default_session;

    /*
     * Close remaining open files (before discarding the mutex: the
     * underlying file-close code uses the mutex to guard lists of
     * open files).
     */
    if (conn->lock_fh != NULL)
        WT_TRET(__wt_close(session, conn->lock_fh));

    if (conn->log_fh != NULL)
        WT_TRET(__wt_close(session, conn->log_fh));

    /* Remove from the list of connections. */
    __wt_spin_lock(session, &__wt_process.spinlock);
    TAILQ_REMOVE(&__wt_process.connqh, conn, q);
    __wt_spin_unlock(session, &__wt_process.spinlock);

    /* Configuration */
    __wt_conn_config_discard(session);        /* configuration */
    __wt_conn_foc_discard(session);           /* free-on-close */

    __wt_spin_destroy(session, &conn->api_lock);
    __wt_spin_destroy(session, &conn->block_lock);
    __wt_spin_destroy(session, &conn->checkpoint_lock);
    __wt_spin_destroy(session, &conn->fh_lock);
    __wt_spin_destroy(session, &conn->hot_backup_lock);
    __wt_spin_destroy(session, &conn->schema_lock);
    __wt_spin_destroy(session, &conn->serial_lock);

    /* Free allocated memory. */
    __wt_free(session, conn->home);
    __wt_free(session, conn->error_prefix);
    __wt_free(session, conn->sessions);

    __wt_free(NULL, conn);
    return (ret);
}
/*
 * __wt_close --
 *     Close a file handle.
 */
int
__wt_close(WT_SESSION_IMPL *session, WT_FH **fhp)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *fh;
    uint64_t bucket;

    conn = S2C(session);

    if (*fhp == NULL)
        return (0);
    fh = *fhp;
    *fhp = NULL;

    __wt_spin_lock(session, &conn->fh_lock);
    if (fh == NULL || fh->ref == 0 || --fh->ref > 0) {
        __wt_spin_unlock(session, &conn->fh_lock);
        return (0);
    }

    /* Remove from the list. */
    bucket = fh->name_hash % WT_HASH_ARRAY_SIZE;
    WT_CONN_FILE_REMOVE(conn, fh, bucket);
    (void)WT_ATOMIC_SUB4(conn->open_file_count, 1);

    __wt_spin_unlock(session, &conn->fh_lock);

    /*
     * Discard the memory.
     * Note: For directories, we do not open valid directory handles on
     * windows since it is not possible to sync a directory.
     */
    if (fh->filehandle != INVALID_HANDLE_VALUE &&
        CloseHandle(fh->filehandle) == 0) {
        ret = __wt_errno();
        __wt_err(session, ret, "CloseHandle: %s", fh->name);
    }

    if (fh->filehandle_secondary != INVALID_HANDLE_VALUE &&
        CloseHandle(fh->filehandle_secondary) == 0) {
        ret = __wt_errno();
        __wt_err(session, ret, "CloseHandle: secondary: %s", fh->name);
    }

    __wt_free(session, fh->name);
    __wt_free(session, fh);
    return (ret);
}
/*
 * __wt_block_compact_end --
 *     End compaction of a file.
 */
int
__wt_block_compact_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    /*
     * Restore the previous allocation plan.
     *
     * We don't need the lock, but it's not a performance question and
     * might avoid bugs in the future.
     */
    __wt_spin_lock(session, &block->live_lock);
    block->allocfirst = block->allocfirst_save;
    __wt_spin_unlock(session, &block->live_lock);

    return (0);
}
/*
 * __conn_load_extension --
 *     WT_CONNECTION->load_extension method.
 */
static int
__conn_load_extension(
    WT_CONNECTION *wt_conn, const char *path, const char *config)
{
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_DLH *dlh;
    WT_SESSION_IMPL *session;
    int (*entry)(WT_SESSION *, WT_EXTENSION_API *, const char *);
    const char *entry_name;

    dlh = NULL;
    conn = (WT_CONNECTION_IMPL *)wt_conn;
    CONNECTION_API_CALL(conn, session, load_extension, config, cfg);

    entry_name = NULL;
    WT_ERR(__wt_config_gets(session, cfg, "entry", &cval));
    WT_ERR(__wt_strndup(session, cval.str, cval.len, &entry_name));

    /*
     * This assumes the underlying shared libraries are reference counted,
     * that is, that re-opening a shared library simply increments a ref
     * count, and closing it simply decrements the ref count, and the last
     * close discards the reference entirely -- in other words, we do not
     * check to see if we've already opened this shared library.
     */
    WT_ERR(__wt_dlopen(session, path, &dlh));
    WT_ERR(__wt_dlsym(session, dlh, entry_name, &entry));

    /* Call the entry function. */
    WT_ERR(entry(&session->iface, &__api, config));

    /* Link onto the environment's list of open libraries. */
    __wt_spin_lock(session, &conn->api_lock);
    TAILQ_INSERT_TAIL(&conn->dlhqh, dlh, q);
    __wt_spin_unlock(session, &conn->api_lock);

    if (0) {
err:        if (dlh != NULL)
            WT_TRET(__wt_dlclose(session, dlh));
    }
    __wt_free(session, entry_name);

    API_END_NOTFOUND_MAP(session, ret);
}
/*
 * __wt_close --
 *     Close a file handle.
 */
int
__wt_close(WT_SESSION_IMPL *session, WT_FH **fhp)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *fh;
    uint64_t bucket;

    conn = S2C(session);

    if (*fhp == NULL)
        return (0);
    fh = *fhp;
    *fhp = NULL;

    /* Track handle-close as a file operation, so open and close match. */
    WT_RET(__wt_verbose(
        session, WT_VERB_FILEOPS, "%s: handle-close", fh->name));

    /*
     * If the reference count hasn't gone to 0, or if it's an in-memory
     * object, we're done.
     *
     * Assert the reference count is correct, but don't let it wrap.
     */
    __wt_spin_lock(session, &conn->fh_lock);
    WT_ASSERT(session, fh->ref > 0);
    if ((fh->ref > 0 && --fh->ref > 0) || F_ISSET(fh, WT_FH_IN_MEMORY)) {
        __wt_spin_unlock(session, &conn->fh_lock);
        return (0);
    }

    /* Remove from the list. */
    bucket = fh->name_hash % WT_HASH_ARRAY_SIZE;
    WT_CONN_FILE_REMOVE(conn, fh, bucket);
    (void)__wt_atomic_sub32(&conn->open_file_count, 1);

    __wt_spin_unlock(session, &conn->fh_lock);

    /* Discard underlying resources. */
    ret = fh->fh_close(session, fh);

    __wt_free(session, fh->name);
    __wt_free(session, fh);
    return (ret);
}
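/*
 * Example (not WiredTiger source): closing is the mirror image of the
 * handle-cache lookup, decrement the reference count under the same lock
 * and only tear the handle down when the count reaches zero.  A minimal
 * POSIX-threads sketch of that discipline; the "handle" type and the
 * "handle_close" name are hypothetical.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct handle {
    char *name;
    int ref;
};

static pthread_mutex_t handle_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * handle_close --
 *     Drop one reference; only the last closer frees the handle.
 *     Returns 1 if the handle was destroyed, 0 otherwise.
 */
static int
handle_close(struct handle *h)
{
    pthread_mutex_lock(&handle_lock);
    if (--h->ref > 0) {                /* Still referenced elsewhere. */
        pthread_mutex_unlock(&handle_lock);
        return (0);
    }
    pthread_mutex_unlock(&handle_lock);

    /* Last reference: discard underlying resources. */
    free(h->name);
    free(h);
    return (1);
}

int
main(void)
{
    struct handle *h;

    if ((h = malloc(sizeof(*h))) == NULL)
        return (1);
    h->name = strdup("file:example.wt");
    h->ref = 2;                        /* Two openers. */

    printf("destroyed: %d\n", handle_close(h));    /* 0 */
    printf("destroyed: %d\n", handle_close(h));    /* 1 */
    return (0);
}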
/*
 * __wt_block_compact_start --
 *     Start compaction of a file.
 */
int
__wt_block_compact_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    /*
     * Save the current allocation plan, switch to first-fit allocation.
     *
     * We don't need the lock, but it's not a performance question and
     * might avoid bugs in the future.
     */
    __wt_spin_lock(session, &block->live_lock);
    block->allocfirst_save = block->allocfirst;
    block->allocfirst = 1;
    __wt_spin_unlock(session, &block->live_lock);

    return (0);
}
/*
 * __backup_stop --
 *     Stop a backup.
 */
static int
__backup_stop(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    conn = S2C(session);

    /* Remove any backup specific file. */
    ret = __backup_file_remove(session);

    /* Checkpoint deletion can proceed, as can the next hot backup. */
    __wt_spin_lock(session, &conn->hot_backup_lock);
    conn->hot_backup = 0;
    __wt_spin_unlock(session, &conn->hot_backup_lock);

    return (ret);
}
/*
 * __wt_las_cursor --
 *     Return a lookaside cursor.
 */
int
__wt_las_cursor(
    WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    *cursorp = NULL;

    /*
     * We don't want to get tapped for eviction after we start using the
     * lookaside cursor; save a copy of the current eviction state, we'll
     * turn eviction off before we return.
     *
     * Don't cache lookaside table pages, we're here because of eviction
     * problems and there's no reason to believe lookaside pages will be
     * useful more than once.
     */
    *session_flags =
        F_ISSET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);

    conn = S2C(session);

    /* Eviction and sweep threads have their own lookaside table cursors. */
    if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) {
        if (session->las_cursor == NULL) {
            WT_WITHOUT_DHANDLE(session, ret =
                __las_cursor_create(session, &session->las_cursor));
            WT_RET(ret);
        }

        *cursorp = session->las_cursor;
    } else {
        /* Lock the shared lookaside cursor. */
        __wt_spin_lock(session, &conn->las_lock);

        *cursorp = conn->las_cursor;
    }

    /* Turn caching and eviction off. */
    F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);

    return (0);
}
/*
 * __wt_block_stat --
 *     Block statistics
 */
void
__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_DSRC_STATS *stats)
{
    /*
     * We're looking inside the live system's structure, which normally
     * requires locking: the chances of a corrupted read are probably
     * non-existent, and it's statistics information regardless, but it
     * isn't like this is a common function for an application to call.
     */
    __wt_spin_lock(session, &block->live_lock);
    WT_STAT_SET(stats, allocation_size, block->allocsize);
    WT_STAT_SET(stats, block_checkpoint_size, block->live.ckpt_size);
    WT_STAT_SET(stats, block_magic, WT_BLOCK_MAGIC);
    WT_STAT_SET(stats, block_major, WT_BLOCK_MAJOR_VERSION);
    WT_STAT_SET(stats, block_minor, WT_BLOCK_MINOR_VERSION);
    WT_STAT_SET(stats, block_reuse_bytes, block->live.avail.bytes);
    WT_STAT_SET(stats, block_size, block->fh->size);
    __wt_spin_unlock(session, &block->live_lock);
}
/*
 * __wt_connection_destroy --
 *     Destroy the connection's underlying WT_CONNECTION_IMPL structure.
 */
void
__wt_connection_destroy(WT_CONNECTION_IMPL *conn)
{
    WT_SESSION_IMPL *session;

    /* Check there's something to destroy. */
    if (conn == NULL)
        return;

    session = conn->default_session;

    /* Remove from the list of connections. */
    __wt_spin_lock(session, &__wt_process.spinlock);
    TAILQ_REMOVE(&__wt_process.connqh, conn, q);
    __wt_spin_unlock(session, &__wt_process.spinlock);

    /* Configuration */
    __wt_conn_config_discard(session);        /* configuration */
    __wt_conn_foc_discard(session);           /* free-on-close */

    __wt_spin_destroy(session, &conn->api_lock);
    __wt_spin_destroy(session, &conn->block_lock);
    __wt_spin_destroy(session, &conn->checkpoint_lock);
    __wt_rwlock_destroy(session, &conn->dhandle_lock);
    __wt_spin_destroy(session, &conn->encryptor_lock);
    __wt_spin_destroy(session, &conn->fh_lock);
    __wt_rwlock_destroy(session, &conn->hot_backup_lock);
    __wt_spin_destroy(session, &conn->metadata_lock);
    __wt_spin_destroy(session, &conn->reconfig_lock);
    __wt_spin_destroy(session, &conn->schema_lock);
    __wt_rwlock_destroy(session, &conn->table_lock);
    __wt_spin_destroy(session, &conn->turtle_lock);

    /* Free allocated memory. */
    __wt_free(session, conn->cfg);
    __wt_free(session, conn->home);
    __wt_free(session, conn->error_prefix);
    __wt_free(session, conn->sessions);
    __wt_stat_connection_discard(session, conn);

    __wt_free(NULL, conn);
}
/*
 * __wt_close --
 *     Close a file handle.
 */
int
__wt_close(WT_SESSION_IMPL *session, WT_FH *fh)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    conn = S2C(session);

    __wt_spin_lock(session, &conn->fh_lock);
    if (fh == NULL || fh->ref == 0 || --fh->ref > 0) {
        __wt_spin_unlock(session, &conn->fh_lock);
        return (0);
    }

    /* Remove from the list. */
    TAILQ_REMOVE(&conn->fhqh, fh, q);
    WT_STAT_FAST_CONN_DECR(session, file_open);

    __wt_spin_unlock(session, &conn->fh_lock);

    /*
     * Discard the memory.
     * Note: For directories, we do not open valid directory handles on
     * windows since it is not possible to sync a directory.
     */
    if (fh->filehandle != INVALID_HANDLE_VALUE &&
        CloseHandle(fh->filehandle) == 0) {
        ret = __wt_errno();
        __wt_err(session, ret, "CloseHandle: %s", fh->name);
    }

    if (fh->filehandle_secondary != INVALID_HANDLE_VALUE &&
        CloseHandle(fh->filehandle_secondary) == 0) {
        ret = __wt_errno();
        __wt_err(session, ret, "CloseHandle: secondary: %s", fh->name);
    }

    __wt_free(session, fh->name);
    __wt_free(session, fh);
    return (ret);
}
/*
 * __wt_block_checkpoint_start --
 *     Start a checkpoint.
 */
int
__wt_block_checkpoint_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    WT_DECL_RET;

    __wt_spin_lock(session, &block->live_lock);
    switch (block->ckpt_state) {
    case WT_CKPT_INPROGRESS:
    case WT_CKPT_PANIC_ON_FAILURE:
    case WT_CKPT_SALVAGE:
        __wt_err(session, EINVAL,
            "%s: an unexpected checkpoint start: the checkpoint "
            "has already started or was configured for salvage",
            block->name);
        ret = __wt_block_panic(session);
        break;
    case WT_CKPT_NONE:
        block->ckpt_state = WT_CKPT_INPROGRESS;
        break;
    }
    __wt_spin_unlock(session, &block->live_lock);
    return (ret);
}
/*
 * __wt_optrack_record_funcid --
 *     Allocate and record optrack function ID.
 */
void
__wt_optrack_record_funcid(
    WT_SESSION_IMPL *session, const char *func, uint16_t *func_idp)
{
    static uint16_t optrack_uid = 0;    /* Unique for the process lifetime. */
    WT_CONNECTION_IMPL *conn;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    wt_off_t fsize;
    bool locked;

    conn = S2C(session);
    locked = false;

    WT_ERR(__wt_scr_alloc(session, strlen(func) + 32, &tmp));

    __wt_spin_lock(session, &conn->optrack_map_spinlock);
    locked = true;
    if (*func_idp == 0) {
        *func_idp = ++optrack_uid;

        WT_ERR(__wt_buf_fmt(
            session, tmp, "%" PRIu16 " %s\n", *func_idp, func));
        WT_ERR(__wt_filesize(session, conn->optrack_map_fh, &fsize));
        WT_ERR(__wt_write(session,
            conn->optrack_map_fh, fsize, tmp->size, tmp->data));
    }

    if (0) {
err:        WT_PANIC_MSG(session, ret,
            "operation tracking initialization failure");
    }

    if (locked)
        __wt_spin_unlock(session, &conn->optrack_map_spinlock);
    __wt_scr_free(session, &tmp);
}
/*
 * __wt_block_snapshot_resolve --
 *     Resolve a snapshot.
 */
int
__wt_block_snapshot_resolve(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase)
{
    WT_BLOCK_SNAPSHOT *si;
    WT_DECL_RET;

    si = &block->live;

    /*
     * Snapshots are a two-step process: first, we write a new snapshot to
     * disk (including all the new extent lists for modified snapshots and
     * the live system).  As part of this we create a list of file blocks
     * newly available for re-allocation, based on snapshots being deleted.
     * We then return the locations of the new snapshot information to our
     * caller.  Our caller has to write that information into some kind of
     * stable storage, and once that's done, we can actually allocate from
     * that list of newly available file blocks.  (We can't allocate from
     * that list immediately because the allocation might happen before our
     * caller saves the new snapshot information, and if we crashed before
     * the new snapshot information was saved, we'd have overwritten blocks
     * still referenced by snapshots in the system.)  In summary, there is
     * a second step: after our caller saves the snapshot information, we
     * are called to add the newly available blocks into the live system's
     * available list.
     */
    __wt_spin_lock(session, &block->live_lock);
    ret = __wt_block_extlist_merge(
        session, &si->snapshot_avail, &si->avail);
    __wt_spin_unlock(session, &block->live_lock);

    /* Discard the list. */
    __wt_block_extlist_free(session, &si->snapshot_avail);

    WT_UNUSED(snapbase);
    return (ret);
}
/*
 * __wt_block_ckpt_init --
 *     Initialize a checkpoint structure.
 */
int
__wt_block_ckpt_init(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_BLOCK_CKPT *ci, const char *name, int is_live)
{
    WT_DECL_RET;

    /*
     * If we're loading a new live checkpoint, there shouldn't be one
     * already loaded.  The btree engine should prevent this from ever
     * happening, but paranoia is a healthy thing.
     */
    if (is_live) {
        __wt_spin_lock(session, &block->live_lock);
        if (block->live_load)
            ret = EINVAL;
        else
            block->live_load = 1;
        __wt_spin_unlock(session, &block->live_lock);
        if (ret != 0)
            WT_RET_MSG(session, EINVAL, "checkpoint already loaded");
    }

    memset(ci, 0, sizeof(*ci));

    ci->root_offset = WT_BLOCK_INVALID_OFFSET;

    WT_RET(__wt_block_extlist_init(session, &ci->alloc, name, "alloc"));
    WT_RET(__wt_block_extlist_init(session, &ci->avail, name, "avail"));
    WT_RET(__wt_block_extlist_init(session, &ci->discard, name, "discard"));

    ci->file_size = WT_BLOCK_DESC_SECTOR;
    WT_RET(__wt_block_extlist_init(
        session, &ci->ckpt_avail, name, "ckpt_avail"));

    return (0);
}
/*
 * __wt_block_compact_page_skip --
 *     Return if writing a particular page will shrink the file.
 */
int
__wt_block_compact_page_skip(WT_SESSION_IMPL *session, WT_BLOCK *block,
    const uint8_t *addr, size_t addr_size, int *skipp)
{
    WT_DECL_RET;
    WT_EXT *ext;
    WT_EXTLIST *el;
    WT_FH *fh;
    off_t ninety, offset;
    uint32_t size, cksum;

    WT_UNUSED(addr_size);

    *skipp = 1;                /* Return a default skip. */

    fh = block->fh;

    /* Crack the cookie. */
    WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));

    __wt_spin_lock(session, &block->live_lock);

    /*
     * If this block is in the last 10% of the file and there's a block on
     * the available list that's in the first 90% of the file, rewrite the
     * block.  Checking the available list is necessary (otherwise writing
     * the block would extend the file), but there's an obvious race if the
     * file is sufficiently busy.
     */
    ninety = fh->size - fh->size / 10;
    if (offset > ninety) {
        el = &block->live.avail;
        WT_EXT_FOREACH(ext, el->off)
            if (ext->off < ninety && ext->size >= size) {
                *skipp = 0;
                break;
            }
    }

    __wt_spin_unlock(session, &block->live_lock);
    return (ret);
}
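/*
 * Example (not WiredTiger source): the address cookie cracked above packs
 * a block's offset, size and checksum into a compact byte string.  The
 * sketch below uses fixed-width host-order fields to keep it short; the
 * production encoding is variable-length, so this is illustrative only
 * and the "addr_pack"/"addr_unpack" names are hypothetical.
 */
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

/* Pack a block address (offset, size, checksum) into a 16-byte cookie. */
static void
addr_pack(uint8_t *buf, uint64_t offset, uint32_t size, uint32_t cksum)
{
    memcpy(buf, &offset, sizeof(offset));
    memcpy(buf + 8, &size, sizeof(size));
    memcpy(buf + 12, &cksum, sizeof(cksum));
}

/* Crack the cookie back into its components. */
static void
addr_unpack(const uint8_t *buf,
    uint64_t *offsetp, uint32_t *sizep, uint32_t *cksump)
{
    memcpy(offsetp, buf, sizeof(*offsetp));
    memcpy(sizep, buf + 8, sizeof(*sizep));
    memcpy(cksump, buf + 12, sizeof(*cksump));
}

int
main(void)
{
    uint8_t cookie[16];
    uint64_t offset;
    uint32_t cksum, size;

    addr_pack(cookie, 8192, 4096, 0xdeadbeef);
    addr_unpack(cookie, &offset, &size, &cksum);
    printf("offset %" PRIu64 " size %" PRIu32 " cksum %#" PRIx32 "\n",
        offset, size, cksum);
    return (0);
}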
/*
 * __wt_las_cursor --
 *     Return a lookaside cursor.
 */
void
__wt_las_cursor(
    WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags)
{
    WT_CONNECTION_IMPL *conn;

    *cursorp = NULL;

    /*
     * We don't want to get tapped for eviction after we start using the
     * lookaside cursor; save a copy of the current eviction state, we'll
     * turn eviction off before we return.
     *
     * Don't cache lookaside table pages, we're here because of eviction
     * problems and there's no reason to believe lookaside pages will be
     * useful more than once.
     */
    *session_flags =
        F_MASK(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);

    conn = S2C(session);

    /*
     * Some threads have their own lookaside table cursors, else lock the
     * shared lookaside cursor.
     */
    if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR))
        *cursorp = session->las_cursor;
    else {
        __wt_spin_lock(session, &conn->las_lock);
        *cursorp = conn->las_session->las_cursor;
    }

    /* Turn caching and eviction off. */
    F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
}
/*
 * __wt_block_close --
 *     Close a block handle.
 */
int
__wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    if (block == NULL)        /* Safety check */
        return (0);

    conn = S2C(session);

    WT_TRET(__wt_verbose(session, WT_VERB_BLOCK,
        "close: %s", block->name == NULL ? "" : block->name));

    __wt_spin_lock(session, &conn->block_lock);

    /* Reference count is initialized to 1. */
    if (block->ref == 0 || --block->ref == 0)
        WT_TRET(__block_destroy(session, block));

    __wt_spin_unlock(session, &conn->block_lock);

    return (ret);
}
/*
 * __wt_log_wrlsn --
 *     Process written log slots and attempt to coalesce them if the LSNs
 *     are contiguous.  The purpose of this function is to advance the
 *     write_lsn in LSN order after the buffer is written to the log file.
 */
int
__wt_log_wrlsn(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL];
    WT_LOGSLOT *coalescing, *slot;
    WT_LSN save_lsn;
    size_t written_i;
    uint32_t i, save_i;

    conn = S2C(session);
    log = conn->log;
    __wt_spin_lock(session, &log->log_writelsn_lock);
restart:
    coalescing = NULL;
    WT_INIT_LSN(&save_lsn);
    written_i = 0;
    i = 0;

    /*
     * Walk the array once saving any slots that are in the
     * WT_LOG_SLOT_WRITTEN state.
     */
    while (i < WT_SLOT_POOL) {
        save_i = i;
        slot = &log->slot_pool[i++];
        /*
         * XXX - During debugging I saw slot 0 become orphaned.
         * I believe it is fixed, but check for now.
         * This assertion should catch that.
         */
        if (slot->slot_state == 0)
            WT_ASSERT(session,
                slot->slot_release_lsn.file >= log->write_lsn.file);
        if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
            continue;
        written[written_i].slot_index = save_i;
        written[written_i++].lsn = slot->slot_release_lsn;
    }
    /*
     * If we found any written slots process them.  We sort them
     * based on the release LSN, and then look for them in order.
     */
    if (written_i > 0) {
        WT_INSERTION_SORT(written, written_i,
            WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);
        /*
         * We know the written array is sorted by LSN.  Go
         * through them either advancing write_lsn or coalesce
         * contiguous ranges of written slots.
         */
        for (i = 0; i < written_i; i++) {
            slot = &log->slot_pool[written[i].slot_index];
            /*
             * The log server thread pushes out slots periodically.
             * Sometimes they are empty slots.  If we find an
             * empty slot, where empty means the start and end LSN
             * are the same, free it and continue.
             */
            if (__wt_log_cmp(&slot->slot_start_lsn,
                &slot->slot_release_lsn) == 0 &&
                __wt_log_cmp(&slot->slot_start_lsn,
                &slot->slot_end_lsn) == 0) {
                __wt_log_slot_free(session, slot);
                continue;
            }
            if (coalescing != NULL) {
                /*
                 * If the write_lsn changed, we may be able to
                 * process slots.  Try again.
                 */
                if (__wt_log_cmp(
                    &log->write_lsn, &save_lsn) != 0)
                    goto restart;
                if (__wt_log_cmp(&coalescing->slot_end_lsn,
                    &written[i].lsn) != 0) {
                    coalescing = slot;
                    continue;
                }
                /*
                 * If we get here we have a slot to coalesce
                 * and free.
                 */
                coalescing->slot_last_offset =
                    slot->slot_last_offset;
                coalescing->slot_end_lsn = slot->slot_end_lsn;
                WT_STAT_FAST_CONN_INCR(
                    session, log_slot_coalesced);
                /* Copy the flag for later closing. */
                if (F_ISSET(slot, WT_SLOT_CLOSEFH))
                    F_SET(coalescing, WT_SLOT_CLOSEFH);
            } else {
                /*
                 * If this written slot is not the next LSN,
                 * try to start coalescing with later slots.
                 * A synchronous write may update write_lsn
                 * so save the last one we saw to check when
                 * coalescing slots.
                 */
                save_lsn = log->write_lsn;
                if (__wt_log_cmp(
                    &log->write_lsn, &written[i].lsn) != 0) {
                    coalescing = slot;
                    continue;
                }
                /*
                 * If we get here we have a slot to process.
                 * Advance the LSN and process the slot.
                 */
                WT_ASSERT(session, __wt_log_cmp(&written[i].lsn,
                    &slot->slot_release_lsn) == 0);
                if (slot->slot_start_lsn.offset !=
                    slot->slot_last_offset)
                    slot->slot_start_lsn.offset =
                        slot->slot_last_offset;
                log->write_start_lsn = slot->slot_start_lsn;
                log->write_lsn = slot->slot_end_lsn;
                WT_ERR(__wt_cond_signal(
                    session, log->log_write_cond));
                WT_STAT_FAST_CONN_INCR(session, log_write_lsn);
                /* Signal the close thread if needed. */
                if (F_ISSET(slot, WT_SLOT_CLOSEFH))
                    WT_ERR(__wt_cond_signal(
                        session, conn->log_file_cond));
            }
            __wt_log_slot_free(session, slot);
        }
    }
err:    __wt_spin_unlock(session, &log->log_writelsn_lock);
    return (ret);
}
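/*
 * Example (not WiredTiger source): the coalescing above hinges on an
 * ordered comparison of LSNs (file number, then offset) and on merging a
 * written range whose start matches the end of the range being built.
 * A standalone sketch of that idea; the "lsn" type and the sample
 * numbers are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

/* An LSN: a log file number and an offset within that file. */
struct lsn {
    uint32_t file;
    uint64_t offset;
};

/* Order LSNs by file, then offset, in the style of __wt_log_cmp. */
static int
lsn_cmp(const struct lsn *a, const struct lsn *b)
{
    if (a->file != b->file)
        return (a->file < b->file ? -1 : 1);
    if (a->offset != b->offset)
        return (a->offset < b->offset ? -1 : 1);
    return (0);
}

int
main(void)
{
    /* Two written ranges [start, end), sorted by start LSN. */
    struct lsn a_start = { 1, 0 }, a_end = { 1, 512 };
    struct lsn b_start = { 1, 512 }, b_end = { 1, 1024 };

    /* Coalesce b into a when b starts exactly where a ends. */
    if (lsn_cmp(&a_end, &b_start) == 0) {
        a_end = b_end;
        printf("coalesced: [%u/%llu, %u/%llu)\n",
            a_start.file, (unsigned long long)a_start.offset,
            a_end.file, (unsigned long long)a_end.offset);
    }
    return (0);
}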
/*
 * __log_file_server --
 *     The log file server thread.  This worker thread manages
 *     log file operations such as closing and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *close_fh;
    WT_LOG *log;
    WT_LSN close_end_lsn, min_lsn;
    WT_SESSION_IMPL *session;
    uint32_t filenum;
    int locked;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    locked = 0;
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * If there is a log file to close, make sure any outstanding
         * write operations have completed, then fsync and close it.
         */
        if ((close_fh = log->log_close_fh) != NULL) {
            WT_ERR(__wt_log_extract_lognum(
                session, close_fh->name, &filenum));
            /*
             * We update the close file handle before updating the
             * close LSN when changing files.  It is possible we
             * could see mismatched settings.  If we do, yield
             * until it is set.  This should rarely happen.
             */
            while (log->log_close_lsn.file < filenum)
                __wt_yield();

            if (__wt_log_cmp(
                &log->write_lsn, &log->log_close_lsn) >= 0) {
                /*
                 * We've copied the file handle, clear out the
                 * one in the log structure to allow it to be
                 * set again.  Copy the LSN before clearing
                 * the file handle.
                 * Use a barrier to make sure the compiler does
                 * not reorder the following two statements.
                 */
                close_end_lsn = log->log_close_lsn;
                WT_FULL_BARRIER();
                log->log_close_fh = NULL;
                /*
                 * Set the close_end_lsn to the LSN immediately
                 * after ours.  That is, the beginning of the
                 * next log file.  We need to know the LSN
                 * file number of our own close in case earlier
                 * calls are still in progress and the next one
                 * to move the sync_lsn into the next file for
                 * later syncs.
                 */
                close_end_lsn.file++;
                close_end_lsn.offset = 0;
                WT_ERR(__wt_fsync(session, close_fh));
                __wt_spin_lock(session, &log->log_sync_lock);
                locked = 1;
                WT_ERR(__wt_close(session, &close_fh));
                WT_ASSERT(session, __wt_log_cmp(
                    &close_end_lsn, &log->sync_lsn) >= 0);
                log->sync_lsn = close_end_lsn;
                WT_ERR(__wt_cond_signal(
                    session, log->log_sync_cond));
                locked = 0;
                __wt_spin_unlock(session, &log->log_sync_lock);
            }
        }
        /*
         * If a later thread asked for a background sync, do it now.
         */
        if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
            /*
             * Save the latest write LSN which is the minimum
             * we will have written to disk.
             */
            min_lsn = log->write_lsn;
            /*
             * We have to wait until the LSN we asked for is
             * written.  If it isn't signal the wrlsn thread
             * to get it written.
             */
            if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
                WT_ERR(__wt_fsync(session, log->log_fh));
                __wt_spin_lock(session, &log->log_sync_lock);
                locked = 1;
                /*
                 * The sync LSN could have advanced while we
                 * were writing to disk.
                 */
                if (__wt_log_cmp(
                    &log->sync_lsn, &min_lsn) <= 0) {
                    log->sync_lsn = min_lsn;
                    WT_ERR(__wt_cond_signal(
                        session, log->log_sync_cond));
                }
                locked = 0;
                __wt_spin_unlock(session, &log->log_sync_lock);
            } else {
                WT_ERR(__wt_cond_signal(
                    session, conn->log_wrlsn_cond));
                /*
                 * We do not want to wait potentially a second
                 * to process this.  Yield to give the wrlsn
                 * thread a chance to run and try again in
                 * this case.
                 */
                __wt_yield();
                continue;
            }
        }
        /* Wait until the next event. */
        WT_ERR(__wt_cond_wait(
            session, conn->log_file_cond, WT_MILLION));
    }

    if (0) {
err:        __wt_err(session, ret, "log close server error");
    }
    if (locked)
        __wt_spin_unlock(session, &log->log_sync_lock);
    return (WT_THREAD_RET_VALUE);
}
/*
 * __wt_block_checkpoint_load --
 *     Load a checkpoint.
 */
int
__wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
    const uint8_t *addr, size_t addr_size,
    uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint)
{
    WT_BLOCK_CKPT *ci, _ci;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    uint8_t *endp;

    ci = NULL;

    /*
     * Sometimes we don't find a root page (we weren't given a checkpoint,
     * or the checkpoint was empty).  In that case we return an empty root
     * address, set that up now.
     */
    *root_addr_sizep = 0;

#ifdef HAVE_VERBOSE
    if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
        if (addr != NULL) {
            WT_ERR(__wt_scr_alloc(session, 0, &tmp));
            WT_ERR(__ckpt_string(session, block, addr, tmp));
        }
        __wt_verbose(session, WT_VERB_CHECKPOINT,
            "%s: load-checkpoint: %s", block->name,
            addr == NULL ? "[Empty]" : (const char *)tmp->data);
    }
#endif

    /*
     * There's a single checkpoint in the file that can be written, all of
     * the others are read-only.  We use the same initialization calls for
     * readonly checkpoints, but the information doesn't persist.
     */
    if (checkpoint) {
        ci = &_ci;
        WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
    } else {
        /*
         * We depend on the btree level for locking: things will go bad
         * fast if we open the live system in two handles, or salvage,
         * truncate or verify the live/running file.
         */
#ifdef HAVE_DIAGNOSTIC
        __wt_spin_lock(session, &block->live_lock);
        WT_ASSERT(session, block->live_open == false);
        block->live_open = true;
        __wt_spin_unlock(session, &block->live_lock);
#endif
        ci = &block->live;
        WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
    }

    /*
     * If the checkpoint has an on-disk root page, load it.  Otherwise,
     * size the file past the description information.
     */
    if (addr == NULL || addr_size == 0)
        ci->file_size = block->allocsize;
    else {
        /* Crack the checkpoint cookie. */
        WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));

        /* Verify sets up next. */
        if (block->verify)
            WT_ERR(__wt_verify_ckpt_load(session, block, ci));

        /* Read any root page. */
        if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
            endp = root_addr;
            WT_ERR(__wt_block_addr_to_buffer(block, &endp,
                ci->root_offset, ci->root_size, ci->root_checksum));
            *root_addr_sizep = WT_PTRDIFF(endp, root_addr);
        }

        /*
         * Rolling a checkpoint forward requires the avail list, the
         * blocks from which we can allocate.
         */
        if (!checkpoint)
            WT_ERR(__wt_block_extlist_read_avail(
                session, block, &ci->avail, ci->file_size));
    }

    /*
     * If the checkpoint can be written, that means anything written after
     * the checkpoint is no longer interesting, truncate the file.  Don't
     * bother checking the avail list for a block at the end of the file,
     * that was done when the checkpoint was first written (re-writing the
     * checkpoint might possibly make it relevant here, but it's unlikely
     * enough I don't bother).
     */
    if (!checkpoint)
        WT_ERR(__wt_block_truncate(session, block, ci->file_size));

    if (0) {
err:        /*
         * Don't call checkpoint-unload: unload does real work including
         * file truncation.  If we fail early enough that the checkpoint
         * information isn't correct, bad things would happen.  The only
         * allocated memory was in the service of verify, clean that up.
         */
        if (block->verify)
            WT_TRET(__wt_verify_ckpt_unload(session, block));
    }

    /* Checkpoints don't need the original information, discard it. */
    if (checkpoint && ci != NULL)
        __wt_block_ckpt_destroy(session, ci);

    __wt_scr_free(session, &tmp);

    return (ret);
}