Example #1
/*
 * __wt_curstat_cache_walk --
 *	Initialize the statistics for a cache walk pass.
 */
void
__wt_curstat_cache_walk(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_PAGE_INDEX *root_idx;

	btree = S2BT(session);
	conn = S2C(session);

	/* Set statistics that don't require walking the cache. */
	WT_STAT_DATA_SET(session,
	    cache_state_gen_current, conn->cache->evict_pass_gen);

	/* Root page statistics */
	root_idx = WT_INTL_INDEX_GET_SAFE(btree->root.page);
	WT_STAT_DATA_SET(session,
	    cache_state_root_entries, root_idx->entries);
	WT_STAT_DATA_SET(session,
	    cache_state_root_size, btree->root.page->memory_footprint);

	WT_WITH_HANDLE_LIST_LOCK(session, __evict_stat_walk(session));
}
Example #2
/*
 * __backup_log_append --
 *	Append log files needed for backup.
 */
static int
__backup_log_append(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, int active)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	u_int i, logcount;
	char **logfiles;

	conn = S2C(session);
	logfiles = NULL;
	logcount = 0;

	if (conn->log) {
		WT_ERR(__wt_log_get_all_files(
		    session, &logfiles, &logcount, &cb->maxid, active));
		for (i = 0; i < logcount; i++)
			WT_ERR(__backup_list_append(session, cb, logfiles[i]));
	}
err:	if (logfiles != NULL)
		__wt_log_files_free(session, logfiles, logcount);
	return (ret);
}
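
The WT_DECL_RET / WT_ERR / err-label pattern above is the error-handling idiom used throughout these examples: any failing call jumps to the cleanup label, and the cleanup runs on both the success and failure paths. A minimal self-contained sketch, assuming simplified stand-in macros (the real definitions live in WiredTiger's error-handling headers and carry more logic):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-ins for WiredTiger's error macros. */
#define	WT_DECL_RET	int ret = 0
#define	WT_ERR(a)	do {						\
	if ((ret = (a)) != 0)						\
		goto err;						\
} while (0)

static int
fill_buffer(char **bufp)
{
	if ((*bufp = malloc(64)) == NULL)
		return (ENOMEM);
	snprintf(*bufp, 64, "hello");
	return (0);
}

static int
use_buffer(void)
{
	WT_DECL_RET;
	char *buf = NULL;

	WT_ERR(fill_buffer(&buf));	/* Jump to err on failure. */
	printf("%s\n", buf);

err:	free(buf);			/* Cleanup runs on both paths. */
	return (ret);
}

int
main(void)
{
	return (use_buffer());
}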
Example #3
/*
 * __wt_txn_init --
 *	Initialize a session's transaction data.
 */
int
__wt_txn_init(WT_SESSION_IMPL *session)
{
	WT_TXN *txn;

	txn = &session->txn;
	txn->id = WT_TXN_NONE;

	WT_RET(__wt_calloc_def(session,
	    S2C(session)->session_size, &txn->snapshot));

	/*
	 * Take care to clean these out in case we are reusing the transaction
	 * for eviction.
	 */
	txn->mod = NULL;
	txn->modref = NULL;

	/* The default isolation level is read-committed. */
	txn->isolation = session->isolation = TXN_ISO_READ_COMMITTED;

	return (0);
}
Example #4
/*
 * __wt_cache_config --
 *	Read the cache configuration and apply it to the connection's cache.
 */
int
__wt_cache_config(WT_SESSION_IMPL *session, int reconfigure, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	int now_shared, was_shared;

	conn = S2C(session);

	WT_ASSERT(session, conn->cache != NULL);

	WT_RET(__wt_config_gets_none(session, cfg, "shared_cache.name", &cval));
	now_shared = cval.len != 0;
	was_shared = F_ISSET(conn, WT_CONN_CACHE_POOL);

	/* Cleanup if reconfiguring */
	if (reconfigure && was_shared && !now_shared)
		/*
		 * The connection cache was managed by the cache pool and the
		 * new configuration uses a standalone cache: remove ourselves
		 * from the pool.
		 */
		WT_RET(__wt_conn_cache_pool_destroy(session));
	else if (reconfigure && !was_shared && now_shared)
		/*
		* Cache size will now be managed by the cache pool - the
		* start size always needs to be zero to allow the pool to
		* manage how much memory is in-use.
		*/
		conn->cache_size = 0;

	/* Configure the connection's cache. */
	WT_RET(__cache_config_local(session, now_shared, cfg));
	if (now_shared) {
		/* Update the cache pool configuration. */
		WT_RET(__wt_cache_pool_config(session, cfg));
		WT_ASSERT(session, F_ISSET(conn, WT_CONN_CACHE_POOL));
		if (!was_shared)
			/* Add the connection cache to the cache pool. */
			WT_RET(__wt_conn_cache_pool_open(session));
	}

	return (0);
}
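
Only two of the four (was_shared, now_shared) combinations above require transition work; the other two are steady states. A toy enumeration of the reconfigure cases (illustrative only, not WiredTiger code):

#include <stdio.h>

/* Enumerate the four shared-cache reconfigure transitions. */
int
main(void)
{
	int was_shared, now_shared;

	for (was_shared = 0; was_shared <= 1; was_shared++)
		for (now_shared = 0; now_shared <= 1; now_shared++) {
			printf("was=%d now=%d: ", was_shared, now_shared);
			if (was_shared && !now_shared)
				printf("leave the cache pool\n");
			else if (!was_shared && now_shared)
				printf("zero cache_size, join the pool\n");
			else
				printf("no transition work\n");
		}
	return (0);
}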
Example #5
/*
 * __wt_spin_lock_register_caller --
 *	Register a spin-lock caller's location information in the blocking
 * matrix.
 */
int
__wt_spin_lock_register_caller(WT_SESSION_IMPL *session,
    const char *name, const char *file, int line, int *idp)
{
	WT_CONNECTION_IMPL *conn;
	WT_CONNECTION_STATS_SPINLOCK *p;

	conn = S2C(session);

	/*
	 * The caller's location ID is a static offset into a per-connection
	 * structure, and that has problems: first, if there are multiple
	 * connections, we'll need to hold some kind of lock to avoid racing
	 * when setting that value, and second, if/when there are multiple
	 * connections and/or a single connection is closed and re-opened, the
	 * variable may be initialized and underlying connection information
	 * may not.
	 *
	 * First, allocate a location ID if needed.
	 */
	WT_RET(__spin_lock_next_id(session, idp));

	/*
	 * Add the caller's information to the blocking matrix.  We could race
	 * here (if two threads of control register the same lock at the same
	 * time), but we don't care as both threads are setting the identical
	 * information.
	 */
	p = &conn->spinlock_block[*idp];
	p->name = name;
	if ((p->file = strrchr(file, '/')) == NULL)
		p->file = file;
	else
		++p->file;
	p->line = line;
	return (0);
}
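
The strrchr logic above trims the caller's __FILE__ path to its basename by storing a pointer into the original string, with no allocation. The same idiom as a standalone helper:

#include <stdio.h>
#include <string.h>

/* Return a pointer to the basename component of a path, no allocation. */
static const char *
path_basename(const char *file)
{
	const char *p;

	return ((p = strrchr(file, '/')) == NULL ? file : p + 1);
}

int
main(void)
{
	printf("%s\n", path_basename("src/support/mutex.c"));	/* mutex.c */
	printf("%s\n", path_basename("mutex.c"));		/* mutex.c */
	return (0);
}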
Example #6
/*
 * __wt_las_create --
 *	Initialize the database's lookaside store.
 */
int
__wt_las_create(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	const char *drop_cfg[] = {
	    WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL };

	conn = S2C(session);

	/*
	 * Done at startup: we cannot do it on demand because we require the
	 * schema lock to create and drop the file, and it may not always be
	 * available.
	 *
	 * Open an internal session, used for the shared lookaside cursor.
	 *
	 * Sessions associated with a lookaside cursor should never be tapped
	 * for eviction.
	 */
	WT_RET(__wt_open_internal_session(
	    conn, "lookaside table", 1, 1, &conn->las_session));
	session = conn->las_session;
	F_SET(session, WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION);

	/* Discard any previous incarnation of the file. */
	WT_RET(__wt_session_drop(session, WT_LAS_URI, drop_cfg));

	/* Re-create the file. */
	WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT));

	/* Open the shared cursor. */
	WT_WITHOUT_DHANDLE(session,
	    ret = __las_cursor_create(session, &conn->las_cursor));

	return (ret);
}
Example #7
/*
 * __wt_optrack_record_funcid --
 *	Allocate and record optrack function ID.
 */
void
__wt_optrack_record_funcid(
    WT_SESSION_IMPL *session, const char *func, uint16_t *func_idp)
{
	static uint16_t optrack_uid = 0; /* Unique for the process lifetime. */
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	wt_off_t fsize;
	bool locked;

	conn = S2C(session);
	locked = false;

	WT_ERR(__wt_scr_alloc(session, strlen(func) + 32, &tmp));

	__wt_spin_lock(session, &conn->optrack_map_spinlock);
	locked = true;
	if (*func_idp == 0) {
		*func_idp = ++optrack_uid;

		WT_ERR(__wt_buf_fmt(
		    session, tmp, "%" PRIu16 " %s\n", *func_idp, func));
		WT_ERR(__wt_filesize(session, conn->optrack_map_fh, &fsize));
		WT_ERR(__wt_write(session,
		    conn->optrack_map_fh, fsize, tmp->size, tmp->data));
	}

	if (0) {
err:		WT_PANIC_MSG(session, ret,
		    "operation tracking initialization failure");
	}

	if (locked)
		__wt_spin_unlock(session, &conn->optrack_map_spinlock);
	__wt_scr_free(session, &tmp);
}
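
The "if (0) { err: ... }" construction above is a recurring WiredTiger idiom (it appears again in the log server and archive examples below): the block is unreachable by straight-line code, so the error message runs only via goto, and both paths then share the unlock and scratch-buffer cleanup that follow it. A minimal model of the control flow:

#include <stdio.h>

static int
demo(int fail)
{
	int ret = 0, locked = 1;	/* Pretend we hold a lock. */

	if (fail) {
		ret = -1;
		goto err;		/* Only way into the block below. */
	}

	if (0) {
err:		fprintf(stderr, "failure path: %d\n", ret);
	}

	if (locked)			/* Shared cleanup on both paths. */
		printf("unlock\n");
	return (ret);
}

int
main(void)
{
	demo(0);
	return (demo(1) == -1 ? 0 : 1);
}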
Example #8
/*
 * __wt_checkpoint_server_destroy --
 *	Destroy the checkpoint server thread.
 */
int
__wt_checkpoint_server_destroy(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION *wt_session;

	conn = S2C(session);

	F_CLR(conn, WT_CONN_SERVER_CHECKPOINT);
	if (conn->ckpt_tid_set) {
		WT_TRET(__wt_cond_signal(session, conn->ckpt_cond));
		WT_TRET(__wt_thread_join(session, conn->ckpt_tid));
		conn->ckpt_tid_set = 0;
	}
	WT_TRET(__wt_cond_destroy(session, &conn->ckpt_cond));

	__wt_free(session, conn->ckpt_config);

	/* Close the server thread's session. */
	if (conn->ckpt_session != NULL) {
		wt_session = &conn->ckpt_session->iface;
		WT_TRET(wt_session->close(wt_session, NULL));
	}

	/*
	 * Ensure checkpoint settings are cleared - so that reconfigure doesn't
	 * get confused.
	 */
	conn->ckpt_session = NULL;
	conn->ckpt_tid_set = 0;
	conn->ckpt_cond = NULL;
	conn->ckpt_config = NULL;
	conn->ckpt_usecs = 0;

	return (ret);
}
Example #9
/*
 * __wt_session_fotxn_add --
 *	Add a new entry into the session's free-on-transaction generation list.
 */
int
__wt_session_fotxn_add(WT_SESSION_IMPL *session, void *p, size_t len)
{
	WT_FOTXN *fotxn;
	size_t i;

	/*
	 * Make sure the current thread has a transaction pinned so that
	 * we don't immediately free the memory we are stashing.
	 */
	WT_ASSERT(session,
	    WT_SESSION_TXN_STATE(session)->snap_min != WT_TXN_NONE);

	/* Grow the list as necessary. */
	WT_RET(__wt_realloc_def(session,
	    &session->fotxn_size, session->fotxn_cnt + 1, &session->fotxn));

	/* Find an empty slot. */
	for (i = 0, fotxn = session->fotxn;
	    i < session->fotxn_size / sizeof(session->fotxn[0]);  ++i, ++fotxn)
		if (fotxn->p == NULL) {
			fotxn->txnid = S2C(session)->txn_global.current + 1;
			WT_ASSERT(session,
			    !__wt_txn_visible_all(session, fotxn->txnid));
			fotxn->p = p;
			fotxn->len = len;
			break;
		}
	++session->fotxn_cnt;

	/* See if we can free any previous entries. */
	if (session->fotxn_cnt > 1)
		__wt_session_fotxn_discard(session, session, 0);

	return (0);
}
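
For context on __wt_realloc_def: it grows the allocation to hold at least the requested number of entries, tracks the allocated size in bytes and zeroes the new tail, which is why the loop above can scan for a NULL (empty) slot. A hedged sketch of that grow-and-zero pattern in plain C, using a hypothetical helper rather than the WiredTiger implementation:

#include <errno.h>
#include <stdlib.h>
#include <string.h>

/*
 * Grow *listp to hold at least need entries of size entry_size, zeroing
 * the new tail.  *sizep tracks the allocated size in bytes.
 */
static int
realloc_def(void **listp, size_t *sizep, size_t need, size_t entry_size)
{
	size_t new_size;
	void *p;

	if (need * entry_size <= *sizep)
		return (0);
	new_size = (need + 10) * entry_size;	/* Grow with some slack. */
	if ((p = realloc(*listp, new_size)) == NULL)
		return (ENOMEM);
	memset((char *)p + *sizep, 0, new_size - *sizep);
	*listp = p;
	*sizep = new_size;
	return (0);
}

int
main(void)
{
	int *a = NULL;
	size_t bytes = 0;

	if (realloc_def((void **)&a, &bytes, 1, sizeof(int)) != 0)
		return (1);
	a[0] = 42;			/* The tail beyond a[0] is zeroed. */
	free(a);
	return (0);
}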
Example #10
/*
 * __wt_block_close --
 *	Close a block handle.
 */
int
__wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;

	if (block == NULL)				/* Safety check */
		return (0);

	conn = S2C(session);

	WT_TRET(__wt_verbose(session, WT_VERB_BLOCK,
	    "close: %s", block->name == NULL ? "" : block->name));

	__wt_spin_lock(session, &conn->block_lock);

	/* Reference count is initialized to 1. */
	if (block->ref == 0 || --block->ref == 0)
		WT_TRET(__block_destroy(session, block));

	__wt_spin_unlock(session, &conn->block_lock);

	return (ret);
}
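
The "block->ref == 0 || --block->ref == 0" test destroys the handle on the last close, with the first clause guarding against a handle whose count was never incremented. The same reference-counted close, reduced to a standalone sketch:

#include <stdio.h>

struct handle {
	unsigned ref;		/* Initialized to 1 on open. */
};

/* Drop a reference; destroy on the last close. */
static void
handle_close(struct handle *h)
{
	if (h->ref == 0 || --h->ref == 0)
		printf("destroying handle\n");
}

int
main(void)
{
	struct handle h = { 2 };	/* Opened once, shared once. */

	handle_close(&h);		/* ref 2 -> 1, still live. */
	handle_close(&h);		/* ref 1 -> 0, destroyed. */
	return (0);
}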
Example #11
/*
 * __ckpt_server_config --
 *	Parse and setup the checkpoint server options.
 */
static int
__ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, int *startp)
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;

	conn = S2C(session);

	/*
	 * The checkpoint configuration requires a wait time -- if it's not set,
	 * we're not running at all.
	 */
	WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &cval));
	if (cval.val == 0) {
		*startp = 0;
		return (0);
	}
	conn->ckpt_usecs = (long)cval.val * 1000000;
	*startp = 1;

	WT_RET(__wt_config_gets(session, cfg, "checkpoint.name", &cval));

	if (!WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) {
		WT_RET(__wt_scr_alloc(session, cval.len + 20, &tmp));
		strcpy((char *)tmp->data, "name=");
		strncat((char *)tmp->data, cval.str, cval.len);
		ret = __wt_strndup(session,
		    tmp->data, strlen("name=") + cval.len, &conn->ckpt_config);
		__wt_scr_free(&tmp);
		WT_RET(ret);
	}

	return (0);
}
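
Note the buffer sizing above: cval.len + 20 leaves room for the "name=" prefix plus the NUL terminator that strncat appends, since cval.str is not NUL-terminated. An alternative sketch using snprintf with a length-limited %.*s, which makes the sizing explicit (illustrative, not the WiredTiger code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	const char *str = "my_checkpoint_extra";	/* cval.str */
	size_t len = 13;		/* cval.len; str is not NUL-terminated */
	size_t need;
	char *config;

	need = strlen("name=") + len + 1;		/* Explicit sizing. */
	if ((config = malloc(need)) == NULL)
		return (1);
	snprintf(config, need, "name=%.*s", (int)len, str);
	printf("%s\n", config);		/* name=my_checkpoint */
	free(config);
	return (0);
}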
Example #12
/*
 * __wt_log_slot_close --
 *	Close a slot and do not allow any other threads to join this slot.
 *	Remove this from the active slot array and move a new slot from
 *	the pool into its place.  Set up the size of this group.
 *	Must be called with the logging spinlock held.
 */
int
__wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	WT_LOGSLOT *newslot;
	int64_t old_state;

	conn = S2C(session);
	log = conn->log;
	/*
	 * Find an unused slot in the pool.
	 */
	WT_RET(__log_slot_find_free(session, &newslot));

	/*
	 * Swap out the slot we're going to use and put a free one in the
	 * slot array in its place so that threads can use it right away.
	 */
	WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
	newslot->slot_state = WT_LOG_SLOT_READY;
	newslot->slot_index = slot->slot_index;
	log->slot_array[newslot->slot_index] = newslot;
	old_state = WT_ATOMIC_STORE8(slot->slot_state, WT_LOG_SLOT_PENDING);
	slot->slot_group_size = (uint64_t)(old_state - WT_LOG_SLOT_READY);
	/*
	 * Note that this statistic may be much bigger than in reality,
	 * especially when compared with the total bytes written in
	 * __log_fill.  The reason is that this size reflects any
	 * rounding up that is needed and the total bytes in __log_fill
	 * is the amount of user bytes.
	 */
	WT_STAT_FAST_CONN_INCRV(session,
	    log_slot_consolidated, (uint64_t)slot->slot_group_size);
	return (0);
}
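
The group-size arithmetic works because a slot's state encodes how much has joined the slot as an offset above WT_LOG_SLOT_READY, so atomically storing WT_LOG_SLOT_PENDING both closes the slot and captures that amount in the returned old state. A simplified C11 model of the swap-and-subtract (stand-in state values and unit-sized joins; in the real encoding joiners add their record sizes):

#include <stdatomic.h>
#include <stdio.h>

#define	SLOT_READY	100		/* Stand-in state values. */
#define	SLOT_PENDING	-1

int
main(void)
{
	_Atomic long slot_state = SLOT_READY;
	long old_state, group_size;

	/* Three writers join the slot by bumping its state. */
	atomic_fetch_add(&slot_state, 1);
	atomic_fetch_add(&slot_state, 1);
	atomic_fetch_add(&slot_state, 1);

	/* Close the slot: swap in PENDING, recover the joined amount. */
	old_state = atomic_exchange(&slot_state, SLOT_PENDING);
	group_size = old_state - SLOT_READY;
	printf("group size: %ld\n", group_size);	/* 3 */
	return (0);
}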
Example #13
/*
 * __wt_free_int --
 *	ANSI free function.
 */
void
__wt_free_int(WT_SESSION_IMPL *session, void *p_arg)
{
	void *p;

	/*
	 * !!!
	 * This function MUST handle a NULL WT_SESSION_IMPL handle.
	 */
	if (session != NULL && S2C(session)->stats != NULL)
		WT_CSTAT_INCR(session, memfree);

	/*
	 * If there's a serialization bug we might race with another thread.
	 * We can't avoid the race (and we aren't willing to flush memory),
	 * but we minimize the window by clearing the free address atomically,
	 * hoping a racing thread will see, and won't free, a NULL pointer.
	 */
	p = *(void **)p_arg;
	*(void **)p_arg = NULL;

	if (p != NULL)			/* ANSI C free semantics */
		free(p);
}
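
The clear-then-free ordering is the heart of __wt_free_int: a racing reader sees NULL rather than a dangling pointer, and free(NULL) is harmless if the function is called twice. The core idiom as a standalone sketch:

#include <stdlib.h>

/* Free *pp and clear it, so a stale caller sees NULL, not freed memory. */
static void
free_and_clear(void *pp_arg)
{
	void **pp = pp_arg;
	void *p;

	p = *pp;
	*pp = NULL;		/* Clear first, then free. */
	free(p);		/* free(NULL) is a no-op. */
}

int
main(void)
{
	char *buf = malloc(16);

	free_and_clear(&buf);
	free_and_clear(&buf);	/* Safe: buf is now NULL. */
	return (buf == NULL ? 0 : 1);
}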
Example #14
/*
 * __metadata_load_hot_backup --
 *	Load the contents of any hot backup file.
 */
static int
__metadata_load_hot_backup(WT_SESSION_IMPL *session)
{
	WT_DECL_ITEM(key);
	WT_DECL_ITEM(value);
	WT_DECL_RET;
	WT_FSTREAM *fs;
	bool exist;

	/* Look for a hot backup file: if we find it, load it. */
	WT_RET(__wt_fs_exist(session, WT_METADATA_BACKUP, &exist));
	if (!exist)
		return (0);
	WT_RET(__wt_fopen(session,
	    WT_METADATA_BACKUP, 0, WT_STREAM_READ, &fs));

	/* Read line pairs and load them into the metadata file. */
	WT_ERR(__wt_scr_alloc(session, 512, &key));
	WT_ERR(__wt_scr_alloc(session, 512, &value));
	for (;;) {
		WT_ERR(__wt_getline(session, fs, key));
		if (key->size == 0)
			break;
		WT_ERR(__wt_getline(session, fs, value));
		if (value->size == 0)
			WT_ERR(__wt_illegal_value(session, WT_METADATA_BACKUP));
		WT_ERR(__wt_metadata_update(session, key->data, value->data));
	}

	F_SET(S2C(session), WT_CONN_WAS_BACKUP);

err:	WT_TRET(__wt_fclose(session, &fs));
	__wt_scr_free(session, &key);
	__wt_scr_free(session, &value);
	return (ret);
}
Example #15
/*
 * __log_slot_dump --
 *	Dump the entire slot state.
 */
static void
__log_slot_dump(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	WT_LOGSLOT *slot;
	int earliest, i;

	conn = S2C(session);
	log = conn->log;
	earliest = 0;
	for (i = 0; i < WT_SLOT_POOL; i++) {
		slot = &log->slot_pool[i];
		if (__wt_log_cmp(&slot->slot_release_lsn,
		    &log->slot_pool[earliest].slot_release_lsn) < 0)
			earliest = i;
		__wt_errx(session, "Slot %d:", i);
		__wt_errx(session, "    State: %" PRIx64 " Flags: %" PRIx32,
		    (uint64_t)slot->slot_state, slot->flags);
		__wt_errx(session, "    Start LSN: %" PRIu32 "/%" PRIu32,
		    slot->slot_start_lsn.l.file, slot->slot_start_lsn.l.offset);
		__wt_errx(session, "    End  LSN: %" PRIu32 "/%" PRIu32,
		    slot->slot_end_lsn.l.file, slot->slot_end_lsn.l.offset);
		__wt_errx(session, "    Release LSN: %" PRIu32 "/%" PRIu32,
		    slot->slot_release_lsn.l.file,
		    slot->slot_release_lsn.l.offset);
		__wt_errx(session, "    Offset: start: %" PRIuMAX
		    " last:%" PRIuMAX, (uintmax_t)slot->slot_start_offset,
		    (uintmax_t)slot->slot_last_offset);
		__wt_errx(session, "    Unbuffered: %" PRId64
		    " error: %" PRId32, slot->slot_unbuffered,
		    slot->slot_error);
	}
	__wt_errx(session, "Earliest slot: %d", earliest);

}
Example #16
/*
 * __wt_las_cursor --
 *	Return a lookaside cursor.
 */
void
__wt_las_cursor(
    WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags)
{
	WT_CONNECTION_IMPL *conn;

	*cursorp = NULL;

	/*
	 * We don't want to get tapped for eviction after we start using the
	 * lookaside cursor; save a copy of the current eviction state, we'll
	 * turn eviction off before we return.
	 *
	 * Don't cache lookaside table pages, we're here because of eviction
	 * problems and there's no reason to believe lookaside pages will be
	 * useful more than once.
	 */
	*session_flags =
	    F_MASK(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);

	conn = S2C(session);

	/*
	 * Some threads have their own lookaside table cursors, else lock the
	 * shared lookaside cursor.
	 */
	if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR))
		*cursorp = session->las_cursor;
	else {
		__wt_spin_lock(session, &conn->las_lock);
		*cursorp = conn->las_session->las_cursor;
	}

	/* Turn caching and eviction off. */
	F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
}
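
The F_MASK snapshot is what lets the matching release path restore the session exactly as it found it, even when the caller had already disabled caching or eviction. A bitmask save-and-restore sketch with stand-in flag values:

#include <stdio.h>

#define	NO_CACHE	0x1u
#define	NO_EVICTION	0x2u
#define	F_MASK(flags, mask)	((flags) & (mask))

int
main(void)
{
	unsigned flags = NO_EVICTION;	/* Caller already disabled eviction. */
	unsigned saved;

	/* Acquire: save the current state, then force both flags on. */
	saved = F_MASK(flags, NO_CACHE | NO_EVICTION);
	flags |= NO_CACHE | NO_EVICTION;

	/* Release: clear both, then restore whatever was set before. */
	flags &= ~(NO_CACHE | NO_EVICTION);
	flags |= saved;

	printf("restored flags: %#x\n", flags);	/* 0x2 */
	return (0);
}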
Example #17
/*
 * __wt_schema_rename --
 *	Rename an object.
 */
int
__wt_schema_rename(WT_SESSION_IMPL *session,
    const char *uri, const char *newuri, const char *cfg[])
{
	WT_DATA_SOURCE *dsrc;
	WT_DECL_RET;
	const char *p, *t;

	/*
	 * The target type must match the source type: compare the leading
	 * keyword, for example "file:" or "table:".
	 */
	for (p = uri, t = newuri; *p == *t && *p != ':'; ++p, ++t)
		;
	if (*p != ':' || *t != ':')
		WT_RET_MSG(session, EINVAL,
		    "rename target type must match URI: %s to %s",
		    uri, newuri);

	/*
	 * We track rename operations, if we fail in the middle, we want to
	 * back it all out.
	 */
	WT_RET(__wt_meta_track_on(session));

	if (WT_PREFIX_MATCH(uri, "file:"))
		ret = __rename_file(session, uri, newuri);
	else if (WT_PREFIX_MATCH(uri, "lsm:"))
		ret = __wt_lsm_tree_rename(session, uri, newuri, cfg);
	else if (WT_PREFIX_MATCH(uri, "table:"))
		ret = __rename_table(session, uri, newuri, cfg);
	else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
		ret = dsrc->rename == NULL ?
		    __wt_object_unsupported(session, uri) :
		    dsrc->rename(dsrc,
		    &session->iface, uri, newuri, (WT_CONFIG_ARG *)cfg);
	else
		ret = __wt_bad_object_type(session, uri);

	/* Bump the schema generation so that stale data is ignored. */
	++S2C(session)->schema_gen;

	WT_TRET(__wt_meta_track_off(session, 1, ret != 0));

	return (ret == WT_NOTFOUND ? ENOENT : ret);
}
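
Schema operations dispatch on the URI scheme with WT_PREFIX_MATCH, as above. A standalone sketch of the same dispatch with a stand-in macro built on strncmp (illustrative only):

#include <stdio.h>
#include <string.h>

#define	PREFIX_MATCH(str, pfx)						\
	(strncmp(str, pfx, strlen(pfx)) == 0)

static const char *
dispatch(const char *uri)
{
	if (PREFIX_MATCH(uri, "file:"))
		return ("file handler");
	if (PREFIX_MATCH(uri, "lsm:"))
		return ("lsm handler");
	if (PREFIX_MATCH(uri, "table:"))
		return ("table handler");
	return ("unknown object type");
}

int
main(void)
{
	printf("%s\n", dispatch("table:customers"));
	printf("%s\n", dispatch("weird:thing"));
	return (0);
}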
Example #18
/*
 * __wt_spin_lock_unregister_lock --
 *	Remove a lock from the connection's list.
 */
void
__wt_spin_lock_unregister_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
{
	WT_CONNECTION_IMPL *conn;
	u_int i;

	conn = S2C(session);

	for (i = 0; i < WT_SPINLOCK_MAX; i++)
		if (conn->spinlock_list[i] == t)
			conn->spinlock_list[i] = NULL;

	/*
	 * XXX
	 * The statistics thread reads through this array, there's a possible
	 * race: if that thread reads the pointer then goes to sleep, then we
	 * free the spinlock, then the statistics thread wakes up, it can read
	 * free'd memory.
	 *
	 * This is performance debugging code, so we're not fixing the race for
	 * now, minimize the window.
	 */
	WT_FULL_BARRIER();
}
Example #19
/*
 * __wt_cache_stats_update --
 *	Update the cache statistics from the cache's current state.
 */
void
__wt_cache_stats_update(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
	WT_CONNECTION_STATS *stats;

	conn = S2C(session);
	cache = conn->cache;
	stats = &conn->stats;

	WT_STAT_SET(stats, cache_bytes_max, conn->cache_size);
	WT_STAT_SET(stats, cache_bytes_inuse, __wt_cache_bytes_inuse(cache));

	WT_STAT_SET(stats, cache_overhead, cache->overhead_pct);
	WT_STAT_SET(stats, cache_pages_inuse, __wt_cache_pages_inuse(cache));
	WT_STAT_SET(stats, cache_bytes_dirty, __wt_cache_dirty_inuse(cache));
	WT_STAT_SET(stats,
	    cache_eviction_maximum_page_size, cache->evict_max_page_size);
	WT_STAT_SET(stats, cache_pages_dirty, cache->pages_dirty);

	/* Figure out internal, leaf and overflow stats. */
	WT_STAT_SET(stats, cache_bytes_internal, cache->bytes_internal);
	WT_STAT_SET(stats, cache_bytes_leaf, conn->cache_size -
	    (cache->bytes_internal + cache->bytes_overflow));
	WT_STAT_SET(stats, cache_bytes_overflow, cache->bytes_overflow);
}
Example #20
/*
 * __logmgr_sync_cfg --
 *	Interpret the transaction_sync config.
 */
static int
__logmgr_sync_cfg(WT_SESSION_IMPL *session, const char **cfg)
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);

	WT_RET(
	    __wt_config_gets(session, cfg, "transaction_sync.enabled", &cval));
	if (cval.val)
		FLD_SET(conn->txn_logsync, WT_LOG_FLUSH);
	else
		FLD_CLR(conn->txn_logsync, WT_LOG_FLUSH);

	WT_RET(
	    __wt_config_gets(session, cfg, "transaction_sync.method", &cval));
	FLD_CLR(conn->txn_logsync, WT_LOG_DSYNC | WT_LOG_FSYNC);
	if (WT_STRING_MATCH("dsync", cval.str, cval.len))
		FLD_SET(conn->txn_logsync, WT_LOG_DSYNC);
	else if (WT_STRING_MATCH("fsync", cval.str, cval.len))
		FLD_SET(conn->txn_logsync, WT_LOG_FSYNC);
	return (0);
}
Example #21
/*
 * __wt_logmgr_destroy --
 *	Destroy the log archiving server thread and logging subsystem.
 */
int
__wt_logmgr_destroy(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION *wt_session;

	conn = S2C(session);

	if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) {
		/*
		 * We always set up the log_path so printlog can work without
		 * recovery. Therefore, always free it, even if logging isn't
		 * on.
		 */
		__wt_free(session, conn->log_path);
		return (0);
	}
	if (conn->log_tid_set) {
		WT_TRET(__wt_cond_signal(session, conn->log_cond));
		WT_TRET(__wt_thread_join(session, conn->log_tid));
		conn->log_tid_set = 0;
	}
	if (conn->log_file_tid_set) {
		WT_TRET(__wt_cond_signal(session, conn->log_file_cond));
		WT_TRET(__wt_thread_join(session, conn->log_file_tid));
		conn->log_file_tid_set = 0;
	}
	if (conn->log_file_session != NULL) {
		wt_session = &conn->log_file_session->iface;
		WT_TRET(wt_session->close(wt_session, NULL));
		conn->log_file_session = NULL;
	}
	if (conn->log_wrlsn_tid_set) {
		WT_TRET(__wt_cond_signal(session, conn->log_wrlsn_cond));
		WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid));
		conn->log_wrlsn_tid_set = 0;
	}
	if (conn->log_wrlsn_session != NULL) {
		wt_session = &conn->log_wrlsn_session->iface;
		WT_TRET(wt_session->close(wt_session, NULL));
		conn->log_wrlsn_session = NULL;
	}

	WT_TRET(__wt_log_slot_destroy(session));
	WT_TRET(__wt_log_close(session));

	/* Close the server thread's session. */
	if (conn->log_session != NULL) {
		wt_session = &conn->log_session->iface;
		WT_TRET(wt_session->close(wt_session, NULL));
		conn->log_session = NULL;
	}

	/* Destroy the condition variables now that all threads are stopped. */
	WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
	WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
	WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));

	WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
	WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond));
	WT_TRET(__wt_rwlock_destroy(session, &conn->log->log_archive_lock));
	__wt_spin_destroy(session, &conn->log->log_lock);
	__wt_spin_destroy(session, &conn->log->log_slot_lock);
	__wt_spin_destroy(session, &conn->log->log_sync_lock);
	__wt_spin_destroy(session, &conn->log->log_writelsn_lock);
	__wt_free(session, conn->log_path);
	__wt_free(session, conn->log);
	return (ret);
}
Example #22
/*
 * __wt_logmgr_open --
 *	Start the log service threads.
 */
int
__wt_logmgr_open(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);

	/* If no log thread services are configured, we're done. */ 
	if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
		return (0);

	/*
	 * Start the log close thread.  It is not configurable.
	 * If logging is enabled, this thread runs.
	 */
	WT_RET(__wt_open_internal_session(
	    conn, "log-close-server", 0, 0, &conn->log_file_session));
	WT_RET(__wt_cond_alloc(conn->log_file_session,
	    "log close server", 0, &conn->log_file_cond));

	/*
	 * Start the log file close thread.
	 */
	WT_RET(__wt_thread_create(conn->log_file_session,
	    &conn->log_file_tid, __log_file_server, conn->log_file_session));
	conn->log_file_tid_set = 1;

	/*
	 * Start the log write LSN thread.  It is not configurable.
	 * If logging is enabled, this thread runs.
	 */
	WT_RET(__wt_open_internal_session(
	    conn, "log-wrlsn-server", 0, 0, &conn->log_wrlsn_session));
	WT_RET(__wt_cond_alloc(conn->log_wrlsn_session,
	    "log write lsn server", 0, &conn->log_wrlsn_cond));
	WT_RET(__wt_thread_create(conn->log_wrlsn_session,
	    &conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session));
	conn->log_wrlsn_tid_set = 1;

	/* If no log thread services are configured, we're done. */ 
	if (!FLD_ISSET(conn->log_flags,
	    (WT_CONN_LOG_ARCHIVE | WT_CONN_LOG_PREALLOC)))
		return (0);

	/*
	 * If a log server thread exists, the user may have reconfigured
	 * archiving or pre-allocation.  Signal the thread.  Otherwise the
	 * user wants archiving and/or allocation and we need to start up
	 * the thread.
	 */
	if (conn->log_session != NULL) {
		WT_ASSERT(session, conn->log_cond != NULL);
		WT_ASSERT(session, conn->log_tid_set != 0);
		WT_RET(__wt_cond_signal(session, conn->log_cond));
	} else {
		/* The log server gets its own session. */
		WT_RET(__wt_open_internal_session(
		    conn, "log-server", 0, 0, &conn->log_session));
		WT_RET(__wt_cond_alloc(conn->log_session,
		    "log server", 0, &conn->log_cond));

		/*
		 * Start the thread.
		 */
		WT_RET(__wt_thread_create(conn->log_session,
		    &conn->log_tid, __log_server, conn->log_session));
		conn->log_tid_set = 1;
	}

	return (0);
}
Example #23
/*
 * __log_server --
 *	The log server thread.
 */
static WT_THREAD_RET
__log_server(void *arg)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_SESSION_IMPL *session;
	int freq_per_sec, signalled;

	session = arg;
	conn = S2C(session);
	log = conn->log;
	signalled = 0;

	/*
	 * Set this to the number of times per second we want to force out the
	 * log slot buffer.
	 */
#define	WT_FORCE_PER_SECOND	20
	freq_per_sec = WT_FORCE_PER_SECOND;

	/*
	 * The log server thread does a variety of work.  It forces out any
	 * buffered log writes.  It pre-allocates log files and it performs
	 * log archiving.  The reason the wrlsn thread does not force out
	 * the buffered writes is because we want to process and move the
	 * write_lsn forward as quickly as possible.  The same reason applies
	 * to why the log file server thread does not force out the writes.
	 * That thread does fsync calls which can take a long time and we
	 * don't want log records sitting in the buffer over the time it
	 * takes to sync out an earlier file.
	 */
	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
		/*
		 * Slots depend on future activity.  Force out buffered
		 * writes in case we are idle.  This cannot be part of the
		 * wrlsn thread because of interaction advancing the write_lsn
		 * and a buffer may need to wait for the write_lsn to advance
		 * in the case of a synchronous buffer.  We end up with a hang.
		 */
		WT_ERR_BUSY_OK(__wt_log_force_write(session, 0));

		/*
		 * We don't want to archive or pre-allocate files as often as
		 * we want to force out log buffers.  Only do it once per second
		 * or if the condition was signalled.
		 */
		if (--freq_per_sec <= 0 || signalled != 0) {
			freq_per_sec = WT_FORCE_PER_SECOND;

			/*
			 * Perform log pre-allocation.
			 */
			if (conn->log_prealloc > 0)
				WT_ERR(__log_prealloc_once(session));

			/*
			 * Perform the archive.
			 */
			if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) {
				if (__wt_try_writelock(
				    session, log->log_archive_lock) == 0) {
					ret = __log_archive_once(session, 0);
					WT_TRET(__wt_writeunlock(
					    session, log->log_archive_lock));
					WT_ERR(ret);
				} else
					WT_ERR(
					    __wt_verbose(session, WT_VERB_LOG,
					    "log_archive: Blocked due to open "
					    "log cursor holding archive lock"));
			}
		}

		/* Wait until the next event. */
		WT_ERR(__wt_cond_wait_signal(session, conn->log_cond,
		    WT_MILLION / WT_FORCE_PER_SECOND, &signalled));
	}

	if (0) {
err:		__wt_err(session, ret, "log server error");
	}
	return (WT_THREAD_RET_VALUE);
}
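
The freq_per_sec countdown runs two cadences off a single condition-variable wait: cheap work on every wakeup (about WT_FORCE_PER_SECOND times per second) and expensive work once a second, or sooner when signalled. The scheduling skeleton, reduced to a standalone loop with simulated ticks:

#include <stdio.h>

#define	FORCE_PER_SECOND	20

int
main(void)
{
	int freq, signalled, tick;

	freq = FORCE_PER_SECOND;
	signalled = 0;
	for (tick = 0; tick < 45; tick++) {	/* ~2.25 simulated seconds. */
		/* Cheap work runs every tick (force out log buffers). */
		if (--freq <= 0 || signalled) {
			freq = FORCE_PER_SECOND;
			/* Expensive work: pre-allocation, archiving. */
			printf("tick %d: expensive work\n", tick);	/* 19, 39 */
		}
	}
	return (0);
}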
Example #24
/*
 * __wt_log_wrlsn --
 *	Process written log slots and attempt to coalesce them if the LSNs
 *	are contiguous.  The purpose of this function is to advance the
 *	write_lsn in LSN order after the buffer is written to the log file.
 */
int
__wt_log_wrlsn(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL];
	WT_LOGSLOT *coalescing, *slot;
	WT_LSN save_lsn;
	size_t written_i;
	uint32_t i, save_i;

	conn = S2C(session);
	log = conn->log;
	__wt_spin_lock(session, &log->log_writelsn_lock);
restart:
	coalescing = NULL;
	WT_INIT_LSN(&save_lsn);
	written_i = 0;
	i = 0;

	/*
	 * Walk the array once saving any slots that are in the
	 * WT_LOG_SLOT_WRITTEN state.
	 */
	while (i < WT_SLOT_POOL) {
		save_i = i;
		slot = &log->slot_pool[i++];
		/*
		 * XXX - During debugging I saw slot 0 become orphaned.
		 * I believe it is fixed, but check for now.
		 * This assertion should catch that.
		 */
		if (slot->slot_state == 0)
			WT_ASSERT(session,
			    slot->slot_release_lsn.file >= log->write_lsn.file);
		if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
			continue;
		written[written_i].slot_index = save_i;
		written[written_i++].lsn = slot->slot_release_lsn;
	}
	/*
	 * If we found any written slots process them.  We sort them
	 * based on the release LSN, and then look for them in order.
	 */
	if (written_i > 0) {
		WT_INSERTION_SORT(written, written_i,
		    WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);
		/*
		 * We know the written array is sorted by LSN.  Go
		 * through them either advancing write_lsn or coalesce
		 * contiguous ranges of written slots.
		 */
		for (i = 0; i < written_i; i++) {
			slot = &log->slot_pool[written[i].slot_index];
			/*
			 * The log server thread pushes out slots periodically.
			 * Sometimes they are empty slots.  If we find an
			 * empty slot, where empty means the start and end LSN
			 * are the same, free it and continue.
			 */
			if (__wt_log_cmp(&slot->slot_start_lsn,
			    &slot->slot_release_lsn) == 0 &&
			    __wt_log_cmp(&slot->slot_start_lsn,
			    &slot->slot_end_lsn) == 0) {
				__wt_log_slot_free(session, slot);
				continue;
			}
			if (coalescing != NULL) {
				/*
				 * If the write_lsn changed, we may be able to
				 * process slots.  Try again.
				 */
				if (__wt_log_cmp(
				    &log->write_lsn, &save_lsn) != 0)
					goto restart;
				if (__wt_log_cmp(&coalescing->slot_end_lsn,
				    &written[i].lsn) != 0) {
					coalescing = slot;
					continue;
				}
				/*
				 * If we get here we have a slot to coalesce
				 * and free.
				 */
				coalescing->slot_last_offset =
				    slot->slot_last_offset;
				coalescing->slot_end_lsn = slot->slot_end_lsn;
				WT_STAT_FAST_CONN_INCR(
				    session, log_slot_coalesced);
				/*
				 * Copy the flag for later closing.
				 */
				if (F_ISSET(slot, WT_SLOT_CLOSEFH))
					F_SET(coalescing, WT_SLOT_CLOSEFH);
			} else {
				/*
				 * If this written slot is not the next LSN,
				 * try to start coalescing with later slots.
				 * A synchronous write may update write_lsn
				 * so save the last one we saw to check when
				 * coalescing slots.
				 */
				save_lsn = log->write_lsn;
				if (__wt_log_cmp(
				    &log->write_lsn, &written[i].lsn) != 0) {
					coalescing = slot;
					continue;
				}
				/*
				 * If we get here we have a slot to process.
				 * Advance the LSN and process the slot.
				 */
				WT_ASSERT(session, __wt_log_cmp(&written[i].lsn,
				    &slot->slot_release_lsn) == 0);
				if (slot->slot_start_lsn.offset !=
				    slot->slot_last_offset)
					slot->slot_start_lsn.offset =
					    slot->slot_last_offset;
				log->write_start_lsn = slot->slot_start_lsn;
				log->write_lsn = slot->slot_end_lsn;
				WT_ERR(__wt_cond_signal(
				    session, log->log_write_cond));
				WT_STAT_FAST_CONN_INCR(session, log_write_lsn);
				/*
				 * Signal the close thread if needed.
				 */
				if (F_ISSET(slot, WT_SLOT_CLOSEFH))
					WT_ERR(__wt_cond_signal(
					    session, conn->log_file_cond));
			}
			__wt_log_slot_free(session, slot);
		}
	}
err:	__wt_spin_unlock(session, &log->log_writelsn_lock);
	return (ret);
}
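
Stripped of the slot machinery, the loop above is interval coalescing over release LSNs: write_lsn advances across any slot that starts exactly at it, while contiguous slots beyond a gap are merged so they can be processed in one step once the gap fills. A reduced sketch over integer offsets standing in for LSNs:

#include <stdio.h>

struct range {
	int start, end;		/* Stand-ins for release/end LSNs. */
};

int
main(void)
{
	/* Sorted by start; a gap remains at [30, 40). */
	struct range written[] = { {10, 20}, {20, 30}, {40, 50}, {50, 60} };
	struct range pending = { 0, 0 };
	int i, write_lsn = 10;

	for (i = 0; i < 4; i++)
		if (written[i].start == write_lsn)
			write_lsn = written[i].end;	/* Contiguous: advance. */
		else if (pending.end == written[i].start)
			pending.end = written[i].end;	/* Coalesce past the gap. */
		else
			pending = written[i];		/* New blocked range. */

	/* write_lsn stops at the gap; [40, 60) waits, already merged. */
	printf("write_lsn %d, pending [%d, %d)\n",
	    write_lsn, pending.start, pending.end);
	return (0);
}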
Example #25
/*
 * __log_file_server --
 *	The log file server thread.  This worker thread manages
 *	log file operations such as closing and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *close_fh;
	WT_LOG *log;
	WT_LSN close_end_lsn, min_lsn;
	WT_SESSION_IMPL *session;
	uint32_t filenum;
	int locked;

	session = arg;
	conn = S2C(session);
	log = conn->log;
	locked = 0;
	while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
		/*
		 * If there is a log file to close, make sure any outstanding
		 * write operations have completed, then fsync and close it.
		 */
		if ((close_fh = log->log_close_fh) != NULL) {
			WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
			    &filenum));
			/*
			 * We update the close file handle before updating the
			 * close LSN when changing files.  It is possible we
			 * could see mismatched settings.  If we do, yield
			 * until it is set.  This should rarely happen.
			 */
			while (log->log_close_lsn.file < filenum)
				__wt_yield();

			if (__wt_log_cmp(
			    &log->write_lsn, &log->log_close_lsn) >= 0) {
				/*
				 * We've copied the file handle, clear out the
				 * one in the log structure to allow it to be
				 * set again.  Copy the LSN before clearing
				 * the file handle.
				 * Use a barrier to make sure the compiler does
				 * not reorder the following two statements.
				 */
				close_end_lsn = log->log_close_lsn;
				WT_FULL_BARRIER();
				log->log_close_fh = NULL;
				/*
				 * Set the close_end_lsn to the LSN immediately
				 * after ours.  That is, the beginning of the
				 * next log file.   We need to know the LSN
				 * file number of our own close in case earlier
				 * calls are still in progress and the next one
				 * to move the sync_lsn into the next file for
				 * later syncs.
				 */
				close_end_lsn.file++;
				close_end_lsn.offset = 0;
				WT_ERR(__wt_fsync(session, close_fh));
				__wt_spin_lock(session, &log->log_sync_lock);
				locked = 1;
				WT_ERR(__wt_close(session, &close_fh));
				WT_ASSERT(session, __wt_log_cmp(
				    &close_end_lsn, &log->sync_lsn) >= 0);
				log->sync_lsn = close_end_lsn;
				WT_ERR(__wt_cond_signal(
				    session, log->log_sync_cond));
				locked = 0;
				__wt_spin_unlock(session, &log->log_sync_lock);
			}
		}
		/*
		 * If a later thread asked for a background sync, do it now.
		 */
		if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
			/*
			 * Save the latest write LSN which is the minimum
			 * we will have written to disk.
			 */
			min_lsn = log->write_lsn;
			/*
			 * We have to wait until the LSN we asked for is
			 * written.  If it isn't signal the wrlsn thread
			 * to get it written.
			 */
			if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
				WT_ERR(__wt_fsync(session, log->log_fh));
				__wt_spin_lock(session, &log->log_sync_lock);
				locked = 1;
				/*
				 * The sync LSN could have advanced while we
				 * were writing to disk.
				 */
				if (__wt_log_cmp(
				    &log->sync_lsn, &min_lsn) <= 0) {
					log->sync_lsn = min_lsn;
					WT_ERR(__wt_cond_signal(
					    session, log->log_sync_cond));
				}
				locked = 0;
				__wt_spin_unlock(session, &log->log_sync_lock);
			} else {
				WT_ERR(__wt_cond_signal(
				    session, conn->log_wrlsn_cond));
				/*
				 * We do not want to wait potentially a second
				 * to process this.  Yield to give the wrlsn
				 * thread a chance to run and try again in
				 * this case.
				 */
				__wt_yield();
				continue;
			}
		}
		/* Wait until the next event. */
		WT_ERR(__wt_cond_wait(
		    session, conn->log_file_cond, WT_MILLION));
	}

	if (0) {
err:		__wt_err(session, ret, "log close server error");
	}
	if (locked)
		__wt_spin_unlock(session, &log->log_sync_lock);
	return (WT_THREAD_RET_VALUE);
}
Example #26
/*
 * __log_archive_once --
 *	Perform one iteration of log archiving.  Must be called with the
 *	log archive lock held.
 */
static int
__log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	uint32_t lognum, min_lognum;
	u_int i, locked, logcount;
	char **logfiles;

	conn = S2C(session);
	log = conn->log;
	logcount = 0;
	logfiles = NULL;

	/*
	 * If we're coming from a backup cursor we want the smaller of
	 * the last full log file copied in backup or the checkpoint LSN.
	 * Otherwise we want the minimum of the last log file written to
	 * disk and the checkpoint LSN.
	 */
	if (backup_file != 0)
		min_lognum = WT_MIN(log->ckpt_lsn.file, backup_file);
	else
		min_lognum = WT_MIN(log->ckpt_lsn.file, log->sync_lsn.file);
	WT_RET(__wt_verbose(session, WT_VERB_LOG,
	    "log_archive: archive to log number %" PRIu32, min_lognum));

	/*
	 * Main archive code.  Get the list of all log files and
	 * remove any earlier than the minimum log number.
	 */
	WT_RET(__wt_dirlist(session, conn->log_path,
	    WT_LOG_FILENAME, WT_DIRLIST_INCLUDE, &logfiles, &logcount));

	/*
	 * We can only archive files if a hot backup is not in progress or
	 * if we are the backup.
	 */
	WT_ERR(__wt_readlock(session, conn->hot_backup_lock));
	locked = 1;
	if (conn->hot_backup == 0 || backup_file != 0) {
		for (i = 0; i < logcount; i++) {
			WT_ERR(__wt_log_extract_lognum(
			    session, logfiles[i], &lognum));
			if (lognum < min_lognum)
				WT_ERR(__wt_log_remove(
				    session, WT_LOG_FILENAME, lognum));
		}
	}
	WT_ERR(__wt_readunlock(session, conn->hot_backup_lock));
	locked = 0;
	__wt_log_files_free(session, logfiles, logcount);
	logfiles = NULL;
	logcount = 0;

	/*
	 * Indicate what is our new earliest LSN.  It is the start
	 * of the log file containing the last checkpoint.
	 */
	log->first_lsn.file = min_lognum;
	log->first_lsn.offset = 0;

	if (0)
err:		__wt_err(session, ret, "log archive server error");
	if (locked)
		WT_TRET(__wt_readunlock(session, conn->hot_backup_lock));
	if (logfiles != NULL)
		__wt_log_files_free(session, logfiles, logcount);
	return (ret);
}
Example #27
/*
 * __wt_open --
 *	Open a file handle.
 */
int
__wt_open(WT_SESSION_IMPL *session,
    const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
{
	DWORD dwCreationDisposition;
	HANDLE filehandle, filehandle_secondary;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *fh, *tfh;
	uint64_t bucket, hash;
	int direct_io, f, matched, share_mode;
	char *path;

	conn = S2C(session);
	fh = NULL;
	path = NULL;
	filehandle = INVALID_HANDLE_VALUE;
	filehandle_secondary = INVALID_HANDLE_VALUE;
	direct_io = 0;
	hash = __wt_hash_city64(name, strlen(name));
	bucket = hash % WT_HASH_ARRAY_SIZE;

	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name));

	/* Increment the reference count if we already have the file open. */
	matched = 0;
	__wt_spin_lock(session, &conn->fh_lock);
	SLIST_FOREACH(tfh, &conn->fhhash[bucket], l)
		if (strcmp(name, tfh->name) == 0) {
			++tfh->ref;
			*fhp = tfh;
			matched = 1;
			break;
		}
	__wt_spin_unlock(session, &conn->fh_lock);
	if (matched)
		return (0);

	/* For directories, create empty file handles with invalid handles. */
	if (dio_type == WT_FILE_TYPE_DIRECTORY)
		goto setupfh;

	WT_RET(__wt_filename(session, name, &path));

	share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
	/*
	 * Security:
	 * The application may spawn a new process, and we don't want another
	 * process to have access to our file handles.
	 *
	 * TODO: Set tighter file permissions but set bInheritHandle to false
	 * to prevent inheritance
	 */

	f = FILE_ATTRIBUTE_NORMAL;

	dwCreationDisposition = 0;
	if (ok_create) {
		dwCreationDisposition = CREATE_NEW;
		if (exclusive)
			dwCreationDisposition = CREATE_ALWAYS;
	} else
		dwCreationDisposition = OPEN_EXISTING;

	if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
		f |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
		direct_io = 1;
	}

	if (dio_type == WT_FILE_TYPE_LOG &&
	    FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
		f |= FILE_FLAG_WRITE_THROUGH;
	}

	/* Disable read-ahead on trees: it slows down random read workloads. */
	if (dio_type == WT_FILE_TYPE_DATA ||
	    dio_type == WT_FILE_TYPE_CHECKPOINT)
		f |= FILE_FLAG_RANDOM_ACCESS;

	filehandle = CreateFileA(path,
				(GENERIC_READ | GENERIC_WRITE),
				share_mode,
				NULL,
				dwCreationDisposition,
				f,
				NULL);
	if (filehandle == INVALID_HANDLE_VALUE) {
		if (GetLastError() == ERROR_FILE_EXISTS && ok_create)
			filehandle = CreateFileA(path,
						(GENERIC_READ | GENERIC_WRITE),
						share_mode,
						NULL,
						OPEN_EXISTING,
						f,
						NULL);

		if (filehandle == INVALID_HANDLE_VALUE)
			WT_ERR_MSG(session, __wt_errno(),
			    direct_io ?
			    "%s: open failed with direct I/O configured, some "
			    "filesystem types do not support direct I/O" :
			    "%s", path);
	}

	/*
	 * Open a second handle to file to support allocation/truncation
	 * concurrently with reads on the file. Writes would also move the file
	 * pointer.
	 */
	filehandle_secondary = CreateFileA(path,
	    (GENERIC_READ | GENERIC_WRITE),
	    share_mode,
	    NULL,
	    OPEN_EXISTING,
	    f,
	    NULL);
	if (filehandle_secondary == INVALID_HANDLE_VALUE)
		WT_ERR_MSG(session, __wt_errno(),
		    "open failed for secondary handle: %s", path);

setupfh:
	WT_ERR(__wt_calloc_one(session, &fh));
	WT_ERR(__wt_strdup(session, name, &fh->name));
	fh->name_hash = hash;
	fh->filehandle = filehandle;
	fh->filehandle_secondary = filehandle_secondary;
	fh->ref = 1;
	fh->direct_io = direct_io;

	/* Set the file's size. */
	if (dio_type != WT_FILE_TYPE_DIRECTORY)
		WT_ERR(__wt_filesize(session, fh, &fh->size));

	/* Configure file extension. */
	if (dio_type == WT_FILE_TYPE_DATA ||
	    dio_type == WT_FILE_TYPE_CHECKPOINT)
		fh->extend_len = conn->data_extend_len;

	/* Configure fallocate/posix_fallocate calls. */
	__wt_fallocate_config(session, fh);

	/*
	 * Repeat the check for a match, but then link onto the database's list
	 * of files.
	 */
	matched = 0;
	__wt_spin_lock(session, &conn->fh_lock);
	SLIST_FOREACH(tfh, &conn->fhhash[bucket], l)
		if (strcmp(name, tfh->name) == 0) {
			++tfh->ref;
			*fhp = tfh;
			matched = 1;
			break;
		}
	if (!matched) {
		WT_CONN_FILE_INSERT(conn, fh, bucket);
		WT_STAT_FAST_CONN_INCR(session, file_open);

		*fhp = fh;
	}
	__wt_spin_unlock(session, &conn->fh_lock);
	if (matched) {
err:		if (fh != NULL) {
			__wt_free(session, fh->name);
			__wt_free(session, fh);
		}
		if (filehandle != INVALID_HANDLE_VALUE)
			(void)CloseHandle(filehandle);
		if (filehandle_secondary != INVALID_HANDLE_VALUE)
			(void)CloseHandle(filehandle_secondary);
	}

	__wt_free(session, path);
	return (ret);
}
Example #28
/*
 * __wt_lsm_free_chunks --
 *	Try to drop chunks from the tree that are no longer required.
 */
int
__wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_WORKER_COOKIE cookie;
	u_int i, skipped;
	int drop_ret;
	bool flush_metadata;

	flush_metadata = false;

	if (lsm_tree->nold_chunks == 0)
		return (0);

	/*
	 * Make sure only a single thread is freeing the old chunk array
	 * at any time.
	 */
	if (!__wt_atomic_cas32(&lsm_tree->freeing_old_chunks, 0, 1))
		return (0);
	/*
	 * Take a copy of the current state of the LSM tree and look for chunks
	 * to drop.  We do it this way to avoid holding the LSM tree lock while
	 * doing I/O or waiting on the schema lock.
	 *
	 * This is safe because only one thread will be in this function at a
	 * time.  Merges may complete concurrently, and the old_chunks array
	 * may be extended, but we shuffle down the pointers each time we free
	 * one to keep the non-NULL slots at the beginning of the array.
	 */
	WT_CLEAR(cookie);
	WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, true));
	for (i = skipped = 0; i < cookie.nchunks; i++) {
		chunk = cookie.chunk_array[i];
		WT_ASSERT(session, chunk != NULL);
		/* Skip the chunk if another worker is using it. */
		if (chunk->refcnt > 1) {
			++skipped;
			continue;
		}

		/*
		 * Don't remove files if a hot backup is in progress.
		 *
		 * The schema lock protects the set of live files, this check
		 * prevents us from removing a file that hot backup already
		 * knows about.
		 */
		if (S2C(session)->hot_backup)
			break;

		/*
		 * Drop any bloom filters and chunks we can. Don't try to drop
		 * a chunk if the bloom filter drop fails.
		 *  An EBUSY return indicates that a cursor is still open in
		 *       the tree - move to the next chunk in that case.
		 * An ENOENT return indicates that the LSM tree metadata was
		 *       out of sync with the on disk state. Update the
		 *       metadata to match in that case.
		 */
		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
			drop_ret = __lsm_drop_file(session, chunk->bloom_uri);
			if (drop_ret == EBUSY) {
				++skipped;
				continue;
			} else if (drop_ret != ENOENT)
				WT_ERR(drop_ret);

			flush_metadata = true;
			F_CLR(chunk, WT_LSM_CHUNK_BLOOM);
		}
		if (chunk->uri != NULL) {
			drop_ret = __lsm_drop_file(session, chunk->uri);
			if (drop_ret == EBUSY) {
				++skipped;
				continue;
			} else if (drop_ret != ENOENT)
				WT_ERR(drop_ret);
			flush_metadata = true;
		}

		/* Lock the tree to clear out the old chunk information. */
		WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));

		/*
		 * The chunk we are looking at should be the first one in the
		 * tree that we haven't already skipped over.
		 */
		WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk);
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, lsm_tree->old_chunks[skipped]);

		/* Shuffle down to keep all occupied slots at the beginning. */
		if (--lsm_tree->nold_chunks > skipped) {
			memmove(lsm_tree->old_chunks + skipped,
			    lsm_tree->old_chunks + skipped + 1,
			    (lsm_tree->nold_chunks - skipped) *
			    sizeof(WT_LSM_CHUNK *));
			lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL;
		}

		WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));

		/*
		 * Clear the chunk in the cookie so we don't attempt to
		 * decrement the reference count.
		 */
		cookie.chunk_array[i] = NULL;
	}

err:	/* Flush the metadata unless the system is in panic */
	if (flush_metadata && ret != WT_PANIC) {
		WT_TRET(__wt_lsm_tree_writelock(session, lsm_tree));
		WT_TRET(__wt_lsm_meta_write(session, lsm_tree));
		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	}
	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);
	lsm_tree->freeing_old_chunks = 0;

	/* Returning non-zero means there is no work to do. */
	if (!flush_metadata)
		WT_TRET(WT_NOTFOUND);

	return (ret);
}
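
The memmove above is stable removal from a pointer array: shift the tail down one slot and NULL the vacated last entry so the occupied slots stay packed at the front. Reduced to a standalone sketch:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *chunks[] = { "c0", "c1", "c2", "c3" };
	size_t i, n = 4, remove = 1;

	/* Remove slot 1, keeping the survivors packed at the front. */
	if (--n > remove)
		memmove(chunks + remove, chunks + remove + 1,
		    (n - remove) * sizeof(chunks[0]));
	chunks[n] = NULL;

	for (i = 0; i < 4; i++)		/* c0, c2, c3, (null) */
		printf("%zu: %s\n",
		    i, chunks[i] == NULL ? "(null)" : chunks[i]);
	return (0);
}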
Example #29
/*
 * __wt_txn_recover --
 *	Run recovery.
 */
int
__wt_txn_recover(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *metac;
	WT_DECL_RET;
	WT_RECOVERY r;
	struct WT_RECOVERY_FILE *metafile;
	char *config;
	bool eviction_started, needs_rec, was_backup;

	conn = S2C(session);
	WT_CLEAR(r);
	WT_INIT_LSN(&r.ckpt_lsn);
	eviction_started = false;
	was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP);

	/* We need a real session for recovery. */
	WT_RET(__wt_open_internal_session(conn, "txn-recover",
	    false, WT_SESSION_NO_LOGGING, &session));
	r.session = session;

	F_SET(conn, WT_CONN_RECOVERING);
	WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
	WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
	WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac));
	metafile = &r.files[WT_METAFILE_ID];
	metafile->c = metac;

	/*
	 * If no log was found (including if logging is disabled), or if the
	 * last checkpoint was done with logging disabled, recovery should not
	 * run.  Scan the metadata to figure out the largest file ID.
	 */
	if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) ||
	    WT_IS_MAX_LSN(&metafile->ckpt_lsn)) {
		WT_ERR(__recovery_file_scan(&r));
		conn->next_file_id = r.max_fileid;
		goto done;
	}

	/*
	 * First, do a pass through the log to recover the metadata, and
	 * establish the last checkpoint LSN.  Skip this when opening a hot
	 * backup: we already have the correct metadata in that case.
	 */
	if (!was_backup) {
		r.metadata_only = true;
		/*
		 * If this is a read-only connection, check if the checkpoint
		 * LSN in the metadata file is up to date, indicating a clean
		 * shutdown.
		 */
		if (F_ISSET(conn, WT_CONN_READONLY)) {
			WT_ERR(__wt_log_needs_recovery(
			    session, &metafile->ckpt_lsn, &needs_rec));
			if (needs_rec)
				WT_ERR_MSG(session, WT_RUN_RECOVERY,
				    "Read-only database needs recovery");
		}
		if (WT_IS_INIT_LSN(&metafile->ckpt_lsn))
			WT_ERR(__wt_log_scan(session,
			    NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r));
		else {
			/*
			 * Start at the last checkpoint LSN referenced in the
			 * metadata.  If we see the end of a checkpoint while
			 * scanning, we will change the full scan to start from
			 * there.
			 */
			r.ckpt_lsn = metafile->ckpt_lsn;
			ret = __wt_log_scan(session,
			    &metafile->ckpt_lsn, 0, __txn_log_recover, &r);
			if (ret == ENOENT)
				ret = 0;
			WT_ERR(ret);
		}
	}

	/* Scan the metadata to find the live files and their IDs. */
	WT_ERR(__recovery_file_scan(&r));

	/*
	 * We no longer need the metadata cursor: close it to avoid pinning any
	 * resources that could block eviction during recovery.
	 */
	r.files[0].c = NULL;
	WT_ERR(metac->close(metac));

	/*
	 * Now, recover all the files apart from the metadata.
	 * Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
	 */
	r.metadata_only = false;
	WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY,
	    "Main recovery loop: starting at %" PRIu32 "/%" PRIu32,
	    r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset));
	WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec));
	/*
	 * Check if the database was shut down cleanly.  If not
	 * return an error if the user does not want automatic
	 * recovery.
	 */
	if (needs_rec &&
	    (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) ||
	     F_ISSET(conn, WT_CONN_READONLY))) {
		if (F_ISSET(conn, WT_CONN_READONLY))
			WT_ERR_MSG(session, WT_RUN_RECOVERY,
			    "Read-only database needs recovery");
		WT_ERR(WT_RUN_RECOVERY);
	}

	if (F_ISSET(conn, WT_CONN_READONLY))
		goto done;

	/*
	 * Recovery can touch more data than fits in cache, so it relies on
	 * regular eviction to manage paging.  Start eviction threads for
	 * recovery without LAS cursors.
	 */
	WT_ERR(__wt_evict_create(session));
	eviction_started = true;

	/*
	 * Always run recovery even after a clean shutdown, as long as this
	 * is not a read-only connection.  We can consider skipping it in
	 * the future.
	 */
	if (WT_IS_INIT_LSN(&r.ckpt_lsn))
		WT_ERR(__wt_log_scan(session, NULL,
		    WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
		    __txn_log_recover, &r));
	else {
		ret = __wt_log_scan(session, &r.ckpt_lsn,
		    WT_LOGSCAN_RECOVER, __txn_log_recover, &r);
		if (ret == ENOENT)
			ret = 0;
		WT_ERR(ret);
	}

	conn->next_file_id = r.max_fileid;

	/*
	 * If recovery ran successfully forcibly log a checkpoint so the next
	 * open is fast and keep the metadata up to date with the checkpoint
	 * LSN and archiving.
	 */
	WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));

done:	FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE);
err:	WT_TRET(__recovery_free(&r));
	__wt_free(session, config);

	if (ret != 0)
		__wt_err(session, ret, "Recovery failed");

	/*
	 * Destroy the eviction threads that were started in support of
	 * recovery.  They will be restarted once the lookaside table is
	 * created.
	 */
	if (eviction_started)
		WT_TRET(__wt_evict_destroy(session));

	WT_TRET(session->iface.close(&session->iface, NULL));
	F_CLR(conn, WT_CONN_RECOVERING);

	return (ret);
}
Example #30
/*
 * __backup_start --
 *	Start a backup.
 */
static int
__backup_start(
    WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[])
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	int exist, log_only, target_list;

	conn = S2C(session);

	cb->next = 0;
	cb->list = NULL;

	/*
	 * Single thread hot backups: we're holding the schema lock, so we
	 * know we'll serialize with other attempts to start a hot backup.
	 */
	if (conn->hot_backup)
		WT_RET_MSG(
		    session, EINVAL, "there is already a backup cursor open");

	/*
	 * The hot backup copy is done outside of WiredTiger, which means file
	 * blocks can't be freed and re-allocated until the backup completes.
	 * The checkpoint code checks the backup flag, and if a backup cursor
	 * is open checkpoints aren't discarded.  We release the lock as soon
	 * as we've set the flag, we don't want to block checkpoints, we just
	 * want to make sure no checkpoints are deleted.  The checkpoint code
	 * holds the lock until it's finished the checkpoint, otherwise we
	 * could start a hot backup that would race with an already-started
	 * checkpoint.
	 */
	__wt_spin_lock(session, &conn->hot_backup_lock);
	conn->hot_backup = 1;
	__wt_spin_unlock(session, &conn->hot_backup_lock);

	/* Create the hot backup file. */
	WT_ERR(__backup_file_create(session, cb, 0));

	/*
	 * If a list of targets was specified, work our way through them.
	 * Else, generate a list of all database objects.
	 *
	 * Include log files if doing a full backup, and copy them before
	 * copying data files to avoid rolling the metadata forward across
	 * a checkpoint that completes during the backup.
	 */
	target_list = 0;
	WT_ERR(__backup_uri(session, cb, cfg, &target_list, &log_only));

	if (!target_list) {
		WT_ERR(__backup_log_append(session, cb, 1));
		WT_ERR(__backup_all(session, cb));
	}

	/* Add the hot backup and standard WiredTiger files to the list. */
	if (log_only) {
		/*
		 * Close any hot backup file.
		 * We're about to open the incremental backup file.
		 */
		if (cb->bfp != NULL) {
			WT_TRET(fclose(cb->bfp) == 0 ? 0 : __wt_errno());
			cb->bfp = NULL;
		}
		WT_ERR(__backup_file_create(session, cb, log_only));
		WT_ERR(__backup_list_append(
		    session, cb, WT_INCREMENTAL_BACKUP));
	} else {
		WT_ERR(__backup_list_append(
		    session, cb, WT_METADATA_BACKUP));
		WT_ERR(__wt_exist(session, WT_BASECONFIG, &exist));
		if (exist)
			WT_ERR(__backup_list_append(
			    session, cb, WT_BASECONFIG));
		WT_ERR(__wt_exist(session, WT_USERCONFIG, &exist));
		if (exist)
			WT_ERR(__backup_list_append(
			    session, cb, WT_USERCONFIG));
		WT_ERR(__backup_list_append(session, cb, WT_WIREDTIGER));
	}

err:	/* Close the hot backup file. */
	if (cb->bfp != NULL) {
		WT_TRET(fclose(cb->bfp) == 0 ? 0 : __wt_errno());
		cb->bfp = NULL;
	}
	if (ret != 0) {
		WT_TRET(__backup_cleanup_handles(session, cb));
		WT_TRET(__backup_stop(session));
	}

	return (ret);
}