示例#1
0
/*
 * __wt_clsm_open_bulk --
 *	Finish opening an LSM cursor for bulk load: limit the cursor to insert
 *	and close, create the tree's first chunk and open a raw bulk cursor on
 *	that chunk. Returns 0 on success, otherwise the first error hit.
 */
int
__wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[])
{
	WT_CURSOR *chunk_cursor, *lsm_cursor;
	WT_DECL_RET;
	WT_LSM_TREE *tree;
	WT_SESSION_IMPL *session;

	chunk_cursor = NULL;
	lsm_cursor = &clsm->iface;
	session = (WT_SESSION_IMPL *)lsm_cursor->session;
	tree = clsm->lsm_tree;

	F_SET(clsm, WT_CLSM_BULK);

	/* A bulk cursor only supports insert and close. */
	__wt_cursor_set_notsup(lsm_cursor);
	lsm_cursor->close = __clsm_close_bulk;
	lsm_cursor->insert = __clsm_insert_bulk;

	/*
	 * Create the first chunk by switching the tree inline rather than
	 * through the LSM worker threads. This is the only place that is
	 * done; it is safe because the LSM tree is locked exclusively. The
	 * switch has to be inline because it needs the schema lock, and
	 * online index creation opens a bulk cursor while already holding
	 * the schema lock.
	 */
	WT_WITH_SCHEMA_LOCK(session, ret,
	    ret = __wt_lsm_tree_switch(session, tree));
	if (ret != 0)
		return (ret);

	/*
	 * The bulk cursor is not a regular LSM chunk cursor, but it lives in
	 * the standard chunk cursor array. A bloom filter slot is allocated
	 * as well because it keeps cleanup simple. On error, cursor close
	 * releases whatever was allocated here.
	 */
	WT_RET(__wt_calloc_one(session, &clsm->blooms));
	clsm->bloom_alloc = 1;
	WT_RET(__wt_calloc_one(session, &clsm->cursors));
	clsm->cursor_alloc = 1;
	clsm->nchunks = 1;

	/*
	 * Open the bulk cursor on the first chunk, passing the application
	 * configuration through so the chunk is opened for bulk access.
	 */
	WT_RET(__wt_open_cursor(session,
	    tree->chunk[0]->uri, lsm_cursor, cfg, &chunk_cursor));

	/* LSM cursors are always raw. */
	F_SET(chunk_cursor, WT_CURSTD_RAW);
	clsm->cursors[0] = chunk_cursor;

	return (0);
}
示例#2
0
/*
 * __wt_clsm_open_bulk --
 *	Finish opening an LSM cursor for bulk load: limit the cursor to insert
 *	and close, have the first chunk created and open a raw bulk cursor on
 *	that chunk. Returns 0 on success, otherwise the first error hit.
 */
int
__wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[])
{
	WT_CURSOR *chunk_cursor, *lsm_cursor;
	WT_LSM_TREE *tree;
	WT_SESSION_IMPL *session;

	chunk_cursor = NULL;
	lsm_cursor = &clsm->iface;
	session = (WT_SESSION_IMPL *)lsm_cursor->session;
	tree = clsm->lsm_tree;

	F_SET(clsm, WT_CLSM_BULK);

	/* A bulk cursor only supports insert and close. */
	__wt_cursor_set_notsup(lsm_cursor);
	lsm_cursor->close = __clsm_close_bulk;
	lsm_cursor->insert = __clsm_insert_bulk;

	/* Request the switch that sets up the first chunk, then wait for it. */
	WT_RET(__wt_clsm_request_switch(clsm));
	WT_RET(__wt_clsm_await_switch(clsm));

	/*
	 * Acquire and immediately release the LSM tree read lock so the
	 * first chunk is fully created before continuing. The tree is open
	 * exclusive, which is why the lock is not needed anywhere else here.
	 */
	WT_RET(__wt_lsm_tree_readlock(session, tree));
	WT_RET(__wt_lsm_tree_readunlock(session, tree));

	/*
	 * The bulk cursor is not a regular LSM chunk cursor, but it lives in
	 * the standard chunk cursor array. A bloom filter slot is allocated
	 * as well because it keeps cleanup simple. On error, cursor close
	 * releases whatever was allocated here.
	 */
	WT_RET(__wt_calloc_one(session, &clsm->blooms));
	clsm->bloom_alloc = 1;
	WT_RET(__wt_calloc_one(session, &clsm->cursors));
	clsm->cursor_alloc = 1;
	clsm->nchunks = 1;

	/*
	 * Open the bulk cursor on the first chunk, passing the application
	 * configuration through so the chunk is opened for bulk access.
	 */
	WT_RET(__wt_open_cursor(session,
	    tree->chunk[0]->uri, lsm_cursor, cfg, &chunk_cursor));

	/* LSM cursors are always raw. */
	F_SET(chunk_cursor, WT_CURSTD_RAW);
	clsm->cursors[0] = chunk_cursor;

	return (0);
}
示例#3
0
/*
 * __wt_cache_create --
 *	Create the connection's eviction cache: allocate it, apply the
 *	configuration and set up the eviction condition variables, locks and
 *	LRU queue. Returns 0 on success; on failure after configuration the
 *	cache is destroyed before the error is returned.
 */
int __wt_cache_create(WT_SESSION_IMPL* session, const char* cfg[])
{
	WT_CACHE *new_cache;
	WT_CONNECTION_IMPL *connection;
	WT_DECL_RET;

	connection = S2C(session);

	WT_RET(__wt_calloc_one(session, &connection->cache));
	new_cache = connection->cache;

	/* Apply the cache configuration. */
	WT_RET(__wt_cache_config(session, 0, cfg));

	/* The eviction target has to be strictly below the trigger. */
	if (new_cache->eviction_target >= new_cache->eviction_trigger)
		WT_ERR_MSG(session, EINVAL, "eviction target must be lower than the eviction trigger");

	/* Create the eviction condition variables and spinlocks. */
	WT_ERR(__wt_cond_alloc(session, "cache eviction server", 0, &new_cache->evict_cond));
	WT_ERR(__wt_cond_alloc(session, "eviction waiters", 0, &new_cache->evict_waiter_cond));
	WT_ERR(__wt_spin_init(session, &new_cache->evict_lock, "cache eviction"));
	WT_ERR(__wt_spin_init(session, &new_cache->evict_walk_lock, "cache walk"));

	/* Allocate the LRU eviction queue. */
	new_cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
	WT_ERR(__wt_calloc_def(session, new_cache->evict_slots, &new_cache->evict));

	/* Initialize the cache statistics. */
	__wt_cache_stats_update(session);

	if (0) {
err:
		/* A failure while destroying the cache replaces the original error. */
		WT_RET(__wt_cache_destroy(session));
	}
	return ret;
}
示例#4
0
文件: os_fs.c 项目: GYGit/mongo
/*
 * __wt_os_posix --
 *	Allocate a file-system structure, fill in its method table with the
 *	POSIX implementations and install it on the connection. Returns 0 on
 *	success or the allocation error.
 */
int
__wt_os_posix(WT_SESSION_IMPL *session)
{
	WT_FILE_SYSTEM *posix_fs;

	WT_RET(__wt_calloc_one(session, &posix_fs));

	/* Directory listing methods. */
	posix_fs->fs_directory_list = __wt_posix_directory_list;
	posix_fs->fs_directory_list_free = __wt_posix_directory_list_free;

	/* Per-file methods. */
	posix_fs->fs_exist = __posix_fs_exist;
	posix_fs->fs_open_file = __posix_open_file;
	posix_fs->fs_remove = __posix_fs_remove;
	posix_fs->fs_rename = __posix_fs_rename;
	posix_fs->fs_size = __posix_fs_size;

	/* File-system shutdown. */
	posix_fs->terminate = __posix_terminate;

	/* Publish the fully initialized table on the connection. */
	S2C(session)->file_system = posix_fs;

	return (0);
}
示例#5
0
文件: os_mtx_cond.c 项目: radik/mongo
/*
 * __wt_cond_alloc --
 *	Allocate and initialize a condition variable.
 *
 *	session: session handle; per the note below it may be NULL.
 *	name: stored by reference in the condition variable, not copied.
 *	is_signalled: non-zero starts the waiter count at -1, zero at 0.
 *	condp: set to the new condition variable on success only.
 *
 *	Returns 0 on success, otherwise the allocation or pthread error; on
 *	error nothing is left allocated or initialized.
 */
int
__wt_cond_alloc(WT_SESSION_IMPL *session,
    const char *name, int is_signalled, WT_CONDVAR **condp)
{
	WT_CONDVAR *cond;
	WT_DECL_RET;

	/*
	 * !!!
	 * This function MUST handle a NULL session handle.
	 */
	WT_RET(__wt_calloc_one(session, &cond));

	WT_ERR(pthread_mutex_init(&cond->mtx, NULL));

	/*
	 * Initialize the condition variable to permit self-blocking. If that
	 * fails, the mutex initialized above must be destroyed before the
	 * structure is freed, otherwise it is leaked.
	 */
	if ((ret = pthread_cond_init(&cond->cond, NULL)) != 0) {
		(void)pthread_mutex_destroy(&cond->mtx);
		goto err;
	}

	cond->name = name;
	cond->waiters = is_signalled ? -1 : 0;

	*condp = cond;
	return (0);

err:	__wt_free(session, cond);
	return (ret);
}
示例#6
0
文件: bloom.c 项目: 7segments/mongo-1
/*
 * __bloom_init --
 *	Allocate a WT_BLOOM handle, copy the URI into it and build its
 *	configuration string from the caller's configuration (optional, may be
 *	NULL) followed by the standard bloom table configuration. On success
 *	*bloomp is the new handle; on failure it is NULL and an error is
 *	returned.
 */
static int
__bloom_init(WT_SESSION_IMPL *session,
    const char *uri, const char *config, WT_BLOOM **bloomp)
{
	WT_BLOOM *handle;
	WT_DECL_RET;
	const char *user_config;
	size_t config_size;

	*bloomp = NULL;
	user_config = config == NULL ? "" : config;

	WT_RET(__wt_calloc_one(session, &handle));

	WT_ERR(__wt_strdup(session, uri, &handle->uri));

	/*
	 * Room for the user settings, a comma, the standard settings and the
	 * terminating nul byte.
	 */
	config_size = strlen(user_config) + strlen(WT_BLOOM_TABLE_CONFIG) + 2;
	WT_ERR(__wt_calloc_def(session, config_size, &handle->config));

	/* The standard settings go last so they override user settings. */
	(void)snprintf(handle->config, config_size,
	    "%s,%s", user_config, WT_BLOOM_TABLE_CONFIG);

	handle->session = session;
	*bloomp = handle;

	if (0) {
err:		__wt_free(session, handle->uri);
		__wt_free(session, handle->config);
		__wt_free(session, handle->bitstring);
		__wt_free(session, handle);
	}
	return (ret);
}
示例#7
0
/*
 * __wt_dlopen --
 *	Open a dynamic library (Windows).
 *
 *	path: library to open; NULL means the current binary.
 *	dlhp: set to the new handle on success only.
 *
 *	Returns 0 on success and a non-zero error otherwise; on error the
 *	partially built handle is freed. Loading a named library is not
 *	implemented: that path traps into the debugger and then fails with
 *	ENOTSUP.
 */
int
__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
{
	WT_DECL_RET;
	WT_DLH *dlh;

	WT_RET(__wt_calloc_one(session, &dlh));
	WT_ERR(__wt_strdup(session, path, &dlh->name));

	/* NULL means load from the current binary */
	if (path == NULL) {
		/*
		 * Windows reports failure as FALSE; test the result directly
		 * rather than storing it in (and later inverting) ret. The
		 * message takes no arguments: path is NULL on this branch and
		 * must not be handed to a %s conversion.
		 */
		if (GetModuleHandleExA(0, NULL, &dlh->handle) == FALSE)
			WT_ERR_MSG(session, __wt_errno(),
			    "GetModuleHandleEx: unable to get a handle for "
			    "the current binary");
	} else {
		// TODO: load dll here
		DebugBreak();

		/*
		 * Nothing was loaded: fail cleanly instead of returning a
		 * handle that refers to no library.
		 */
		WT_ERR(ENOTSUP);
	}

	*dlhp = dlh;
	if (0) {
err:		__wt_free(session, dlh->name);
		__wt_free(session, dlh);
	}
	return (ret);
}
示例#8
0
文件: config_api.c 项目: ksuarz/mongo
/*
 * wiredtiger_config_parser_open --
 *	Create a configuration parser over the first len bytes of config.
 *	The parser keeps pointers into the caller's string rather than a copy.
 *	On success *config_parserp is the new parser; on allocation failure it
 *	is NULL and the error is returned.
 */
int
wiredtiger_config_parser_open(WT_SESSION *wt_session,
    const char *config, size_t len, WT_CONFIG_PARSER **config_parserp)
{
	static const WT_CONFIG_PARSER methods = {
		__config_parser_close,
		__config_parser_next,
		__config_parser_get
	};
	WT_CONFIG_ITEM whole_config =
	    { config, len, 0, WT_CONFIG_ITEM_STRING };
	WT_CONFIG_PARSER_IMPL *parser;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;
	*config_parserp = NULL;

	WT_RET(__wt_calloc_one(session, &parser));
	parser->session = session;
	parser->iface = methods;

	/*
	 * The parser holds two views of the same string: a WT_CONFIG used to
	 * iterate through the configuration and a WT_CONFIG_ITEM used for
	 * get calls.
	 */
	__wt_config_initn(session, &parser->config, config, len);
	memcpy(&parser->config_item, &whole_config, sizeof(whole_config));

	*config_parserp = (WT_CONFIG_PARSER *)parser;
	return (0);
}
示例#9
0
/*
 * __wt_dlopen --
 *	Open a dynamic library (Windows).
 *
 *	path: library to open; NULL means the current binary, in which case
 *	    the handle is named "local".
 *	dlhp: set to the new handle when no error occurred.
 *
 *	Returns 0 on success, otherwise an error; on error the partially built
 *	handle is freed. Loading a named library is not implemented yet (see
 *	the TODO below).
 */
int
__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
{
	DWORD windows_error;
	WT_DECL_RET;
	WT_DLH *dlh;

	WT_RET(__wt_calloc_one(session, &dlh));

	/*
	 * Copy the name exactly once: duplicating into dlh->name a second
	 * time would overwrite, and so leak, the first copy.
	 */
	WT_ERR(__wt_strdup(session, path == NULL ? "local" : path, &dlh->name));

	/* NULL means load from the current binary */
	if (path == NULL) {
		if (GetModuleHandleExA(
		    0, NULL, (HMODULE *)&dlh->handle) == FALSE) {
			windows_error = __wt_getlasterror();

			/*
			 * Report the handle's name: path is NULL on this
			 * branch and must not be passed to %s.
			 */
			__wt_errx(session,
			    "GetModuleHandleEx: %s: %s",
			    dlh->name,
			    __wt_formatmessage(session, windows_error));
			WT_ERR(__wt_map_windows_error(windows_error));
		}
	} else {
		// TODO: load dll here
		DebugBreak();
	}

	*dlhp = dlh;
	if (0) {
err:		__wt_free(session, dlh->name);
		__wt_free(session, dlh);
	}
	return (ret);
}
示例#10
0
文件: row_modify.c 项目: qihsh/mongo
/*
 * __wt_page_modify_alloc --
 *	Allocate a modification structure and try to install it on the page.
 *	Returns 0 unless the allocation fails; losing the install race to
 *	another thread is not an error.
 */
int
__wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_CONNECTION_IMPL *conn;
	WT_PAGE_MODIFY *mod;

	WT_RET(__wt_calloc_one(session, &mod));

	/*
	 * Pick the page's spinlock from the connection-wide counter. The
	 * increment is not protected here; the compare-and-swap below is
	 * relied on to keep any race from mattering much.
	 */
	conn = S2C(session);
	mod->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS;

	/*
	 * Several threads may decide to modify the page at the same time.
	 * If another thread installed its structure first, ours is not
	 * needed: discard it.
	 */
	if (!__wt_atomic_cas_ptr(&page->modify, NULL, mod)) {
		__wt_free(session, mod);
		return (0);
	}

	/* Ours was installed: account for it in the page's memory footprint. */
	__wt_cache_page_inmem_incr(session, page, sizeof(*mod));
	return (0);
}
示例#11
0
文件: cur_backup.c 项目: DINKIN/mongo
/*
 * __wt_curbackup_open --
 *	WT_SESSION->open_cursor method for the backup cursor type. The cursor
 *	is registered as the session's backup cursor, and the backup is started
 *	before the cursor is initialized. On error the cursor is closed and
 *	*cursorp is set to NULL.
 */
int
__wt_curbackup_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value_notsup,	/* get-value */
	    __wt_cursor_set_key_notsup,		/* set-key */
	    __wt_cursor_set_value_notsup,	/* set-value */
	    __wt_cursor_compare_notsup,		/* compare */
	    __wt_cursor_equals_notsup,		/* equals */
	    __curbackup_next,			/* next */
	    __wt_cursor_notsup,			/* prev */
	    __curbackup_reset,			/* reset */
	    __wt_cursor_notsup,			/* search */
	    __wt_cursor_search_near_notsup,	/* search-near */
	    __wt_cursor_notsup,			/* insert */
	    __wt_cursor_modify_notsup,		/* modify */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_notsup,			/* reserve */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __curbackup_close);			/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_BACKUP *backup;
	WT_DECL_RET;

	/* The casts between the two cursor types rely on this layout. */
	WT_STATIC_ASSERT(offsetof(WT_CURSOR_BACKUP, iface) == 0);

	backup = NULL;
	WT_RET(__wt_calloc_one(session, &backup));

	cursor = &backup->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->key_format = "S";	/* Keys are the file names. */
	cursor->value_format = "";	/* There is no value. */

	session->bkp_cursor = backup;

	/*
	 * Start the backup and fill in the cursor's list. The schema lock is
	 * taken (inside the checkpoint lock) because creating the copy needs
	 * a consistent view.
	 */
	WT_WITH_CHECKPOINT_LOCK(session,
	    WT_WITH_SCHEMA_LOCK(session,
		ret = __backup_start(session, backup, cfg)));
	WT_ERR(ret);

	WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
	return (0);

err:	WT_TRET(__curbackup_close(cursor));
	*cursorp = NULL;
	return (ret);
}
示例#12
0
/*
 * __wt_logmgr_create --
 *	Initialize the log subsystem (before running recovery). Does nothing
 *	beyond parsing the configuration when logging is not configured.
 *	Returns 0 on success, otherwise the first error hit.
 */
int
__wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	bool enabled;

	conn = S2C(session);

	/* Parse the configuration; without logging there's nothing to do. */
	WT_RET(__logmgr_config(session, cfg, &enabled, false));
	if (!enabled)
		return (0);

	FLD_SET(conn->log_flags, WT_CONN_LOG_ENABLED);

	/* Logging is on: allocate the WT_LOG structure. */
	WT_RET(__wt_calloc_one(session, &conn->log));
	log = conn->log;

	/* Locks. */
	WT_RET(__wt_spin_init(session, &log->log_lock, "log"));
	WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot"));
	WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync"));
	WT_RET(__wt_spin_init(session, &log->log_writelsn_lock,
	    "log write LSN"));
	WT_RET(__wt_rwlock_alloc(session,
	    &log->log_archive_lock, "log archive lock"));

	/*
	 * The allocation size is the log alignment, raised to the
	 * connection's buffer alignment when that is larger and direct I/O
	 * is configured for log files.
	 */
	log->allocsize = WT_LOG_ALIGN;
	if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG))
		log->allocsize =
		    WT_MAX((uint32_t)conn->buffer_alignment, WT_LOG_ALIGN);

	/* LSNs. */
	WT_INIT_LSN(&log->alloc_lsn);
	WT_INIT_LSN(&log->ckpt_lsn);
	WT_INIT_LSN(&log->first_lsn);
	WT_INIT_LSN(&log->sync_lsn);
	WT_INIT_LSN(&log->trunc_lsn);
	WT_INIT_LSN(&log->write_lsn);
	WT_INIT_LSN(&log->write_start_lsn);

	/*
	 * Only the file number matters for directory sync, so this LSN has
	 * to start out as zero.
	 */
	WT_ZERO_LSN(&log->sync_dir_lsn);

	log->fileid = 0;

	/* Condition variables, then open the log and set up the slots. */
	WT_RET(__wt_cond_alloc(
	    session, "log sync", false, &log->log_sync_cond));
	WT_RET(__wt_cond_alloc(
	    session, "log write", false, &log->log_write_cond));
	WT_RET(__wt_log_open(session));

	return (__wt_log_slot_init(session));
}
示例#13
0
/*
 * __wt_cache_create --
 *	Create the underlying cache: allocate it on the connection, apply the
 *	configuration and set up the eviction condition variables, locks and
 *	LRU queue. Returns 0 on success; on failure after configuration the
 *	cache is destroyed before the error is returned.
 */
int
__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CACHE *new_cache;
	WT_CONNECTION_IMPL *connection;
	WT_DECL_RET;

	connection = S2C(session);
	WT_ASSERT(session, connection->cache == NULL);

	WT_RET(__wt_calloc_one(session, &connection->cache));
	new_cache = connection->cache;

	/* Run-time configuration is handled by a routine shared with reconfigure. */
	WT_RET(__wt_cache_config(session, false, cfg));

	/*
	 * Start the read generation above the lowest possible value: that
	 * value has a special meaning, it marks a page for forcible eviction,
	 * and must not be reached by accident.
	 */
	new_cache->read_gen = WT_READGEN_START_VALUE;

	/*
	 * With a target at or above the trigger no work would ever get done,
	 * so reject that configuration.
	 */
	if (new_cache->eviction_target >= new_cache->eviction_trigger)
		WT_ERR_MSG(session, EINVAL,
		    "eviction target must be lower than the eviction trigger");

	/* Condition variables and locks used by eviction. */
	WT_ERR(__wt_cond_auto_alloc(session, "cache eviction server",
	    false, 10000, WT_MILLION, &new_cache->evict_cond));
	WT_ERR(__wt_cond_alloc(session,
	    "eviction waiters", false, &new_cache->evict_waiter_cond));
	WT_ERR(__wt_spin_init(
	    session, &new_cache->evict_lock, "cache eviction"));
	WT_ERR(__wt_spin_init(
	    session, &new_cache->evict_walk_lock, "cache walk"));

	/* Allocate the LRU eviction queue. */
	new_cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
	WT_ERR(__wt_calloc_def(session,
	    new_cache->evict_slots, &new_cache->evict_queue));

	/*
	 * Some values are read and written directly in the cache statistics
	 * (instead of keeping a second copy), so configure them now.
	 */
	__wt_cache_stats_update(session);

	if (0) {
		/* A failure while destroying the cache replaces the original error. */
err:		WT_RET(__wt_cache_destroy(session));
	}
	return (ret);
}
示例#14
0
/*
 * __curjoin_entry_iter_init --
 *	Initialize an iteration for the index managed by a join entry.
 *
 *	A new cursor is opened on the same URI as the entry's first endpoint
 *	cursor (with the join cursor's projection appended, if it has one) and
 *	positioned where that endpoint cursor is. The new cursor is opened
 *	raw when the join cursor is raw.
 *
 *	On success *iterp is the new iterator, which refers to the new cursor.
 *	On failure *iterp is not modified, the new cursor (if it was opened) is
 *	closed and an error is returned.
 */
static int
__curjoin_entry_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_CURSOR_JOIN_ENTRY *entry, WT_CURSOR_JOIN_ITER **iterp)
{
	WT_CURSOR *newcur;
	WT_CURSOR *to_dup;
	WT_DECL_RET;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };
	const char *def_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), NULL };
	const char *uri, **config;
	char *uribuf;
	WT_CURSOR_JOIN_ITER *iter;
	size_t size;

	iter = NULL;
	newcur = NULL;
	uribuf = NULL;
	to_dup = entry->ends[0].cursor;

	uri = to_dup->uri;
	if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
		config = &raw_cfg[0];
	else
		config = &def_cfg[0];

	/* With a projection, the URI to open is "<uri><projection>". */
	if (cjoin->projection != NULL) {
		size = strlen(uri) + strlen(cjoin->projection) + 1;
		WT_ERR(__wt_calloc(session, size, 1, &uribuf));
		snprintf(uribuf, size, "%s%s", uri, cjoin->projection);
		uri = uribuf;
	}

	/*
	 * Don't trust the output argument after a failed open: make sure the
	 * error path never tries to close a cursor that was not opened.
	 */
	if ((ret = __wt_open_cursor(session,
	    uri, (WT_CURSOR *)cjoin, config, &newcur)) != 0) {
		newcur = NULL;
		goto err;
	}
	WT_ERR(__wt_cursor_dup_position(to_dup, newcur));
	WT_ERR(__wt_calloc_one(session, &iter));
	iter->cjoin = cjoin;
	iter->session = session;
	iter->entry = entry;
	iter->cursor = newcur;
	iter->advance = false;
	*iterp = iter;

	if (0) {
		/*
		 * Nothing takes ownership of the new cursor on failure, so
		 * close it here rather than leaving it open.
		 */
err:		if (newcur != NULL)
			WT_TRET(newcur->close(newcur));
		__wt_free(session, iter);
	}
	__wt_free(session, uribuf);
	return (ret);
}
示例#15
0
文件: os_mtx_rw.c 项目: qihsh/mongo
/*
 * __wt_rwlock_alloc --
 *	Allocate and initialize a read/write lock. The name is stored by
 *	reference, not copied. On success *rwlockp is the new lock; on
 *	failure an error is returned.
 */
int
__wt_rwlock_alloc(
    WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name)
{
	WT_RWLOCK *new_lock;

	/* Trace the allocation when mutex verbose messages are enabled. */
	WT_RET(__wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name));

	/* Hand the lock to the caller only once it is fully set up. */
	WT_RET(__wt_calloc_one(session, &new_lock));
	new_lock->name = name;
	*rwlockp = new_lock;

	return (0);
}
示例#16
0
文件: cur_join.c 项目: mongodb/mongo
/*
 * __curjoin_iter_init --
 *	Allocate an iterator for a join cursor, attach it to the join cursor
 *	and point it at entry 0. *iterp is cleared first and then receives
 *	the new iterator. Returns 0 on success, otherwise the allocation or
 *	set-entry error.
 */
static int
__curjoin_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_CURSOR_JOIN_ITER **iterp)
{
	WT_CURSOR_JOIN_ITER *new_iter;

	*iterp = NULL;
	WT_RET(__wt_calloc_one(session, iterp));

	/* Link the iterator and the join cursor to each other. */
	new_iter = *iterp;
	new_iter->session = session;
	new_iter->cjoin = cjoin;
	cjoin->iter = new_iter;

	/* Start the iteration at the first entry. */
	return (__curjoin_iter_set_entry(new_iter, 0));
}
示例#17
0
/*
 * __wt_cache_create --
 *	Create the underlying cache: allocate it on the connection, apply the
 *	configuration and set up the eviction condition variables, locks and
 *	LRU queue. Returns 0 on success; on failure after configuration the
 *	cache is destroyed before the error is returned.
 */
int
__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CACHE *new_cache;
	WT_CONNECTION_IMPL *connection;
	WT_DECL_RET;

	connection = S2C(session);
	WT_ASSERT(session, connection->cache == NULL);

	WT_RET(__wt_calloc_one(session, &connection->cache));
	new_cache = connection->cache;

	/* Run-time configuration is handled by a routine shared with reconfigure. */
	WT_RET(__wt_cache_config(session, 0, cfg));

	/*
	 * With a target at or above the trigger no work would ever get done,
	 * so reject that configuration.
	 */
	if (new_cache->eviction_target >= new_cache->eviction_trigger)
		WT_ERR_MSG(session, EINVAL,
		    "eviction target must be lower than the eviction trigger");

	/* Condition variables and locks used by eviction. */
	WT_ERR(__wt_cond_alloc(session,
	    "cache eviction server", 0, &new_cache->evict_cond));
	WT_ERR(__wt_cond_alloc(session,
	    "eviction waiters", 0, &new_cache->evict_waiter_cond));
	WT_ERR(__wt_spin_init(
	    session, &new_cache->evict_lock, "cache eviction"));
	WT_ERR(__wt_spin_init(
	    session, &new_cache->evict_walk_lock, "cache walk"));

	/* Allocate the LRU eviction queue. */
	new_cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
	WT_ERR(__wt_calloc_def(
	    session, new_cache->evict_slots, &new_cache->evict));

	/*
	 * Some values are read and written directly in the cache statistics
	 * (instead of keeping a second copy), so configure them now.
	 */
	__wt_cache_stats_update(session);

	if (0) {
		/* A failure while destroying the cache replaces the original error. */
err:		WT_RET(__wt_cache_destroy(session));
	}
	return (ret);
}
示例#18
0
/*
 * __wt_cond_alloc --
 *	Allocate and initialize a condition variable (Windows). The name is
 *	stored by reference, not copied, and the waiter count starts at zero.
 *	On success *condp is the new condition variable; the only failure is
 *	the allocation.
 */
int
__wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp)
{
	WT_CONDVAR *new_cond;

	WT_RET(__wt_calloc_one(session, &new_cond));

	new_cond->name = name;
	new_cond->waiters = 0;

	/* The critical section acts as the condition variable's mutex. */
	InitializeCriticalSection(&new_cond->mtx);

	/* Initialize the condition variable to permit self-blocking. */
	InitializeConditionVariable(&new_cond->cond);

	*condp = new_cond;
	return (0);
}
示例#19
0
/*
 * __wt_curconfig_open --
 *	WT_SESSION->open_cursor method for config cursors. The cursor has
 *	string keys and values and supports get/set of key and value, reset
 *	(a no-op) and close; the other methods in its table are "not
 *	supported" stubs. On failure the cursor is freed and the error is
 *	returned.
 */
int
__wt_curconfig_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __wt_cursor_compare_notsup,		/* compare */
	    __wt_cursor_equals_notsup,		/* equals */
	    __wt_cursor_notsup,			/* next */
	    __wt_cursor_notsup,			/* prev */
	    __wt_cursor_noop,			/* reset */
	    __wt_cursor_notsup,			/* search */
	    __wt_cursor_search_near_notsup,	/* search-near */
	    __wt_cursor_notsup,			/* insert */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __curconfig_close);
	WT_CURSOR *cursor;
	WT_CURSOR_CONFIG *config_cursor;
	WT_DECL_RET;

	/* The casts between the two cursor types rely on this layout. */
	WT_STATIC_ASSERT(offsetof(WT_CURSOR_CONFIG, iface) == 0);

	WT_UNUSED(uri);

	WT_RET(__wt_calloc_one(session, &config_cursor));

	cursor = &config_cursor->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->value_format = "S";
	cursor->key_format = cursor->value_format;

	/*
	 * __wt_cursor_init is the last step, so a failure leaves nothing to
	 * undo other than freeing the structure.
	 */
	WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
	return (0);

err:	__wt_free(session, config_cursor);
	return (ret);
}
示例#20
0
/*
 * wiredtiger_pack_start --
 *	Open a stream for packing values, described by format, into the len
 *	bytes at buffer. On success *psp is the new stream; if the format
 *	cannot be initialized the stream is closed and the error is returned.
 */
int
wiredtiger_pack_start(WT_SESSION *wt_session,
	const char *format, void *buffer, size_t len, WT_PACK_STREAM **psp)
{
	WT_DECL_RET;
	WT_PACK_STREAM *stream;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)wt_session;

	WT_RET(__wt_calloc_one(session, &stream));
	WT_ERR(__pack_init(session, &stream->pack, format));

	/* The stream starts at the beginning of the caller's buffer. */
	stream->start = buffer;
	stream->p = stream->start;
	stream->end = stream->p + len;

	*psp = stream;
	return (0);

err:	(void)wiredtiger_pack_close(stream, NULL);
	return (ret);
}
示例#21
0
/*
 * __wt_dlopen --
 *	Open a dynamic library (POSIX) with lazy symbol binding. On success
 *	*dlhp is a new handle holding a copy of the path and the library
 *	handle; on failure the partially built handle is freed and an error
 *	is returned.
 */
int
__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
{
	WT_DECL_RET;
	WT_DLH *library;

	WT_RET(__wt_calloc_one(session, &library));
	WT_ERR(__wt_strdup(session, path, &library->name));

	library->handle = dlopen(path, RTLD_LAZY);
	if (library->handle == NULL)
		WT_ERR_MSG(
		    session, __wt_errno(), "dlopen(%s): %s", path, dlerror());

	*dlhp = library;
	return (0);

err:	__wt_free(session, library->name);
	__wt_free(session, library);
	return (ret);
}
示例#22
0
/*
 * __wt_fopen --
 *	Open a stream handle on top of a regular file handle. Streams opened
 *	for append or write get the flush and printf methods; all others get
 *	getline. On success *fstrp is the new stream; on failure it is NULL,
 *	the file handle is closed and an error is returned.
 */
int
__wt_fopen(WT_SESSION_IMPL *session,
    const char *name, uint32_t open_flags, uint32_t flags, WT_FSTREAM **fstrp)
{
	WT_DECL_RET;
	WT_FH *fh;
	WT_FSTREAM *stream;

	stream = NULL;
	*fstrp = NULL;

	WT_RET(__wt_open(
	    session, name, WT_OPEN_FILE_TYPE_REGULAR, open_flags, &fh));

	WT_ERR(__wt_calloc_one(session, &stream));
	stream->close = __fstream_close;
	stream->fh = fh;
	stream->flags = flags;
	stream->name = fh->name;

	/* An append stream starts writing at the current end of the file. */
	WT_ERR(__wt_filesize(session, fh, &stream->size));
	if (LF_ISSET(WT_STREAM_APPEND))
		stream->off = stream->size;

	if (!LF_ISSET(WT_STREAM_APPEND | WT_STREAM_WRITE)) {
		/* Read stream: only getline is supported. */
		WT_ASSERT(session, LF_ISSET(WT_STREAM_READ));
		stream->fstr_flush = __fstream_flush_notsup;
		stream->fstr_getline = __fstream_getline;
		stream->fstr_printf = __fstream_printf_notsup;
	} else {
		/* Write or append stream: flush and printf are supported. */
		stream->fstr_flush = __fstream_flush;
		stream->fstr_getline = __fstream_getline_notsup;
		stream->fstr_printf = __fstream_printf;
	}

	*fstrp = stream;

	if (0) {
err:		WT_TRET(__wt_close(session, &fh));
		__wt_free(session, stream);
	}
	return (ret);
}
示例#23
0
/*
 * __wt_block_manager_open --
 *	Open a file: allocate a block manager handle, set its methods and
 *	open the underlying block file. On success *bmp is the new handle; on
 *	failure it is NULL, the handle's close method is called and an error
 *	is returned.
 */
int
__wt_block_manager_open(WT_SESSION_IMPL *session,
    const char *filename, const char *cfg[],
    int forced_salvage, int readonly, uint32_t allocsize, WT_BM **bmp)
{
	WT_BM *manager;
	WT_DECL_RET;

	*bmp = NULL;

	WT_RET(__wt_calloc_one(session, &manager));

	/* The methods are set first: the error path calls close through them. */
	__bm_method_set(manager, 0);

	WT_ERR(__wt_block_open(session, filename, cfg,
	    forced_salvage, readonly, allocsize, &manager->block));

	*bmp = manager;

	if (0) {
err:		WT_TRET(manager->close(manager, session));
	}
	return (ret);
}
示例#24
0
/*
 * __wt_cond_alloc --
 *	Allocate and initialize a condition variable (Windows). The name is
 *	stored by reference, not copied. The waiter count starts at -1 when
 *	is_signalled is set and at 0 otherwise. On success *condp is the new
 *	condition variable; the only failure is the allocation.
 */
int
__wt_cond_alloc(WT_SESSION_IMPL *session,
                const char *name, bool is_signalled, WT_CONDVAR **condp)
{
    WT_CONDVAR *new_cond;

    /*
     * !!!
     * This function MUST handle a NULL session handle.
     */
    WT_RET(__wt_calloc_one(session, &new_cond));

    new_cond->name = name;
    if (is_signalled)
        new_cond->waiters = -1;
    else
        new_cond->waiters = 0;

    /* The critical section acts as the condition variable's mutex. */
    InitializeCriticalSection(&new_cond->mtx);

    /* Initialize the condition variable to permit self-blocking. */
    InitializeConditionVariable(&new_cond->cond);

    *condp = new_cond;
    return (0);
}
示例#25
0
/*
 * __wt_open --
 *	Open a file handle (Windows).
 *
 *	name: file name; also the key used to find an already-open handle.
 *	ok_create: non-zero allows the file to be created.
 *	exclusive: with ok_create, selects CREATE_ALWAYS over CREATE_NEW.
 *	dio_type: the file's type; WT_FILE_TYPE_DIRECTORY gets a handle
 *	    structure without any underlying Windows handles.
 *	fhp: set to the (possibly shared) file handle on success.
 *
 *	If the file is already open, its reference count is incremented and
 *	the existing handle is returned. Otherwise two Windows handles are
 *	opened on the file and a new handle structure is linked into the
 *	connection's hash table. Returns 0 on success, otherwise an error, in
 *	which case everything opened or allocated here is released.
 */
int
__wt_open(WT_SESSION_IMPL *session,
    const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
{
	DWORD dwCreationDisposition;
	HANDLE filehandle, filehandle_secondary;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *fh, *tfh;
	uint64_t bucket, hash;
	int direct_io, f, matched, share_mode;
	char *path;

	conn = S2C(session);
	fh = NULL;
	path = NULL;
	filehandle = INVALID_HANDLE_VALUE;
	filehandle_secondary = INVALID_HANDLE_VALUE;
	direct_io = 0;
	hash = __wt_hash_city64(name, strlen(name));
	bucket = hash % WT_HASH_ARRAY_SIZE;

	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name));

	/* Increment the reference count if we already have the file open. */
	matched = 0;
	__wt_spin_lock(session, &conn->fh_lock);
	SLIST_FOREACH(tfh, &conn->fhhash[bucket], l)
		if (strcmp(name, tfh->name) == 0) {
			++tfh->ref;
			*fhp = tfh;
			matched = 1;
			break;
		}
	__wt_spin_unlock(session, &conn->fh_lock);
	if (matched)
		return (0);

	/* For directories, create empty file handles with invalid handles */
	if (dio_type == WT_FILE_TYPE_DIRECTORY) {
		goto setupfh;
	}

	WT_RET(__wt_filename(session, name, &path));

	share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
	/*
	 * Security:
	 * The application may spawn a new process, and we don't want another
	 * process to have access to our file handles.
	 *
	 * TODO: Set tighter file permissions but set bInheritHandle to false
	 * to prevent inheritance
	 */

	f = FILE_ATTRIBUTE_NORMAL;

	dwCreationDisposition = 0;
	if (ok_create) {
		dwCreationDisposition = CREATE_NEW;
		if (exclusive)
			dwCreationDisposition = CREATE_ALWAYS;
	} else
		dwCreationDisposition = OPEN_EXISTING;

	if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
		f |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
		direct_io = 1;
	}

	if (dio_type == WT_FILE_TYPE_LOG &&
	    FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
		f |= FILE_FLAG_WRITE_THROUGH;
	}

	/* Disable read-ahead on trees: it slows down random read workloads. */
	if (dio_type == WT_FILE_TYPE_DATA ||
	    dio_type == WT_FILE_TYPE_CHECKPOINT)
		f |= FILE_FLAG_RANDOM_ACCESS;

	filehandle = CreateFileA(path,
				(GENERIC_READ | GENERIC_WRITE),
				share_mode,
				NULL,
				dwCreationDisposition,
				f,
				NULL);
	if (filehandle == INVALID_HANDLE_VALUE) {
		/*
		 * CREATE_NEW fails if the file already exists: when creation
		 * was merely allowed, fall back to opening the existing file.
		 */
		if (GetLastError() == ERROR_FILE_EXISTS && ok_create)
			filehandle = CreateFileA(path,
						(GENERIC_READ | GENERIC_WRITE),
						share_mode,
						NULL,
						OPEN_EXISTING,
						f,
						NULL);

		if (filehandle == INVALID_HANDLE_VALUE)
			WT_ERR_MSG(session, __wt_errno(),
			    direct_io ?
			    "%s: open failed with direct I/O configured, some "
			    "filesystem types do not support direct I/O" :
			    "%s", path);
	}

	/*
	 * Open a second handle to file to support allocation/truncation
	 * concurrently with reads on the file. Writes would also move the file
	 * pointer.
	 */
	filehandle_secondary = CreateFileA(path,
	    (GENERIC_READ | GENERIC_WRITE),
	    share_mode,
	    NULL,
	    OPEN_EXISTING,
	    f,
	    NULL);

	/*
	 * Check the handle that was just opened: the primary handle is known
	 * to be valid at this point, so testing it again would never detect
	 * a failure to open the secondary handle.
	 */
	if (filehandle_secondary == INVALID_HANDLE_VALUE)
		WT_ERR_MSG(session, __wt_errno(),
		    "open failed for secondary handle: %s", path);

setupfh:
	WT_ERR(__wt_calloc_one(session, &fh));
	WT_ERR(__wt_strdup(session, name, &fh->name));
	fh->name_hash = hash;
	fh->filehandle = filehandle;
	fh->filehandle_secondary = filehandle_secondary;
	fh->ref = 1;
	fh->direct_io = direct_io;

	/* Set the file's size. */
	if (dio_type != WT_FILE_TYPE_DIRECTORY)
		WT_ERR(__wt_filesize(session, fh, &fh->size));

	/* Configure file extension. */
	if (dio_type == WT_FILE_TYPE_DATA ||
	    dio_type == WT_FILE_TYPE_CHECKPOINT)
		fh->extend_len = conn->data_extend_len;

	/* Configure fallocate/posix_fallocate calls. */
	__wt_fallocate_config(session, fh);

	/*
	 * Repeat the check for a match, but then link onto the database's list
	 * of files.
	 */
	matched = 0;
	__wt_spin_lock(session, &conn->fh_lock);
	SLIST_FOREACH(tfh, &conn->fhhash[bucket], l)
		if (strcmp(name, tfh->name) == 0) {
			++tfh->ref;
			*fhp = tfh;
			matched = 1;
			break;
		}
	if (!matched) {
		WT_CONN_FILE_INSERT(conn, fh, bucket);
		WT_STAT_FAST_CONN_INCR(session, file_open);

		*fhp = fh;
	}
	__wt_spin_unlock(session, &conn->fh_lock);

	/*
	 * The cleanup below is shared by two cases: an error (entered through
	 * the label with ret set), and another thread having opened the file
	 * while the lock was released (ret is 0 and the existing handle was
	 * returned, so the one built here is discarded).
	 */
	if (matched) {
err:		if (fh != NULL) {
			__wt_free(session, fh->name);
			__wt_free(session, fh);
		}
		if (filehandle != INVALID_HANDLE_VALUE)
			(void)CloseHandle(filehandle);
		if (filehandle_secondary != INVALID_HANDLE_VALUE)
			(void)CloseHandle(filehandle_secondary);
	}

	__wt_free(session, path);
	return (ret);
}
示例#26
0
/*
 * __schema_open_table --
 *	Open a named table (internal version).
 *
 *	Looks up "table:<name>" in the metadata, builds an in-memory WT_TABLE
 *	from the stored configuration and opens the table's column groups.
 *
 *	session: the calling session; must hold the table lock (asserted).
 *	name, namelen: the table name without the "table:" prefix; the name
 *	    need not be nul-terminated.
 *	ok_incomplete: if false, a table whose column groups have not all been
 *	    created is rejected with EINVAL.
 *	tablep: set to the new table on success only.
 *
 *	Returns 0 on success. On any failure the partially built table is
 *	destroyed and the error is returned.
 */
static int
__schema_open_table(WT_SESSION_IMPL *session,
    const char *name, size_t namelen, bool ok_incomplete, WT_TABLE **tablep)
{
	WT_CONFIG cparser;
	WT_CONFIG_ITEM ckey, cval;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_TABLE *table;
	const char *tconfig;
	char *tablename;

	cursor = NULL;
	table = NULL;
	tablename = NULL;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE));

	/* Build the metadata key, "table:<name>", and take a private copy. */
	WT_ERR(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf, "table:%.*s", (int)namelen, name));
	WT_ERR(__wt_strndup(session, buf->data, buf->size, &tablename));

	/* Read the table's configuration string from the metadata. */
	WT_ERR(__wt_metadata_cursor(session, &cursor));
	cursor->set_key(cursor, tablename);
	WT_ERR(cursor->search(cursor));
	WT_ERR(cursor->get_value(cursor, &tconfig));

	/*
	 * Allocate the table and hand it the name string. The local pointer
	 * is cleared so the free on the common exit path leaves the string
	 * alone; use table->name from here on.
	 */
	WT_ERR(__wt_calloc_one(session, &table));
	table->name = tablename;
	tablename = NULL;
	table->name_hash = __wt_hash_city64(name, namelen);

	/*
	 * Fail early if the configuration has no "columns" entry; the value
	 * itself is re-read from the table's own copy below.
	 */
	WT_ERR(__wt_config_getones(session, tconfig, "columns", &cval));

	WT_ERR(__wt_config_getones(session, tconfig, "key_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->key_format));
	WT_ERR(__wt_config_getones(session, tconfig, "value_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len, &table->value_format));
	WT_ERR(__wt_strdup(session, tconfig, &table->config));

	/* Point to some items in the copy to save re-parsing. */
	WT_ERR(__wt_config_getones(session, table->config,
	    "columns", &table->colconf));

	/*
	 * Count the number of columns: tables are "simple" if the columns
	 * are not named.
	 */
	WT_ERR(__wt_config_subinit(session, &cparser, &table->colconf));
	table->is_simple = true;
	while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
		table->is_simple = false;
	/* Running off the end of the list is the only expected way out. */
	if (ret != WT_NOTFOUND)
		goto err;

	/* Check that the columns match the key and value formats. */
	if (!table->is_simple)
		WT_ERR(__wt_schema_colcheck(session,
		    table->key_format, table->value_format, &table->colconf,
		    &table->nkey_columns, NULL));

	WT_ERR(__wt_config_getones(session, table->config,
	    "colgroups", &table->cgconf));

	/* Count the number of column groups. */
	WT_ERR(__wt_config_subinit(session, &cparser, &table->cgconf));
	table->ncolgroups = 0;
	while ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
		++table->ncolgroups;
	if (ret != WT_NOTFOUND)
		goto err;

	/*
	 * Named column groups only make sense with named columns. The local
	 * tablename pointer is NULL by now (ownership moved to the table), so
	 * the message must take the name from the table.
	 */
	if (table->ncolgroups > 0 && table->is_simple)
		WT_ERR_MSG(session, EINVAL,
		    "%s requires a table with named columns", table->name);

	WT_ERR(__wt_calloc_def(session, WT_COLGROUPS(table), &table->cgroups));
	WT_ERR(__wt_schema_open_colgroups(session, table));

	if (!ok_incomplete && !table->cg_complete)
		WT_ERR_MSG(session, EINVAL, "'%s' cannot be used "
		    "until all column groups are created",
		    table->name);

	/* Copy the schema generation into the new table. */
	table->schema_gen = S2C(session)->schema_gen;

	*tablep = table;

	/* The destroy call is reached only by jumping to the err label. */
	if (0) {
err:		WT_TRET(__wt_schema_destroy_table(session, &table));
	}
	WT_TRET(__wt_metadata_cursor_release(session, &cursor));

	__wt_free(session, tablename);
	__wt_scr_free(session, &buf);
	return (ret);
}
示例#27
0
/*
 * __wt_schema_open_colgroups --
 *	Open the column groups for a table.
 *
 *	Walks the table's column-group slots, reading each group's metadata
 *	entry and building a fresh WT_COLGROUP for it. If a group's metadata
 *	entry is missing the function stops and returns success, leaving the
 *	table marked incomplete. Once every group is open (and, for tables
 *	with named columns, the table has been checked and its plan built),
 *	the table is marked complete.
 */
int
__wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table)
{
	WT_COLGROUP *cg;
	WT_CONFIG cgparser;
	WT_CONFIG_ITEM cgkey, cgval;
	WT_DECL_ITEM(uri);
	WT_DECL_RET;
	char *config;
	u_int slot;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE));

	/* Nothing to do once the column groups have all been opened. */
	if (table->cg_complete)
		return (0);

	cg = NULL;
	config = NULL;

	WT_RET(__wt_scr_alloc(session, 0, &uri));

	WT_ERR(__wt_config_subinit(session, &cgparser, &table->cgconf));

	/* Visit every column-group slot in order. */
	for (slot = 0; slot < WT_COLGROUPS(table); ++slot) {
		/*
		 * With no configured column groups there is no key to read
		 * from the configuration: use an empty one.
		 */
		if (table->ncolgroups == 0) {
			WT_CLEAR(cgkey);
		} else {
			WT_ERR(__wt_config_next(&cgparser, &cgkey, &cgval));
		}

		/*
		 * Discard whatever is in the slot and rebuild it: an earlier
		 * open may have failed half way, or the column groups may
		 * have changed since.
		 */
		__wt_schema_destroy_colgroup(session, &table->cgroups[slot]);

		/* Build the column group's name and fetch its metadata. */
		WT_ERR(__wt_buf_init(session, uri, 0));
		WT_ERR(__wt_schema_colgroup_name(session, table,
		    cgkey.str, cgkey.len, uri));
		ret = __wt_metadata_search(session, uri->data, &config);
		if (ret == WT_NOTFOUND) {
			/* Not an error: the table is simply incomplete. */
			ret = 0;
			goto err;
		}
		if (ret != 0)
			goto err;

		/*
		 * Assemble the column group. The configuration string moves
		 * into the structure, so clear the local pointer to keep the
		 * exit path from freeing it.
		 */
		WT_ERR(__wt_calloc_one(session, &cg));
		WT_ERR(__wt_strndup(
		    session, uri->data, uri->size, &cg->name));
		cg->config = config;
		config = NULL;
		WT_ERR(__wt_config_getones(session,
		    cg->config, "columns", &cg->colconf));
		WT_ERR(__wt_config_getones(
		    session, cg->config, "source", &cgval));
		WT_ERR(__wt_strndup(
		    session, cgval.str, cgval.len, &cg->source));

		/* The slot now owns the column group. */
		table->cgroups[slot] = cg;
		cg = NULL;
	}

	/* Tables with named columns get checked and have a plan built. */
	if (!table->is_simple) {
		WT_ERR(__wt_table_check(session, table));

		WT_ERR(__wt_buf_init(session, uri, 0));
		WT_ERR(__wt_struct_plan(session,
		    table, table->colconf.str, table->colconf.len, true, uri));
		WT_ERR(__wt_strndup(
		    session, uri->data, uri->size, &table->plan));
	}

	table->cg_complete = true;

	/* Shared exit: release anything not handed off to the table. */
err:	__wt_scr_free(session, &uri);
	__wt_schema_destroy_colgroup(session, &cg);
	__wt_free(session, config);
	return (ret);
}
示例#28
0
/*
 * __schema_open_index --
 *	Open one or more indices for a table (internal version).
 *
 *	Scans the metadata entries whose keys start with "index:<table>:" and
 *	keeps the table's in-memory index array in step with what it finds.
 *
 *	idxname, len: the index to open; if idxname is NULL every index is
 *	    opened (and nothing is done if a full pass already completed).
 *	indexp: if non-NULL, set to the array slot for a matching index once
 *	    that index is in the array.
 *
 *	Returns WT_NOTFOUND if a specific index was requested and the last
 *	metadata entry examined did not match it.
 */
static int
__schema_open_index(WT_SESSION_IMPL *session,
    WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp)
{
	WT_CURSOR *cursor;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_INDEX *idx;
	u_int i;
	int cmp;
	bool match;
	const char *idxconf, *name, *tablename, *uri;

	/* Check if we've already done the work. */
	if (idxname == NULL && table->idx_complete)
		return (0);

	cursor = NULL;
	idx = NULL;
	match = false;

	/* Build a search key. */
	tablename = table->name;
	(void)WT_PREFIX_SKIP(tablename, "table:");
	WT_ERR(__wt_scr_alloc(session, 512, &tmp));
	WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename));

	/* Find matching indices. */
	WT_ERR(__wt_metadata_cursor(session, &cursor));
	cursor->set_key(cursor, tmp->data);
	/*
	 * If the cursor landed on an entry that sorts before the search key,
	 * step forward once before starting the scan.
	 */
	if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
		ret = cursor->next(cursor);
	/* The loop counter doubles as the slot in the table's index array. */
	for (i = 0; ret == 0; i++, ret = cursor->next(cursor)) {
		WT_ERR(cursor->get_key(cursor, &uri));
		name = uri;
		/* Stop at the first key without the "index:<table>:" prefix. */
		if (!WT_PREFIX_SKIP(name, tmp->data))
			break;

		/* Is this the index we are looking for? */
		match = idxname == NULL || WT_STRING_MATCH(name, idxname, len);

		/*
		 * Ensure there is space, including if we have to make room for
		 * a new entry in the middle of the list.
		 */
		WT_ERR(__wt_realloc_def(session, &table->idx_alloc,
		    WT_MAX(i, table->nindices) + 1, &table->indices));

		/* Keep the in-memory list in sync with the metadata. */
		cmp = 0;
		while (table->indices[i] != NULL &&
		    (cmp = strcmp(uri, table->indices[i]->name)) > 0) {
			/*
			 * Index no longer exists, remove it.
			 *
			 * NOTE(review): only the structure itself is freed
			 * here, while other paths in this function release an
			 * index through __wt_schema_destroy_index. Confirm the
			 * structure's members aren't leaked.
			 */
			__wt_free(session, table->indices[i]);
			memmove(&table->indices[i], &table->indices[i + 1],
			    (table->nindices - i) * sizeof(WT_INDEX *));
			table->indices[--table->nindices] = NULL;
		}
		/*
		 * A negative comparison means the metadata entry sorts before
		 * the index currently in this slot: shift the tail of the
		 * array up by one and leave the slot empty for the new index.
		 */
		if (cmp < 0) {
			/* Make room for a new index. */
			memmove(&table->indices[i + 1], &table->indices[i],
			    (table->nindices - i) * sizeof(WT_INDEX *));
			table->indices[i] = NULL;
			++table->nindices;
		}

		/* Entries we weren't asked for only take part in the sync. */
		if (!match)
			continue;

		/* An empty slot means the index has to be opened now. */
		if (table->indices[i] == NULL) {
			WT_ERR(cursor->get_value(cursor, &idxconf));
			WT_ERR(__wt_calloc_one(session, &idx));
			WT_ERR(__wt_strdup(session, uri, &idx->name));
			WT_ERR(__wt_strdup(session, idxconf, &idx->config));
			WT_ERR(__open_index(session, table, idx));

			/*
			 * If we're checking the creation of an index before a
			 * table is fully created, don't save the index: it
			 * will need to be reopened once the table is complete.
			 */
			if (!table->cg_complete) {
				WT_ERR(
				    __wt_schema_destroy_index(session, &idx));
				if (idxname != NULL)
					break;
				continue;
			}

			/*
			 * The array owns the index from here: clear the local
			 * so the exit path doesn't destroy it.
			 */
			table->indices[i] = idx;
			idx = NULL;

			/*
			 * If the slot is bigger than anything else we've seen,
			 * bump the number of indices.
			 */
			if (i >= table->nindices)
				table->nindices = i + 1;
		}

		/* If we were looking for a single index, we're done. */
		if (indexp != NULL)
			*indexp = table->indices[i];
		if (idxname != NULL)
			break;
	}
	/* Running out of metadata entries is the normal end of the scan. */
	WT_ERR_NOTFOUND_OK(ret);
	if (idxname != NULL && !match)
		ret = WT_NOTFOUND;

	/* If we did a full pass, we won't need to do it again. */
	if (idxname == NULL) {
		table->nindices = i;
		table->idx_complete = true;
	}

	/* Shared exit: release the cursor and any index not yet saved. */
err:	WT_TRET(__wt_metadata_cursor_release(session, &cursor));
	WT_TRET(__wt_schema_destroy_index(session, &idx));

	__wt_scr_free(session, &tmp);
	return (ret);
}
示例#29
0
/*
 * __wt_delete_page --
 *	If deleting a range, try to delete the page without instantiating it.
 *
 *	ref: the page reference to fast-delete.
 *	skipp: set to true only if the reference was switched to the deleted
 *	    state without the page being read; false in every other case,
 *	    including when the optimization is simply declined.
 *
 *	Returns 0 when the page was fast-deleted or the optimization was
 *	declined; otherwise the error from eviction (other than EBUSY), from
 *	dirtying the parent, from allocation or from recording the change in
 *	the transaction.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_DECL_RET;
	WT_PAGE *parent;

	*skipp = false;

	/* If we have a clean page in memory, attempt to evict it. */
	if (ref->state == WT_REF_MEM &&
	    __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
		/* A modified page can't be discarded: unlock it and give up. */
		if (__wt_page_is_modified(ref->page)) {
			WT_PUBLISH(ref->state, WT_REF_MEM);
			return (0);
		}

		(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
		ret = __wt_evict_page(session, ref);
		(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
		WT_RET_BUSY_OK(ret);

		/*
		 * A busy eviction is not a failure of this call. Clear the
		 * saved result so the exits through the err label below that
		 * merely decline the fast delete can't return a stale EBUSY.
		 */
		ret = 0;
	}

	/*
	 * Atomically switch the page's state to lock it.  If the page is not
	 * on-disk, other threads may be using it, no fast delete.
	 *
	 * Possible optimization: if the page is already deleted and the delete
	 * is visible to us (the delete has been committed), we could skip the
	 * page instead of instantiating it and figuring out there are no rows
	 * in the page.  While that's a huge amount of work to no purpose, it's
	 * unclear optimizing for overlapping range deletes is worth the effort.
	 */
	if (ref->state != WT_REF_DISK ||
	    !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * We cannot fast-delete pages that have overflow key/value items as
	 * the overflow blocks have to be discarded.  The way we figure that
	 * out is to check the on-page cell type for the page, cells for leaf
	 * pages that have no overflow items are special.
	 *
	 * In some cases, the reference address may not reference an on-page
	 * cell (for example, some combination of page splits), in which case
	 * we can't check the original cell value and we fail.
	 *
	 * To look at an on-page cell, we need to look at the parent page, and
	 * that's dangerous, our parent page could change without warning if
	 * the parent page were to split, deepening the tree.  It's safe: the
	 * page's reference will always point to some valid page, and if we find
	 * any problems we simply fail the fast-delete optimization.
	 *
	 * !!!
	 * I doubt it's worth the effort, but we could copy the cell's type into
	 * the reference structure, and then we wouldn't need an on-page cell.
	 */
	parent = ref->home;
	if (__wt_off_page(parent, ref->addr) ||
	    __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
		goto err;		/* Not an error: ret is zero here. */

	/*
	 * This action dirties the parent page: mark it dirty now, there's no
	 * future reconciliation of the child leaf page that will dirty it as
	 * we write the tree.
	 */
	WT_ERR(__wt_page_parent_modify_set(session, ref, false));

	/*
	 * Record the change in the transaction structure and set the change's
	 * transaction ID.
	 */
	WT_ERR(__wt_calloc_one(session, &ref->page_del));
	ref->page_del->txnid = session->txn.id;

	WT_ERR(__wt_txn_modify_ref(session, ref));

	/* Report success before publishing the reference's new state. */
	*skipp = true;
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (0);

	/* Reached on errors and when the fast delete is declined. */
err:	__wt_free(session, ref->page_del);

	/*
	 * Restore the page to on-disk status, we'll have to instantiate it.
	 */
	WT_PUBLISH(ref->state, WT_REF_DISK);
	return (ret);
}
示例#30
0
/*
 * __wt_delete_page_instantiate --
 *	Instantiate an entirely deleted row-store leaf page.
 *
 *	Gives every entry on the page an update structure marked deleted, so
 *	the in-memory page looks as if each row had been removed one at a
 *	time. Returns 0 on success or the error from a failed allocation or
 *	modify-structure setup.
 */
int
__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_DELETED *page_del;
	WT_UPDATE **updates, *tombstone;
	size_t footprint;
	uint32_t slot;

	page = ref->page;
	page_del = ref->page_del;

	/*
	 * Give the page a modify structure.
	 *
	 * That's only done, and the page only marked dirty, when the tree is
	 * already dirty and so is going to be written anyway.  (Freeing the
	 * deleted pages would be nice, but it isn't possible if the handle is
	 * read-only or the application never modifies the tree.)
	 */
	if (S2BT(session)->modified) {
		WT_RET(__wt_page_modify_init(session, page));
		__wt_page_modify_set(session, page);
	}

	/*
	 * Some operation is touching a "deleted" page, so we build the page
	 * in memory as though each of its entries had been updated by a
	 * remove.  There are two ways to get here.
	 *
	 * In the first, a running transaction truncated the page without ever
	 * reading it.  The page reference then carries a structure holding
	 * the transaction ID, and because the page we build may split later,
	 * that structure is given references to all of the updates we create
	 * so the transaction can still abort.
	 *
	 * In the second, the truncate committed, an older transaction forced
	 * the old version of the page to be kept, the system crashed and
	 * recovered, and now the page has to be read.
	 *
	 * In the first case we have a page reference structure, in the second
	 * we don't.
	 *
	 * When there is one, allocate its per-reference update array: the
	 * transaction that deleted the page may yet abort, and the array is
	 * how its updates are found.  Splits are the hard part, the updates
	 * may end up on different pages and must all still be located.
	 */
	if (page_del != NULL)
		WT_RET(__wt_calloc_def(
		    session, page->pg_row_entries + 1, &page_del->update_list));

	/* The per-page update array is attached to the page before filling. */
	WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &updates));
	page->pg_row_upd = updates;

	/*
	 * One deleted update per row: each is linked into the per-page array
	 * and, when there's a page reference structure, recorded in the
	 * per-reference array as well.  Track the memory added as we go.
	 */
	footprint = 0;
	for (slot = 0; slot < page->pg_row_entries; ++slot) {
		WT_ERR(__wt_calloc_one(session, &tombstone));
		WT_UPDATE_DELETED_SET(tombstone);

		if (page_del != NULL) {
			tombstone->txnid = page_del->txnid;
			page_del->update_list[slot] = tombstone;
		} else {
			/* No owning transaction: globally visible. */
			tombstone->txnid = WT_TXN_NONE;
		}

		tombstone->next = updates[slot];
		updates[slot] = tombstone;

		footprint +=
		    sizeof(WT_UPDATE *) + WT_UPDATE_MEMSIZE(tombstone);
	}

	__wt_cache_page_inmem_incr(session, page, footprint);

	return (0);

err:	/*
	 * The page's own update structures are left alone on error: the
	 * caller discards the page, which takes care of them.  The
	 * per-reference array could be left too, since nothing but an
	 * in-memory page ever uses it, but it's released here so a later
	 * attempt to instantiate the same page starts clean.
	 */
	if (page_del != NULL)
		__wt_free(session, page_del->update_list);
	return (ret);
}