Exemple #1
0
/*
 * __bloom_init --
 *	Allocate a WT_BLOOM handle and fill in its URI, table configuration
 *	and owning session.
 *
 *	The stored configuration is the caller's configuration (which may be
 *	NULL) followed by WT_BLOOM_TABLE_CONFIG.  On success the new handle is
 *	returned in *bloomp.  On failure everything allocated here is released,
 *	the error is returned and *bloomp is not written.
 */
static int
__bloom_init(WT_SESSION_IMPL *session,
    const char *uri, const char *config, WT_BLOOM **bloomp)
{
	WT_BLOOM *bloom;
	WT_DECL_RET;
	size_t len;

	bloom = NULL;
	WT_ERR(__wt_calloc(session, 1, sizeof(WT_BLOOM), &bloom));
	WT_ERR(__wt_strdup(session, uri, &bloom->uri));

	/*
	 * Build the configuration string in a single allocation: room for the
	 * user's configuration, the separating comma, the standard
	 * configuration and the trailing nul.  Duplicating the caller's string
	 * into the same field first would only leak that copy when the field
	 * is overwritten here.
	 */
	len = strlen(WT_BLOOM_TABLE_CONFIG) + 2;
	if (config != NULL)
		len += strlen(config);
	WT_ERR(__wt_calloc(session, len, sizeof(char), &bloom->config));
	/* Add the standard config at the end, so it overrides user settings. */
	(void)snprintf(bloom->config, len,
	    "%s,%s", config == NULL ? "" : config, WT_BLOOM_TABLE_CONFIG);

	bloom->session = session;

	*bloomp = bloom;
	return (0);

err:	/*
	 * The handle itself may not have been allocated: check it before
	 * touching any of its fields.
	 */
	if (bloom != NULL) {
		__wt_free(session, bloom->uri);
		__wt_free(session, bloom->config);
		__wt_free(session, bloom->bitstring);
		__wt_free(session, bloom);
	}
	return (ret);
}
Exemple #2
0
/*
 * __wt_update_alloc --
 *	Allocate a WT_UPDATE structure with room for the value and copy the
 *	value into it.  A NULL value produces an update marked as deleted.
 *	The total allocation size is returned in *sizep when sizep is not
 *	NULL.
 */
int
__wt_update_alloc(WT_SESSION_IMPL *session,
    WT_ITEM *value, WT_UPDATE **updp, size_t *sizep)
{
	WT_UPDATE *new_upd;
	size_t total, value_size;

	/* A missing value carries no data. */
	value_size = 0;
	if (value != NULL)
		value_size = value->size;
	total = sizeof(WT_UPDATE) + value_size;

	/* A single allocation holds both the structure and the value. */
	WT_RET(__wt_calloc(session, 1, total, &new_upd));
	if (value != NULL) {
		new_upd->size = WT_STORE_SIZE(value_size);
		memcpy(WT_UPDATE_DATA(new_upd), value->data, value_size);
	} else {
		WT_UPDATE_DELETED_SET(new_upd);
	}

	*updp = new_upd;
	if (sizep != NULL)
		*sizep = total;
	return (0);
}
Exemple #3
0
/*
 * __wt_nfilename --
 *	Build a file name in allocated memory.  An absolute name (or any name
 *	when there is no session) is duplicated as is, otherwise the result is
 *	the connection home directory, a path separator and the name.
 */
int
__wt_nfilename(
    WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path)
{
	size_t pathlen;
	const char *home;
	char *fullpath;

	*path = NULL;

	/*
	 * A NULL session handle must be supported: this is reached through
	 * the exists API, which the test utilities use.  Without a session
	 * there is no home directory to prepend.
	 */
	if (session == NULL || __wt_absolute_path(name))
		return (__wt_strndup(session, name, namelen, path));

	/* Home, one separator byte, the name and the trailing nul. */
	home = S2C(session)->home;
	pathlen = strlen(home) + 1 + namelen + 1;
	WT_RET(__wt_calloc(session, 1, pathlen, &fullpath));
	snprintf(fullpath, pathlen, "%s%s%.*s",
	    home, __wt_path_separator(), (int)namelen, name);

	*path = fullpath;
	return (0);
}
Exemple #4
0
/*
 * __wt_cond_alloc --
 *	Allocate and initialize a condition variable.
 *
 *	On success the new condition variable is returned in *condp.  On any
 *	pthread initialization failure the structure is released and WT_ERROR
 *	is returned; an allocation failure returns the allocator's error.
 */
int
__wt_cond_alloc(WT_SESSION_IMPL *session,
    const char *name, int is_signalled, WT_CONDVAR **condp)
{
	WT_CONDVAR *cond;

	/*
	 * !!!
	 * This function MUST handle a NULL session handle.
	 */
	WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond));

	/* Initialize the mutex. */
	if (pthread_mutex_init(&cond->mtx, NULL) != 0)
		goto err;

	/*
	 * Initialize the condition variable to permit self-blocking.  The
	 * mutex is already live at this point, destroy it on failure so its
	 * resources aren't lost when the structure is freed.
	 */
	if (pthread_cond_init(&cond->cond, NULL) != 0) {
		(void)pthread_mutex_destroy(&cond->mtx);
		goto err;
	}

	cond->name = name;
	cond->signalled = is_signalled;

	*condp = cond;
	return (0);

err:	__wt_free(session, cond);
	return (WT_ERROR);
}
Exemple #5
0
/*
 * __ckpt_extlist_read --
 *	Allocate a checkpoint's block-manager information, fill it in from
 *	the checkpoint's raw cookie and read the checkpoint's alloc and
 *	discard extent lists.
 */
static int
__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
{
	WT_BLOCK_CKPT *ckpt_info;

	/*
	 * The checkpoint structure is allocated directly into the WT_CKPT, so
	 * it remains attached to the checkpoint even if a later step fails.
	 */
	WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
	ckpt_info = ckpt->bpriv;

	/* Initialize the structure, then crack the cookie into it. */
	WT_RET(__wt_block_ckpt_init(session, ckpt_info, ckpt->name));
	WT_RET(__wt_block_buffer_to_ckpt(
	    session, block, ckpt->raw.data, ckpt_info));

	/*
	 * Read the alloc and discard lists only.
	 *
	 * The avail list is ignored: checkpoint avail lists are only useful
	 * when rolling forward from that particular checkpoint, where they
	 * represent our best understanding of what blocks can be allocated.
	 * If we are not operating on the live checkpoint, subsequent
	 * checkpoints might have allocated those blocks and the list is
	 * useless.  It isn't discarded, because it is useful as part of
	 * verification, but it isn't re-written either.
	 */
	WT_RET(__wt_block_extlist_read(
	    session, block, &ckpt_info->alloc, ckpt_info->file_size));
	return (__wt_block_extlist_read(
	    session, block, &ckpt_info->discard, ckpt_info->file_size));
}
Exemple #6
0
/*
 * __wt_ovfl_txnc_add --
 *	Create a transaction-cached overflow record holding a copy of the
 *	address and value, and insert it into the page's skiplist of such
 *	records.
 */
int
__wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page,
    const uint8_t *addr, size_t addr_size,
    const void *value, size_t value_size)
{
	WT_OVFL_TXNC **head, **stack[WT_SKIP_MAXDEPTH], *txnc;
	size_t hdr_size;
	u_int depth, level;
	uint8_t *addr_dst, *value_dst;

	/* Set up overflow tracking for the page on first use. */
	if (page->modify->ovfl_track == NULL)
		WT_RET(__ovfl_track_init(session, page));
	head = page->modify->ovfl_track->ovfl_txnc;

	/* Choose a skiplist depth for this insert. */
	depth = __wt_skip_choose_depth(session);

	/*
	 * A single allocation holds, in order: the WT_OVFL_TXNC structure,
	 * the skiplist next pointers, the address bytes and the value bytes.
	 *
	 * To minimize the WT_OVFL_TXNC structure size, the address offset
	 * and size are single bytes: that's safe because the address follows
	 * the structure (which can't be more than about 100B), and address
	 * cookies are limited to 255B.
	 */
	hdr_size = sizeof(WT_OVFL_TXNC) + depth * sizeof(WT_OVFL_TXNC *);
	WT_RET(__wt_calloc(
	    session, 1, hdr_size + addr_size + value_size, &txnc));

	/* The address follows the structure and its next pointers. */
	addr_dst = (uint8_t *)txnc + hdr_size;
	txnc->addr_offset = (uint8_t)WT_PTRDIFF(addr_dst, txnc);
	txnc->addr_size = (uint8_t)addr_size;
	memcpy(addr_dst, addr, addr_size);

	/* The value follows the address. */
	value_dst = addr_dst + addr_size;
	txnc->value_offset = WT_PTRDIFF32(value_dst, txnc);
	txnc->value_size = WT_STORE_SIZE(value_size);
	memcpy(value_dst, value, value_size);

	txnc->current = __wt_txn_new_id(session);

	__wt_cache_page_inmem_incr(
	    session, page, WT_OVFL_SIZE(txnc, WT_OVFL_TXNC));

	/* Link the new entry into every level of the skiplist it occupies. */
	__ovfl_txnc_skip_search_stack(head, stack, addr, addr_size);
	for (level = 0; level < depth; ++level) {
		txnc->next[level] = *stack[level];
		*stack[level] = txnc;
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
		return (__ovfl_txnc_verbose(session, page, txnc, "add"));
	return (0);
}
Exemple #7
0
/*
 * __rec_page_dirty_update --
 *	Update a dirty page's reference on eviction, based on the page's
 *	reconciliation result: a 1-for-1 replacement address or a split into
 *	new internal page(s).  Any other result is treated as an illegal
 *	value.
 */
static int
__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_ADDR *addr;
	WT_PAGE_MODIFY *mod;
	WT_REF *parent_ref;

	mod = page->modify;
	parent_ref = page->ref;

	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
	case WT_PM_REC_REPLACE: 			/* 1-for-1 page swap */
		/*
		 * Allocate the replacement address before discarding the old
		 * one: if the allocation fails we return with the parent's
		 * reference still intact, rather than with its address
		 * already freed.
		 */
		WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));

		/* Discard the previous address if it is off the parent page. */
		if (parent_ref->addr != NULL &&
		    __wt_off_page(page->parent, parent_ref->addr)) {
			__wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
			__wt_free(session, parent_ref->addr);
		}

		/*
		 * Update the parent to reference the replacement page.  The
		 * replacement address is moved out of the modify structure so
		 * it has a single owner.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		*addr = mod->u.replace;
		mod->u.replace.addr = NULL;
		mod->u.replace.size = 0;

		parent_ref->page = NULL;
		parent_ref->addr = addr;
		WT_PUBLISH(parent_ref->state, WT_REF_DISK);
		break;
	case WT_PM_REC_SPLIT:				/* Page split */
		/*
		 * Update the parent to reference new internal page(s).
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		parent_ref->page = mod->u.split;
		WT_PUBLISH(parent_ref->state, WT_REF_MEM);

		/* Clear the reference else discarding the page will free it. */
		mod->u.split = NULL;
		F_CLR(mod, WT_PM_REC_SPLIT);
		break;
	case WT_PM_REC_EMPTY:				/* Page is empty */
		/* We checked if the page was empty when we reviewed it. */
		/* FALLTHROUGH */
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
Exemple #8
0
/*
 * __curjoin_open_main --
 *	Open the join entry's main cursor: the table's URI followed by the
 *	entry index's column configuration, opened in raw mode.
 */
static int
__curjoin_open_main(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_CURSOR_JOIN_ENTRY *entry)
{
	WT_CURSOR *main_cursor;
	WT_DECL_RET;
	WT_INDEX *idx;
	size_t format_size, uri_size;
	char *main_uri, *newformat;
	const char *table_name;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };

	main_uri = newformat = NULL;
	idx = entry->index;
	table_name = cjoin->table->iface.name;

	/* The table name, the column configuration and a trailing nul. */
	uri_size = strlen(table_name) + idx->colconf.len + 1;
	WT_ERR(__wt_calloc(session, 1, uri_size, &main_uri));
	WT_ERR(__wt_snprintf(main_uri, uri_size, "%s%.*s",
	    table_name, (int)idx->colconf.len, idx->colconf.str));
	WT_ERR(__wt_open_cursor(session, main_uri,
	    (WT_CURSOR *)cjoin, raw_cfg, &entry->main));

	if (idx->extractor == NULL) {
		/*
		 * Append no-op padding ("0x") to the value format so a
		 * trailing 'u' format is not transformed to 'U', matching
		 * what happens in the index.  This is skipped when there is
		 * an extractor, extractors already use the padding byte
		 * trick.
		 */
		main_cursor = entry->main;
		format_size = strlen(main_cursor->value_format) + 3;
		WT_ERR(__wt_calloc(session, format_size, 1, &newformat));
		WT_ERR(__wt_snprintf(newformat,
		    format_size, "%s0x", main_cursor->value_format));

		/* Swap in the new format, the cursor now owns it. */
		__wt_free(session, main_cursor->value_format);
		main_cursor->value_format = newformat;
		newformat = NULL;
	}

err:	__wt_free(session, main_uri);
	__wt_free(session, newformat);
	return (ret);
}
Exemple #9
0
/*
 * __wt_connection_open --
 *	Open a connection: allocate the session array, replace the default
 *	session with an internal session, then create the cache and
 *	initialize transaction support.
 */
int
__wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
{
	WT_SESSION_IMPL *session;

	/* Start out using the connection's default session. */
	session = conn->default_session;
	WT_ASSERT(session, session->iface.connection == &conn->iface);

	/*
	 * Internal server threads are told to run here: the flags must be
	 * set before any sessions are opened.
	 */
	F_SET(conn, WT_CONN_SERVER_RUN | WT_CONN_LOG_SERVER_RUN);

	/* Allocate the WT_SESSION_IMPL array. */
	WT_RET(__wt_calloc(session,
	    conn->session_size, sizeof(WT_SESSION_IMPL), &conn->sessions));
	WT_CACHE_LINE_ALIGNMENT_VERIFY(session, conn->sessions);

	/*
	 * Open the default session before starting service threads, those
	 * threads may allocate and use session resources that need to get
	 * cleaned up on close.
	 *
	 * The session is opened into the local variable rather than directly
	 * into the connection: the connection's default session is originally
	 * a static structure and the session allocation code uses it, so
	 * passing a reference to it as the place to store the new session
	 * confuses things and can corrupt error handling.  Assign only on
	 * success.
	 */
	WT_RET(__wt_open_internal_session(
	    conn, "connection", false, 0, &session));
	conn->default_session = session;

	/*
	 * Publish: there must be a barrier to ensure the connection structure
	 * fields are set before other threads read from the pointer.
	 */
	WT_WRITE_BARRIER();

	/* Create the cache, then initialize transaction support. */
	WT_RET(__wt_cache_create(session, cfg));
	return (__wt_txn_global_init(session, cfg));
}
Exemple #10
0
/*
 * __wt_block_ext_prealloc --
 *	Pre-allocate WT_EXT and WT_SIZE structures for a session, setting up
 *	the session's block-manager information on first use.
 */
int
__wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max)
{
	/*
	 * The per-session block-manager structure is created lazily, along
	 * with the function that cleans it up.
	 */
	if (session->block_manager == NULL) {
		WT_RET(__wt_calloc(session, 1,
		    sizeof(WT_BLOCK_MGR_SESSION), &session->block_manager));
		session->block_manager_cleanup =
		    __block_manager_session_cleanup;
	}

	/* Extents first, then sizes. */
	WT_RET(__block_ext_prealloc(session, max));
	return (__block_size_prealloc(session, max));
}
Exemple #11
0
/*
 * __curjoin_entry_iter_init --
 *	Initialize an iteration for the index managed by a join entry.
 *
 *	A new cursor is opened on the same URI as the entry's first endpoint
 *	cursor (with the join cursor's projection appended, if there is one)
 *	and positioned where that cursor is positioned.  On success the new
 *	iterator, which holds the new cursor, is returned in *iterp.  On
 *	failure nothing allocated or opened here is left behind.
 */
static int
__curjoin_entry_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_CURSOR_JOIN_ENTRY *entry, WT_CURSOR_JOIN_ITER **iterp)
{
	WT_CURSOR *newcur;
	WT_CURSOR *to_dup;
	WT_DECL_RET;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };
	const char *def_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), NULL };
	const char *uri, **config;
	char *uribuf;
	WT_CURSOR_JOIN_ITER *iter;
	size_t size;

	iter = NULL;
	newcur = NULL;
	uribuf = NULL;
	to_dup = entry->ends[0].cursor;

	/* The new cursor is raw exactly when the join cursor is raw. */
	uri = to_dup->uri;
	if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
		config = &raw_cfg[0];
	else
		config = &def_cfg[0];

	if (cjoin->projection != NULL) {
		size = strlen(uri) + strlen(cjoin->projection) + 1;
		WT_ERR(__wt_calloc(session, size, 1, &uribuf));
		(void)snprintf(uribuf, size, "%s%s", uri, cjoin->projection);
		uri = uribuf;
	}

	/*
	 * Track the new cursor only once it has been opened successfully, so
	 * the error path knows whether there is anything to close.
	 */
	if ((ret = __wt_open_cursor(session,
	    uri, (WT_CURSOR *)cjoin, config, &newcur)) != 0) {
		newcur = NULL;
		goto err;
	}
	WT_ERR(__wt_cursor_dup_position(to_dup, newcur));
	WT_ERR(__wt_calloc_one(session, &iter));
	iter->cjoin = cjoin;
	iter->session = session;
	iter->entry = entry;
	iter->cursor = newcur;
	iter->advance = false;
	*iterp = iter;

	if (0) {
err:		/*
		 * The iterator never took ownership of the cursor: close it
		 * here or it is leaked.  The original error is the one
		 * reported.
		 */
		if (newcur != NULL)
			(void)newcur->close(newcur);
		__wt_free(session, iter);
	}
	__wt_free(session, uribuf);
	return (ret);
}
Exemple #12
0
/*
 * __block_ext_alloc --
 *	Allocate a new WT_EXT structure with a randomly chosen skiplist
 *	depth, including room for two pointers per skiplist level.
 */
static int
__block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp)
{
	WT_EXT *new_ext;
	size_t alloc_size, depth;

	depth = __wt_skip_choose_depth(session);

	/* The structure is followed by its skiplist pointers. */
	alloc_size = sizeof(WT_EXT) + depth * 2 * sizeof(WT_EXT *);
	WT_RET(__wt_calloc(session, 1, alloc_size, &new_ext));
	new_ext->depth = (uint8_t)depth;

	*extp = new_ext;
	return (0);
}
Exemple #13
0
/*
 * __wt_curmetadata_open --
 *	WT_SESSION->open_cursor method for metadata cursors.
 *
 * Metadata cursors are similar to a file cursor on the special metadata
 * table, except that the metadata for the metadata table (which is stored
 * in the turtle file) can also be queried.
 *
 * Metadata cursors are read-only by default.
 *
 * On failure the underlying file cursor (if it was opened) is closed and the
 * cursor structure is freed.
 */
int
__wt_curmetadata_open(WT_SESSION_IMPL *session,
    const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    NULL,			/* get-key */
	    NULL,			/* get-value */
	    NULL,			/* set-key */
	    NULL,			/* set-value */
	    __curmetadata_compare,	/* compare */
	    __curmetadata_next,		/* next */
	    __curmetadata_prev,		/* prev */
	    __curmetadata_reset,	/* reset */
	    __curmetadata_search,	/* search */
	    __curmetadata_search_near,	/* search-near */
	    __curmetadata_insert,	/* insert */
	    __curmetadata_update,	/* update */
	    __curmetadata_remove,	/* remove */
	    __curmetadata_close);	/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_METADATA *mdc;
	WT_DECL_RET;

	WT_RET(__wt_calloc(session, 1, sizeof(WT_CURSOR_METADATA), &mdc));

	/* Both the key and the value are strings. */
	cursor = &mdc->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->key_format = "S";
	cursor->value_format = "S";

	/*
	 * Open the file cursor for operations on the regular metadata.  The
	 * field is only left non-NULL after a successful open, so the error
	 * path knows whether there is a cursor to close.
	 */
	if ((ret = __wt_metadata_cursor(
	    session, cfg[1], &mdc->file_cursor)) != 0) {
		mdc->file_cursor = NULL;
		goto err;
	}

	WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));

	/* Metadata cursors default to read only. */
	WT_ERR(__wt_cursor_config_readonly(cursor, cfg, 1));

	if (0) {
err:		/*
		 * Close the underlying file cursor before freeing the
		 * structure that references it, otherwise it is leaked.  The
		 * original error is the one reported.
		 *
		 * NOTE(review): if __wt_cursor_init succeeded before a later
		 * failure, *cursorp may already reference the structure freed
		 * here -- confirm callers ignore *cursorp on error.
		 */
		if (mdc->file_cursor != NULL)
			(void)mdc->file_cursor->close(mdc->file_cursor);
		__wt_free(session, mdc);
	}
	return (ret);
}
Exemple #14
0
/*
 * __col_insert_alloc --
 *	Column-store insert: allocate a WT_INSERT structure with room for its
 *	skiplist pointers, set its record number and return both the
 *	structure and its allocated size.
 */
static int
__col_insert_alloc(WT_SESSION_IMPL *session,
    uint64_t recno, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
{
	WT_INSERT *new_ins;
	size_t alloc_size;

	/* The structure plus one next pointer per skiplist level. */
	alloc_size = sizeof(WT_INSERT) + skipdepth * sizeof(WT_INSERT *);
	WT_RET(__wt_calloc(session, 1, alloc_size, &new_ins));

	/* Column-store inserts are identified by record number. */
	WT_INSERT_RECNO(new_ins) = recno;

	*ins_sizep = alloc_size;
	*insp = new_ins;
	return (0);
}
Exemple #15
0
/*
 * __wt_strndup --
 *	Duplicate the first len bytes of a byte string into newly allocated,
 *	nul-terminated memory.  A NULL source yields a NULL result.
 */
int
__wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp)
{
	void **resultp;
	void *copy;

	resultp = retp;
	copy = NULL;

	if (str != NULL) {
		/* Zeroed memory with one extra byte supplies the nul. */
		WT_RET(__wt_calloc(session, len + 1, 1, &copy));

		/*
		 * This must remain a byte copy and not become strncpy:
		 * callers rely on this function to duplicate "strings" that
		 * contain nul bytes.
		 */
		memcpy(copy, str, len);
	}

	*resultp = copy;
	return (0);
}
Exemple #16
0
/*
 * __wt_rwlock_alloc --
 *	Allocate and initialize a read/write lock.
 *
 *	On success the new lock is returned in *rwlockp.  On failure the lock
 *	is destroyed (if it was initialized), its memory is released, and
 *	*rwlockp is not written.
 */
int
__wt_rwlock_alloc(
    WT_SESSION_IMPL *session, const char *name, WT_RWLOCK **rwlockp)
{
	WT_DECL_RET;
	WT_RWLOCK *rwlock;

	WT_RET(__wt_calloc(session, 1, sizeof(WT_RWLOCK), &rwlock));

	/* If initialization fails there is no lock to destroy. */
	if (pthread_rwlock_init(&rwlock->rwlock, NULL) != 0) {
		__wt_free(session, rwlock);
		return (WT_ERROR);
	}

	rwlock->name = name;

	WT_VERBOSE_ERR(session, mutex,
	    "rwlock: alloc %s (%p)", rwlock->name, rwlock);

	/*
	 * Hand the lock to the caller only once nothing else can fail, so the
	 * caller is never left holding a pointer to freed memory.
	 */
	*rwlockp = rwlock;

	if (0) {
err:		(void)pthread_rwlock_destroy(&rwlock->rwlock);
		__wt_free(session, rwlock);
	}
	return (ret);
}
Exemple #17
0
/*
 * __wt_getenv --
 * 	Get a non-NULL, greater than zero-length environment variable.
 *
 * 	On success *envp references an allocated copy of the value.  If the
 * 	variable is unset or empty WT_NOTFOUND is returned.  On any failure
 * 	*envp is NULL and nothing allocated here is left behind.
 */
int
__wt_getenv(WT_SESSION_IMPL *session, const char *variable, const char **envp)
{
    WT_DECL_RET;
    DWORD copied, size;
    char *buf;

    *envp = NULL;

    /* Ask for the required buffer size; a size of 1 is an empty value. */
    size = GetEnvironmentVariableA(variable, NULL, 0);
    if (size <= 1)
        return (WT_NOTFOUND);

    /*
     * Read into a local, writable buffer and only hand it to the caller
     * once the read has succeeded.
     */
    WT_RET(__wt_calloc(session, 1, size, &buf));

    copied = GetEnvironmentVariableA(variable, buf, size);
    /* We expect the number of bytes not including nul terminator. */
    if (copied + 1 != size) {
        /* Save the error before freeing, then release the buffer. */
        ret = __wt_getlasterror();
        __wt_free(session, buf);
        WT_RET_MSG(session, ret,
                   "GetEnvironmentVariableA failed: %s", variable);
    }

    *envp = buf;
    return (0);
}
Exemple #18
0
/*
 * __wt_cond_alloc --
 *	Allocate and initialize a condition variable.
 *
 *	When HAVE_MUTEX_ADAPTIVE is defined the mutex is created with the
 *	PTHREAD_MUTEX_ADAPTIVE_NP type, otherwise with default attributes.
 *	On success the new condition variable is returned in *condp.  On
 *	failure everything created here is released and the failing call's
 *	error is returned.
 */
int
__wt_cond_alloc(WT_SESSION_IMPL *session,
    const char *name, int is_signalled, WT_CONDVAR **condp)
{
	WT_CONDVAR *cond;
	WT_DECL_RET;
	pthread_mutexattr_t *attrp;
#ifdef HAVE_MUTEX_ADAPTIVE
	pthread_mutexattr_t attr;
#endif

	cond = NULL;
	attrp = NULL;

#ifdef HAVE_MUTEX_ADAPTIVE
	/*
	 * Set up the mutex attributes.  Once the attribute object has been
	 * initialized, every exit path goes through the code at the end of
	 * the function that destroys it.
	 */
	WT_RET(pthread_mutexattr_init(&attr));
	attrp = &attr;
	WT_ERR(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP));
#endif

	/*
	 * !!!
	 * This function MUST handle a NULL session handle.
	 */
	WT_ERR(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond));

	/* Initialize the mutex. */
	WT_ERR(pthread_mutex_init(&cond->mtx, attrp));

	/*
	 * Initialize the condition variable to permit self-blocking.  The
	 * mutex is already live at this point, destroy it on failure so its
	 * resources aren't lost when the structure is freed.
	 */
	if ((ret = pthread_cond_init(&cond->cond, NULL)) != 0) {
		(void)pthread_mutex_destroy(&cond->mtx);
		goto err;
	}

	cond->name = name;
	cond->waiters = is_signalled ? -1 : 0;

	*condp = cond;

	if (0) {
err:		__wt_free(session, cond);
	}

	/* The attribute object is only needed while creating the mutex. */
	if (attrp != NULL)
		(void)pthread_mutexattr_destroy(attrp);
	return (ret);
}
Exemple #19
0
/*
 * __wt_nfilename --
 *	Build a file name in allocated memory.  An absolute name is
 *	duplicated as is, otherwise the result is the connection home
 *	directory, a '/' and the name.
 */
int
__wt_nfilename(WT_SESSION_IMPL *session,
    const char *name, size_t namelen, const char **path)
{
	WT_CONNECTION_IMPL *conn;
	size_t pathlen;
	char *fullpath;

	conn = S2C(session);
	*path = NULL;

	/* Absolute paths are used unchanged. */
	if (__wt_absolute_path(name))
		return (__wt_strndup(session, name, namelen, path));

	/* Home, the separating '/', the name and the trailing nul. */
	pathlen = strlen(conn->home) + 1 + namelen + 1;
	WT_RET(__wt_calloc(session, 1, pathlen, &fullpath));
	snprintf(fullpath,
	    pathlen, "%s/%.*s", conn->home, (int)namelen, name);

	*path = fullpath;
	return (0);
}
Exemple #20
0
/*
 * __wt_row_insert_alloc --
 *	Row-store insert: allocate a WT_INSERT structure with room for its
 *	skiplist pointers and a copy of the key.  The allocated size is
 *	returned in *ins_sizep when ins_sizep is not NULL.
 */
int
__wt_row_insert_alloc(WT_SESSION_IMPL *session,
    WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
{
	WT_INSERT *new_ins;
	size_t alloc_size, key_offset;

	/*
	 * The key is stored at the end of the allocation, after the
	 * structure and one next pointer per skiplist level.
	 */
	key_offset = sizeof(WT_INSERT) + skipdepth * sizeof(WT_INSERT *);
	alloc_size = key_offset + key->size;
	WT_RET(__wt_calloc(session, 1, alloc_size, &new_ins));

	/* Record where the key lives and how big it is, then copy it. */
	new_ins->u.key.offset = WT_STORE_SIZE(key_offset);
	WT_INSERT_KEY_SIZE(new_ins) = key->size;
	memcpy(WT_INSERT_KEY(new_ins), key->data, key->size);

	*insp = new_ins;
	if (ins_sizep != NULL)
		*ins_sizep = alloc_size;
	return (0);
}
Exemple #21
0
/*
 * __wt_update_alloc --
 *	Allocate a WT_UPDATE structure with room for the value, copy the
 *	value into it and register the update with the current transaction.
 *	A NULL value produces an update marked as deleted.  The total
 *	allocation size is returned in *sizep when sizep is not NULL.
 */
int
__wt_update_alloc(WT_SESSION_IMPL *session,
    WT_ITEM *value, WT_UPDATE **updp, size_t *sizep)
{
	WT_DECL_RET;
	WT_UPDATE *new_upd;
	size_t total, value_size;

	/* A missing value carries no data. */
	value_size = 0;
	if (value != NULL)
		value_size = value->size;
	total = sizeof(WT_UPDATE) + value_size;

	/* A single allocation holds both the structure and the value. */
	WT_RET(__wt_calloc(session, 1, total, &new_upd));
	if (value != NULL) {
		new_upd->size = WT_STORE_SIZE(value_size);
		memcpy(WT_UPDATE_DATA(new_upd), value->data, value_size);
	} else {
		WT_UPDATE_DELETED_SET(new_upd);
	}

	/*
	 * The transaction step is deliberately the last thing that can fail:
	 * once __wt_txn_modify succeeds, a non-NULL update must be returned
	 * so callers can call __wt_txn_unmodify on any subsequent failure.
	 */
	ret = __wt_txn_modify(session, &new_upd->txnid);
	if (ret != 0) {
		__wt_free(session, new_upd);
		return (ret);
	}

	*updp = new_upd;
	if (sizep != NULL)
		*sizep = total;
	return (0);
}
Exemple #22
0
/*
 * __curjoin_iter_set_entry --
 *	Point an iterator at the join entry in the given position, resetting
 *	the iterator's state and (re)opening or closing its cursor as the
 *	entry requires.
 */
static int
__curjoin_iter_set_entry(WT_CURSOR_JOIN_ITER *iter, u_int entry_pos)
{
	WT_CURSOR *c, *to_dup;
	WT_CURSOR_JOIN *cjoin, *topjoin;
	WT_CURSOR_JOIN_ENTRY *entry;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	size_t size;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    iter->session, WT_SESSION_open_cursor), "raw", NULL };
	const char *def_cfg[] = { WT_CONFIG_BASE(
	    iter->session, WT_SESSION_open_cursor), NULL };
	const char **config;
	char *uri;

	uri = NULL;
	session = iter->session;
	cjoin = iter->cjoin;
	entry = &cjoin->entries[entry_pos];

	/* Reset the iterator's position within the new entry. */
	iter->entry = entry;
	iter->entry_pos = entry_pos;
	iter->end_pos = 0;
	iter->positioned = false;

	/* An entry with exactly one endpoint, which is an "eq" endpoint. */
	iter->is_equal = (entry->ends_next == 1 &&
	    WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_EQ);

	/* Skip the first endpoint when it is a "ge" endpoint. */
	if (entry->ends_next > 0 &&
	    WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_GE)
		iter->end_skip = 1;
	else
		iter->end_skip = 0;

	/*
	 * By default a single entry and at most one endpoint are iterated.
	 * A disjunction covers every entry of the join, and every endpoint
	 * of an "eq" entry.
	 */
	iter->end_count = WT_MIN(1, entry->ends_next);
	iter->entry_count = 1;
	if (F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION)) {
		iter->entry_count = cjoin->entries_next;
		if (iter->is_equal)
			iter->end_count = entry->ends_next;
	}
	WT_ASSERT(iter->session, iter->entry_pos < iter->entry_count);

	entry->stats.iterated = 0;

	if (entry->subjoin == NULL) {
		/* Walk up to the outermost join cursor. */
		topjoin = iter->cjoin;
		while (topjoin->parent != NULL)
			topjoin = topjoin->parent;
		to_dup = entry->ends[0].cursor;

		/* Raw mode follows the outermost join cursor. */
		config = F_ISSET((WT_CURSOR *)topjoin, WT_CURSTD_RAW) ?
		    &raw_cfg[0] : &def_cfg[0];

		/* The endpoint's internal URI with "()" appended. */
		size = strlen(to_dup->internal_uri) + 3;
		WT_ERR(__wt_calloc(session, size, 1, &uri));
		WT_ERR(__wt_snprintf(uri, size, "%s()", to_dup->internal_uri));

		/*
		 * Keep the iterator's existing cursor if it is already open
		 * on that URI, otherwise close it and open a new one.
		 */
		c = iter->cursor;
		if (c == NULL || strcmp(c->uri, uri) != 0) {
			iter->cursor = NULL;
			if (c != NULL)
				WT_ERR(c->close(c));
			WT_ERR(__wt_open_cursor(session, uri,
			    (WT_CURSOR *)topjoin, config, &iter->cursor));
		}
		WT_ERR(__wt_cursor_dup_position(to_dup, iter->cursor));
	} else if (iter->cursor != NULL) {
		/* Entries with a subjoin don't use the iterator's cursor. */
		WT_ERR(iter->cursor->close(iter->cursor));
		iter->cursor = NULL;
	}

err:	__wt_free(session, uri);
	return (ret);
}
Exemple #23
0
/*
 * __wt_page_alloc --
 *	Allocate and initialize an in-memory WT_PAGE structure of the given
 *	type, along with the type-specific arrays that describe its entries,
 *	and add the allocated memory to the cache accounting.
 *
 *	On success the page is returned in *pagep.  On failure *pagep is NULL
 *	and anything allocated here has been freed.
 */
int
__wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type,
    uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep)
{
	WT_CACHE *cache;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_INDEX *pindex;
	size_t size;
	uint32_t i;
	void *p;

	*pagep = NULL;

	cache = S2C(session)->cache;
	page = NULL;

	/*
	 * Compute the size of the initial allocation.  The size variable is
	 * also the running total of bytes allocated for the page, reported to
	 * the cache at the end of the function.
	 */
	size = sizeof(WT_PAGE);
	switch (type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		break;
	case WT_PAGE_COL_VAR:
		/*
		 * Variable-length column-store leaf page: allocate memory to
		 * describe the page's contents with the initial allocation.
		 */
		size += alloc_entries * sizeof(WT_COL);
		break;
	case WT_PAGE_ROW_LEAF:
		/*
		 * Row-store leaf page: allocate memory to describe the page's
		 * contents with the initial allocation.
		 */
		size += alloc_entries * sizeof(WT_ROW);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	WT_RET(__wt_calloc(session, 1, size, &page));

	page->type = type;
	page->read_gen = WT_READGEN_NOTSET;

	/* Fill in the type-specific fields. */
	switch (type) {
	case WT_PAGE_COL_FIX:
		page->pg_fix_recno = recno;
		page->pg_fix_entries = alloc_entries;
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		page->pg_intl_recno = recno;

		/*
		 * Internal pages have an array of references to objects so they
		 * can split.  Allocate the array of references and optionally,
		 * the objects to which they point.
		 */
		WT_ERR(__wt_calloc(session, 1,
		    sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *),
		    &p));
		size +=
		    sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *);
		pindex = p;
		/* The array of WT_REF pointers follows the WT_PAGE_INDEX. */
		pindex->index = (WT_REF **)((WT_PAGE_INDEX *)p + 1);
		pindex->entries = alloc_entries;
		WT_INTL_INDEX_SET(page, pindex);
		if (alloc_refs)
			for (i = 0; i < pindex->entries; ++i) {
				WT_ERR(__wt_calloc_def(
				    session, 1, &pindex->index[i]));
				size += sizeof(WT_REF);
			}
		/*
		 * Error handling for the allocations above: this code is never
		 * entered by falling into it, only by the WT_ERR calls jumping
		 * to the label.  The index may be partially populated; slots
		 * that were never filled in are still zero from the zeroed
		 * allocation.
		 */
		if (0) {
err:			if ((pindex = WT_INTL_INDEX_COPY(page)) != NULL) {
				for (i = 0; i < pindex->entries; ++i)
					__wt_free(session, pindex->index[i]);
				__wt_free(session, pindex);
			}
			__wt_free(session, page);
			return (ret);
		}
		break;
	case WT_PAGE_COL_VAR:
		/* The WT_COL array is the memory following the WT_PAGE. */
		page->pg_var_recno = recno;
		page->pg_var_d = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE));
		page->pg_var_entries = alloc_entries;
		break;
	case WT_PAGE_ROW_LEAF:
		/* The WT_ROW array is the memory following the WT_PAGE. */
		page->pg_row_d = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE));
		page->pg_row_entries = alloc_entries;
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Increment the cache statistics. */
	__wt_cache_page_inmem_incr(session, page, size);
	(void)WT_ATOMIC_ADD8(cache->pages_inmem, 1);

	*pagep = page;
	return (0);
}
Exemple #24
0
/*
 * __wt_curfile_create --
 *	Create a cursor on the session's current btree handle, optionally
 *	configured as a bulk-load or random-retrieval cursor.
 */
int
__wt_curfile_create(WT_SESSION_IMPL *session,
    WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap,
    WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,	/* get-key */
	    __wt_cursor_get_value,	/* get-value */
	    __wt_cursor_set_key,	/* set-key */
	    __wt_cursor_set_value,	/* set-value */
	    __curfile_compare,		/* compare */
	    __curfile_equals,		/* equals */
	    __curfile_next,		/* next */
	    __curfile_prev,		/* prev */
	    __curfile_reset,		/* reset */
	    __curfile_search,		/* search */
	    __curfile_search_near,	/* search-near */
	    __curfile_insert,		/* insert */
	    __curfile_update,		/* update */
	    __curfile_remove,		/* remove */
	    __wt_cursor_reconfigure,	/* reconfigure */
	    __curfile_close);		/* close */
	WT_BTREE *btree;
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	size_t csize;

	/* The cursor interface must be the first field of a btree cursor. */
	WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);

	cbt = NULL;

	btree = S2BT(session);
	WT_ASSERT(session, btree != NULL);

	/* Bulk cursors use a different structure than plain btree cursors. */
	if (bulk)
		csize = sizeof(WT_CURSOR_BULK);
	else
		csize = sizeof(WT_CURSOR_BTREE);
	WT_RET(__wt_calloc(session, 1, csize, &cbt));
	cbt->btree = btree;

	/* Start from the static method table, then fill in the rest. */
	cursor = &cbt->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->internal_uri = btree->dhandle->name;
	cursor->key_format = btree->key_format;
	cursor->value_format = btree->value_format;

	if (bulk) {
		F_SET(cursor, WT_CURSTD_BULK);

		/* Optionally skip the validation of each bulk-loaded key. */
		WT_ERR(__wt_config_gets_def(
		    session, cfg, "skip_sort_check", 0, &cval));
		WT_ERR(__wt_curbulk_init(session,
		    (WT_CURSOR_BULK *)cbt, bitmap, cval.val != 0));
	}

	/*
	 * random_retrieval
	 * Random retrieval cursors only support next, reset and close.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
	if (cval.val != 0) {
		__wt_cursor_set_notsup(cursor);
		cursor->next = __curfile_next_random;
		cursor->reset = __curfile_reset;
	}

	/* Underlying btree initialization. */
	__wt_btcur_open(cbt);

	/*
	 * __wt_cursor_init is deliberately the last step that can fail, so
	 * there is nothing of its work to clean up on error.
	 */
	WT_ERR(__wt_cursor_init(
	    cursor, cursor->internal_uri, owner, cfg, cursorp));

	WT_STAT_FAST_CONN_INCR(session, cursor_create);
	WT_STAT_FAST_DATA_INCR(session, cursor_create);

	if (0) {
err:		__wt_free(session, cbt);
	}

	return (ret);
}
Exemple #25
0
/*
 * __wt_curjoin_join --
 *	Add a new join to a join cursor.
 */
int
__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range,
    uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count)
{
	WT_CURSOR_INDEX *cindex;
	WT_CURSOR_JOIN *child;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *entry;
	size_t len;
	uint8_t endrange;
	u_int i, ins, nonbloom;
	bool hasins, needbloom, nested, range_eq;

	entry = NULL;
	hasins = needbloom = false;
	ins = nonbloom = 0;				/* -Wuninitialized */

	if (cjoin->entries_next == 0) {
		if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION))
			F_SET(cjoin, WT_CURJOIN_DISJUNCTION);
	} else if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION) &&
	    !F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
		WT_RET_MSG(session, EINVAL,
		    "operation=or does not match previous operation=and");
	else if (!LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION) &&
	    F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
		WT_RET_MSG(session, EINVAL,
		    "operation=and does not match previous operation=or");

	nested = WT_PREFIX_MATCH(ref_cursor->uri, "join:");
	if (!nested)
		for (i = 0; i < cjoin->entries_next; i++) {
			if (cjoin->entries[i].index == idx &&
			    cjoin->entries[i].subjoin == NULL) {
				entry = &cjoin->entries[i];
				break;
			}
			if (!needbloom && i > 0 &&
			    !F_ISSET(&cjoin->entries[i],
			    WT_CURJOIN_ENTRY_BLOOM)) {
				needbloom = true;
				nonbloom = i;
			}
		}
	else {
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM))
			WT_RET_MSG(session, EINVAL,
			    "Bloom filters cannot be used with subjoins");
	}

	if (entry == NULL) {
		WT_RET(__wt_realloc_def(session, &cjoin->entries_allocated,
		    cjoin->entries_next + 1, &cjoin->entries));
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) {
			/*
			 * Reorder the list so that after the first entry,
			 * the Bloom filtered entries come next, followed by
			 * the non-Bloom entries.  Once the Bloom filters
			 * are built, determining membership via Bloom is
			 * faster than without Bloom, so we can answer
			 * membership questions more quickly, and with less
			 * I/O, with the Bloom entries first.
			 */
			entry = &cjoin->entries[nonbloom];
			memmove(entry + 1, entry,
			    (cjoin->entries_next - nonbloom) *
			    sizeof(WT_CURSOR_JOIN_ENTRY));
			memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY));
		}
		else
			entry = &cjoin->entries[cjoin->entries_next];
		entry->index = idx;
		entry->flags = flags;
		entry->count = count;
		entry->bloom_bit_count = bloom_bit_count;
		entry->bloom_hash_count = bloom_hash_count;
		++cjoin->entries_next;
	} else {
		/* Merge the join into an existing entry for this index */
		if (count != 0 && entry->count != 0 && entry->count != count)
			WT_RET_MSG(session, EINVAL,
			    "count=%" PRIu64 " does not match "
			    "previous count=%" PRIu64 " for this index",
			    count, entry->count);
		if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM))
			WT_RET_MSG(session, EINVAL,
			    "join has incompatible strategy "
			    "values for the same index");
		if (LF_MASK(WT_CURJOIN_ENTRY_FALSE_POSITIVES) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_FALSE_POSITIVES))
			WT_RET_MSG(session, EINVAL,
			    "join has incompatible bloom_false_positives "
			    "values for the same index");

		/*
		 * Check against other comparisons (we call them endpoints)
		 * already set up for this index.
		 * We allow either:
		 *   - one or more "eq" (with disjunction)
		 *   - exactly one "eq" (with conjunction)
		 *   - exactly one of "gt" or "ge" (conjunction or disjunction)
		 *   - exactly one of "lt" or "le" (conjunction or disjunction)
		 *   - one of "gt"/"ge" along with one of "lt"/"le"
		 *         (currently restricted to conjunction).
		 *
		 * Some other combinations, although expressible either do
		 * not make sense (X == 3 AND X == 5) or are reducible (X <
		 * 7 AND X < 9).  Other specific cases of (X < 7 OR X > 15)
		 * or (X == 4 OR X > 15) make sense but we don't handle yet.
		 */
		for (i = 0; i < entry->ends_next; i++) {
			end = &entry->ends[i];
			range_eq = (range == WT_CURJOIN_END_EQ);
			endrange = WT_CURJOIN_END_RANGE(end);
			if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
			    ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
			    (F_ISSET(end, WT_CURJOIN_END_LT) &&
			    ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
			    (endrange == WT_CURJOIN_END_EQ &&
			    (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT))
			    != 0))
				WT_RET_MSG(session, EINVAL,
				    "join has overlapping ranges");
			if (range == WT_CURJOIN_END_EQ &&
			    endrange == WT_CURJOIN_END_EQ &&
			    !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
				WT_RET_MSG(session, EINVAL,
				    "compare=eq can only be combined "
				    "using operation=or");

			/*
			 * Sort "gt"/"ge" to the front, followed by any number
			 * of "eq", and finally "lt"/"le".
			 */
			if (!hasins &&
			    ((range & WT_CURJOIN_END_GT) != 0 ||
			    (range == WT_CURJOIN_END_EQ &&
			    endrange != WT_CURJOIN_END_EQ &&
			    !F_ISSET(end, WT_CURJOIN_END_GT)))) {
				ins = i;
				hasins = true;
			}
		}
		/* All checks completed, merge any new configuration now */
		entry->count = count;
		entry->bloom_bit_count =
		    WT_MAX(entry->bloom_bit_count, bloom_bit_count);
		entry->bloom_hash_count =
		    WT_MAX(entry->bloom_hash_count, bloom_hash_count);
	}
	if (nested) {
		child = (WT_CURSOR_JOIN *)ref_cursor;
		entry->subjoin = child;
		child->parent = cjoin;
	} else {
		WT_RET(__curjoin_insert_endpoint(session, entry,
		    hasins ? ins : entry->ends_next, &end));
		end->cursor = ref_cursor;
		F_SET(end, range);

		if (entry->main == NULL && idx != NULL) {
			/*
			 * Open the main file with a projection of the
			 * indexed columns.
			 */
			WT_RET(__curjoin_open_main(session, cjoin, entry));

			/*
			 * When we are repacking index keys to remove the
			 * primary key, we never want to transform trailing
			 * 'u'.  Use no-op padding to force this.
			 */
			cindex = (WT_CURSOR_INDEX *)ref_cursor;
			len = strlen(cindex->iface.key_format) + 3;
			WT_RET(__wt_calloc(session, len, 1,
			    &entry->repack_format));
			WT_RET(__wt_snprintf(entry->repack_format,
			    len, "%s0x", cindex->iface.key_format));
		}
	}
	return (0);
}
Exemple #26
0
/*
 * __wt_page_alloc --
 *	Allocate a zero-filled in-memory page of the given type, with room
 *	for its per-entry array, and account for it in the cache statistics.
 *	On success the new page is returned in pagep; on failure pagep is
 *	left set to NULL.
 */
int
__wt_page_alloc(WT_SESSION_IMPL *session,
    uint8_t type, uint32_t alloc_entries, WT_PAGE **pagep)
{
	WT_CACHE *cache;
	WT_PAGE *page;
	size_t extra, size;
	void *trailer;

	*pagep = NULL;

	cache = S2C(session)->cache;

	/*
	 * Most page types are followed in the same allocation by an array of
	 * per-entry structures: figure out how many extra bytes that array
	 * needs.  Unknown page types are rejected here.
	 */
	extra = 0;
	switch (type) {
	case WT_PAGE_COL_FIX:
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		extra = alloc_entries * sizeof(WT_REF);
		break;
	case WT_PAGE_COL_VAR:
		extra = alloc_entries * sizeof(WT_COL);
		break;
	case WT_PAGE_ROW_LEAF:
		extra = alloc_entries * sizeof(WT_ROW);
		break;
	WT_ILLEGAL_VALUE(session);
	}
	size = sizeof(WT_PAGE) + extra;

	WT_RET(__wt_calloc(session, 1, size, &page));

	/*
	 * The per-entry array starts immediately after the WT_PAGE structure.
	 * The page type was validated by the switch above, so only the known
	 * types can reach this point.
	 */
	trailer = page + 1;
	if (type == WT_PAGE_COL_INT || type == WT_PAGE_ROW_INT)
		page->u.intl.t = trailer;
	else if (type == WT_PAGE_COL_VAR)
		page->u.col_var.d = trailer;
	else if (type == WT_PAGE_ROW_LEAF)
		page->u.row.d = trailer;

	/* Charge the new page to the cache. */
	__wt_cache_page_inmem_incr(session, page, size);
	(void)WT_ATOMIC_ADD(cache->pages_inmem, 1);

	/* The type is the only page field filled in here. */
	page->type = type;

	*pagep = page;
	return (0);
}
Exemple #27
0
/*
 * __wt_open --
 *	Open a file handle (Windows implementation).
 *
 *	If a handle with the same name is already on the connection's list,
 *	its reference count is incremented and it is returned.  Otherwise the
 *	file is opened twice (a primary and a secondary Windows handle), a new
 *	WT_FH is built and linked onto the connection's list.
 *
 *	session   - the calling session
 *	name      - the file name; also the key used to match existing handles
 *	ok_create - non-zero if the file may be created
 *	exclusive - with ok_create, selects CREATE_ALWAYS over CREATE_NEW
 *	dio_type  - the file type (WT_FILE_TYPE_xxx), used to select direct
 *	            I/O, write-through and random-access flags; directories
 *	            get a WT_FH with invalid Windows handles
 *	fhp       - the returned handle
 *
 *	Returns 0 on success, otherwise an error code.
 */
int
__wt_open(WT_SESSION_IMPL *session,
    const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
{
	DWORD dwCreationDisposition;
	HANDLE filehandle, filehandle_secondary;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *fh, *tfh;
	int direct_io, f, matched, share_mode;
	char *path;

	conn = S2C(session);
	fh = NULL;
	path = NULL;
	filehandle = INVALID_HANDLE_VALUE;
	filehandle_secondary = INVALID_HANDLE_VALUE;
	direct_io = 0;

	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name));

	/* Increment the reference count if we already have the file open. */
	matched = 0;
	__wt_spin_lock(session, &conn->fh_lock);
	TAILQ_FOREACH(tfh, &conn->fhqh, q)
		if (strcmp(name, tfh->name) == 0) {
			++tfh->ref;
			*fhp = tfh;
			matched = 1;
			break;
		}
	__wt_spin_unlock(session, &conn->fh_lock);
	if (matched)
		return (0);

	/* For directories, create empty file handles with invalid handles */
	if (dio_type == WT_FILE_TYPE_DIRECTORY) {
		goto setupfh;
	}

	WT_RET(__wt_filename(session, name, &path));

	share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
	/*
	 * Security:
	 * The application may spawn a new process, and we don't want another
	 * process to have access to our file handles.
	 *
	 * TODO: Set tighter file permissions but set bInheritHandle to false
	 * to prevent inheritance
	 */

	f = FILE_ATTRIBUTE_NORMAL;

	dwCreationDisposition = 0;
	if (ok_create) {
		dwCreationDisposition = CREATE_NEW;
		if (exclusive)
			dwCreationDisposition = CREATE_ALWAYS;
	} else
		dwCreationDisposition = OPEN_EXISTING;

	if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
		f |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
		direct_io = 1;
	}

	if (dio_type == WT_FILE_TYPE_LOG &&
	    FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
		f |= FILE_FLAG_WRITE_THROUGH;
	}

	/* Disable read-ahead on trees: it slows down random read workloads. */
	if (dio_type == WT_FILE_TYPE_DATA ||
	    dio_type == WT_FILE_TYPE_CHECKPOINT)
		f |= FILE_FLAG_RANDOM_ACCESS;

	filehandle = CreateFileA(path,
				(GENERIC_READ | GENERIC_WRITE),
				share_mode,
				NULL,
				dwCreationDisposition,
				f,
				NULL);
	if (filehandle == INVALID_HANDLE_VALUE) {
		/*
		 * CREATE_NEW fails if the file already exists: when creation
		 * was merely permitted, fall back to opening the existing
		 * file.
		 */
		if (GetLastError() == ERROR_FILE_EXISTS && ok_create)
			filehandle = CreateFileA(path,
						(GENERIC_READ | GENERIC_WRITE),
						share_mode,
						NULL,
						OPEN_EXISTING,
						f,
						NULL);

		if (filehandle == INVALID_HANDLE_VALUE)
			WT_ERR_MSG(session, __wt_errno(),
			    direct_io ?
			    "%s: open failed with direct I/O configured, some "
			    "filesystem types do not support direct I/O" :
			    "%s", path);
	}

	/*
	 * Open a second handle to file to support allocation/truncation
	 * concurrently with reads on the file. Writes would also move the file
	 * pointer.
	 */
	filehandle_secondary = CreateFileA(path,
	    (GENERIC_READ | GENERIC_WRITE),
	    share_mode,
	    NULL,
	    OPEN_EXISTING,
	    f,
	    NULL);
	/*
	 * Check the secondary handle, not the primary one: the primary handle
	 * is known to be valid at this point, so testing it would let a failed
	 * secondary open go unnoticed.
	 */
	if (filehandle_secondary == INVALID_HANDLE_VALUE)
		WT_ERR_MSG(session, __wt_errno(),
		    "open failed for secondary handle: %s", path);

setupfh:
	WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh));
	WT_ERR(__wt_strdup(session, name, &fh->name));
	fh->filehandle = filehandle;
	fh->filehandle_secondary = filehandle_secondary;
	fh->ref = 1;
	fh->direct_io = direct_io;

	/* Set the file's size. */
	if (dio_type != WT_FILE_TYPE_DIRECTORY)
		WT_ERR(__wt_filesize(session, fh, &fh->size));

	/* Configure file extension. */
	if (dio_type == WT_FILE_TYPE_DATA ||
	    dio_type == WT_FILE_TYPE_CHECKPOINT)
		fh->extend_len = conn->data_extend_len;

	/* Configure fallocate/posix_fallocate calls. */
	__wt_fallocate_config(session, fh);

	/*
	 * Repeat the check for a match, but then link onto the database's list
	 * of files.
	 */
	matched = 0;
	__wt_spin_lock(session, &conn->fh_lock);
	TAILQ_FOREACH(tfh, &conn->fhqh, q)
		if (strcmp(name, tfh->name) == 0) {
			++tfh->ref;
			*fhp = tfh;
			matched = 1;
			break;
		}
	if (!matched) {
		TAILQ_INSERT_TAIL(&conn->fhqh, fh, q);
		WT_STAT_FAST_CONN_INCR(session, file_open);

		*fhp = fh;
	}
	__wt_spin_unlock(session, &conn->fh_lock);

	/*
	 * Discard our handle if another one with this name was added to the
	 * list in the meantime (ret is still 0), or if we jumped here on
	 * error.
	 */
	if (matched) {
err:		if (fh != NULL) {
			__wt_free(session, fh->name);
			__wt_free(session, fh);
		}
		if (filehandle != INVALID_HANDLE_VALUE)
			(void)CloseHandle(filehandle);
		if (filehandle_secondary != INVALID_HANDLE_VALUE)
			(void)CloseHandle(filehandle_secondary);
	}

	__wt_free(session, path);
	return (ret);
}
Exemple #28
0
/*
 * __curjoin_init_next --
 *	Initialize the cursor join when the next function is first called.
 *
 *	Opens the join's main cursor (applying the join's projection, if
 *	any), recursively initializes nested joins, initializes the key of
 *	every endpoint, and builds any Bloom filters the entries need.
 *
 *	session  - the calling session
 *	cjoin    - the join cursor being initialized
 *	iterable - true if the entries seen so far may be iterated; cleared
 *	           after the first non-nested entry unless the join is a
 *	           disjunction
 *
 *	Returns 0 on success, otherwise an error code.
 */
static int
__curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    bool iterable)
{
	WT_BLOOM *bloom;
	WT_CURSOR *origcur;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
	WT_DECL_RET;
	size_t size;
	uint32_t f, k;
	char *mainbuf;
	const char *def_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), NULL };
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };
	const char **config, *proj, *urimain;

	/* Resources released in the error path start out unset. */
	bloom = NULL;
	mainbuf = NULL;
	if (cjoin->entries_next == 0)
		WT_RET_MSG(session, EINVAL,
		    "join cursor has not yet been joined with any other "
		    "cursors");

	/* Get a consistent view of our subordinate cursors if appropriate. */
	__wt_txn_cursor_op(session);

	if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
		config = &raw_cfg[0];
	else
		config = &def_cfg[0];

	/* The main cursor's URI is the table name plus any projection. */
	urimain = cjoin->table->iface.name;
	if ((proj = cjoin->projection) != NULL) {
		size = strlen(urimain) + strlen(proj) + 1;
		WT_ERR(__wt_calloc(session, size, 1, &mainbuf));
		WT_ERR(__wt_snprintf(mainbuf, size, "%s%s", urimain, proj));
		urimain = mainbuf;
	}
	WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin, config,
	    &cjoin->main));

	jeend = &cjoin->entries[cjoin->entries_next];
	for (je = cjoin->entries; je < jeend; je++) {
		/* Nested joins initialize themselves. */
		if (je->subjoin != NULL) {
			WT_ERR(__curjoin_init_next(session, je->subjoin,
			    iterable));
			continue;
		}
		__wt_stat_join_init_single(&je->stats);
		/*
		 * For a single compare=le/lt endpoint in any entry that may
		 * be iterated, construct a companion compare=ge endpoint
		 * that will actually be iterated.
		 */
		if (iterable && je->ends_next == 1 &&
		    F_ISSET(&je->ends[0], WT_CURJOIN_END_LT)) {
			origcur = je->ends[0].cursor;
			WT_ERR(__curjoin_insert_endpoint(session, je, 0, &end));
			WT_ERR(__wt_open_cursor(session, origcur->uri,
			    (WT_CURSOR *)cjoin,
			    F_ISSET(origcur, WT_CURSTD_RAW) ? raw_cfg : def_cfg,
			    &end->cursor));
			end->flags = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ |
			    WT_CURJOIN_END_OWN_CURSOR;
			WT_ERR(end->cursor->next(end->cursor));
			F_CLR(je, WT_CURJOIN_ENTRY_DISJUNCTION);
		}
		for (end = &je->ends[0]; end < &je->ends[je->ends_next];
		     end++)
			WT_ERR(__curjoin_endpoint_init_key(session, je, end));

		/*
		 * Do any needed Bloom filter initialization.  Ignore Bloom
		 * filters for entries that will be iterated.  They won't
		 * help since these entries either don't need an inclusion
		 * check or are doing any needed check during the iteration.
		 */
		if (!iterable && F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
			if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
				WT_ERR_MSG(session, EINVAL,
				    "join cursors with Bloom filters cannot be "
				    "used with read-uncommitted isolation");
			if (je->bloom == NULL) {
				/*
				 * Look for compatible filters to be shared,
				 * pick compatible numbers for bit counts
				 * and number of hashes.
				 */
				f = je->bloom_bit_count;
				k = je->bloom_hash_count;
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						f = WT_MAX(
						    je2->bloom_bit_count, f);
						k = WT_MAX(
						    je2->bloom_hash_count, k);
					}
				je->bloom_bit_count = f;
				je->bloom_hash_count = k;
				WT_ERR(__wt_bloom_create(session, NULL,
				    NULL, je->count, f, k, &je->bloom));
				F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM);
				WT_ERR(__curjoin_init_bloom(session, cjoin,
				    je, je->bloom));
				/*
				 * Share the Bloom filter, making all
				 * config info consistent.
				 */
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						WT_ASSERT(session,
						    je2->bloom == NULL);
						je2->bloom = je->bloom;
						je2->bloom_bit_count = f;
						je2->bloom_hash_count = k;
					}
			} else {
				/*
				 * Create a temporary filter that we'll
				 * merge into the shared one.  The Bloom
				 * parameters of the two filters must match.
				 *
				 * The temporary filter is tracked in "bloom"
				 * until it has been closed, so a failure
				 * while filling or merging it doesn't leak
				 * it: the error path closes whatever is
				 * still outstanding.
				 */
				WT_ERR(__wt_bloom_create(session, NULL,
				    NULL, je->count, je->bloom_bit_count,
				    je->bloom_hash_count, &bloom));
				WT_ERR(__curjoin_init_bloom(session, cjoin,
				    je, bloom));
				WT_ERR(__wt_bloom_intersection(je->bloom,
				    bloom));
				/*
				 * Clear the pointer before checking the
				 * result so the error path never closes the
				 * filter a second time.
				 */
				ret = __wt_bloom_close(bloom);
				bloom = NULL;
				if (ret != 0)
					goto err;
			}
		}
		if (!F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
			iterable = false;
	}
	F_SET(cjoin, WT_CURJOIN_INITIALIZED);

	/* Reached on both success and failure. */
err:	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	__wt_free(session, mainbuf);
	return (ret);
}
Exemple #29
0
/*
 * __rec_page_dirty_update --
 *	Update a dirty page's reference on eviction.
 *
 *	The reconciliation result recorded in the page's modify structure
 *	(empty, 1-for-1 replacement or split) selects how the parent's
 *	reference to the page is rewritten.
 *
 *	Returns 0 on success, otherwise an error code; an unexpected
 *	reconciliation result is handled by WT_ILLEGAL_VALUE.
 */
static int
__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_ADDR *addr;
	WT_PAGE_MODIFY *mod;
	WT_REF *parent_ref;

	mod = page->modify;
	parent_ref = page->ref;

	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
	case WT_PM_REC_EMPTY:				/* Page is empty */
		/* Discard the old address if it was separately allocated. */
		if (parent_ref->addr != NULL &&
		    __wt_off_page(page->parent, parent_ref->addr)) {
			__wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
			__wt_free(session, parent_ref->addr);
		}

		/*
		 * Update the parent to reference an empty page.
		 *
		 * Set the transaction ID to WT_TXN_NONE because the fact that
		 * reconciliation left the page "empty" means there's no older
		 * transaction in the system that might need to see an earlier
		 * version of the page.  It isn't necessary (WT_TXN_NONE is 0),
		 * but it's the right thing to do.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		parent_ref->page = NULL;
		parent_ref->addr = NULL;
		parent_ref->txnid = WT_TXN_NONE;
		WT_PUBLISH(parent_ref->state, WT_REF_DELETED);
		break;
	case WT_PM_REC_REPLACE: 			/* 1-for-1 page swap */
		/*
		 * Allocate the replacement address before discarding the old
		 * one: if the allocation fails we return with the parent's
		 * reference untouched, rather than with its address already
		 * freed.
		 */
		WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));

		/* Discard the old address if it was separately allocated. */
		if (parent_ref->addr != NULL &&
		    __wt_off_page(page->parent, parent_ref->addr)) {
			__wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
			__wt_free(session, parent_ref->addr);
		}

		/*
		 * Update the parent to reference the replacement page.
		 *
		 * The replacement address moves from the modify structure to
		 * the newly allocated WT_ADDR; clear the modify structure's
		 * copy so the address has a single owner.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		*addr = mod->u.replace;
		mod->u.replace.addr = NULL;
		mod->u.replace.size = 0;

		parent_ref->page = NULL;
		parent_ref->addr = addr;
		WT_PUBLISH(parent_ref->state, WT_REF_DISK);
		break;
	case WT_PM_REC_SPLIT:				/* Page split */
		/*
		 * Update the parent to reference new internal page(s).
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		parent_ref->page = mod->u.split;
		WT_PUBLISH(parent_ref->state, WT_REF_MEM);

		/* Clear the reference else discarding the page will free it. */
		mod->u.split = NULL;
		F_CLR(mod, WT_PM_REC_SPLIT);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
Exemple #30
0
/*
 * __curfile_create --
 *	Build a file cursor on the session's current btree handle.  The
 *	cursor is allocated as a bulk cursor when bulk is set, otherwise as
 *	a plain btree cursor, configured from cfg and returned in cursorp.
 *	On failure the cursor is closed and cursorp is set to NULL.
 */
static int
__curfile_create(WT_SESSION_IMPL *session,
    WT_CURSOR *owner, const char *cfg[], bool bulk, bool bitmap,
    WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __curfile_compare,			/* compare */
	    __curfile_equals,			/* equals */
	    __curfile_next,			/* next */
	    __curfile_prev,			/* prev */
	    __curfile_reset,			/* reset */
	    __curfile_search,			/* search */
	    __curfile_search_near,		/* search-near */
	    __curfile_insert,			/* insert */
	    __wt_cursor_modify_notsup,		/* modify */
	    __curfile_update,			/* update */
	    __curfile_remove,			/* remove */
	    __curfile_reserve,			/* reserve */
	    __wt_cursor_reconfigure,		/* reconfigure */
	    __curfile_close);			/* close */
	WT_BTREE *btree;
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	size_t alloc_size;

	/* The cursor interface must be the first field of a btree cursor. */
	WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);

	cbt = NULL;

	btree = S2BT(session);
	WT_ASSERT(session, btree != NULL);

	/* Bulk cursors are a larger structure than plain btree cursors. */
	if (bulk)
		alloc_size = sizeof(WT_CURSOR_BULK);
	else
		alloc_size = sizeof(WT_CURSOR_BTREE);
	WT_RET(__wt_calloc(session, 1, alloc_size, &cbt));

	/* Start from the static method table, then fill in the rest. */
	cbt->btree = btree;
	cursor = &cbt->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->internal_uri = btree->dhandle->name;
	cursor->key_format = btree->key_format;
	cursor->value_format = btree->value_format;

	/*
	 * Bump the data-source's in-use counter immediately: every failure
	 * path below closes the cursor, and closing the cursor decrements
	 * the counter.
	 */
	__wt_cursor_dhandle_incr_use(session);

	if (session->dhandle->checkpoint != NULL) {
		F_SET(cbt, WT_CBT_NO_TXN);
	}

	if (bulk) {
		F_SET(cursor, WT_CURSTD_BULK);

		/* Validation of each bulk-loaded key may be switched off. */
		WT_ERR(__wt_config_gets_def(
		    session, cfg, "skip_sort_check", 0, &cval));
		WT_ERR(__wt_curbulk_init(session,
		    (WT_CURSOR_BULK *)cbt, bitmap, cval.val != 0));
	}

	/*
	 * Random retrieval is row-store only, and random retrieval cursors
	 * support only a limited set of methods.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
	if (cval.val != 0) {
		if (WT_CURSOR_RECNO(cursor)) {
			WT_ERR_MSG(session, ENOTSUP,
			    "next_random configuration not supported for "
			    "column-store objects");
		}

		/* Disable everything, then restore next and reset. */
		__wt_cursor_set_notsup(cursor);
		cursor->next = __wt_curfile_next_random;
		cursor->reset = __curfile_reset;

		WT_ERR(__wt_config_gets_def(
		    session, cfg, "next_random_sample_size", 0, &cval));
		if (cval.val != 0) {
			cbt->next_random_sample_size = (u_int)cval.val;
		}
	}

	/* Underlying btree initialization. */
	__wt_btcur_open(cbt);

	/*
	 * WT_CURSOR.modify is supported on 'u' value formats, but the btree
	 * fast-path depends on log file format changes that aren't available
	 * in all versions.
	 */
	if (WT_STREQ(cursor->value_format, "u") &&
	    S2C(session)->compat_major >= WT_LOG_V2) {
		cursor->modify = __curfile_modify;
	}

	WT_ERR(__wt_cursor_init(
	    cursor, cursor->internal_uri, owner, cfg, cursorp));

	WT_STAT_CONN_INCR(session, cursor_create);
	WT_STAT_DATA_INCR(session, cursor_create);

	return (0);

err:	/*
	 * Our caller expects to release the data handle if we fail.
	 * Disconnect it from the cursor before closing.
	 */
	if (session->dhandle != NULL) {
		__wt_cursor_dhandle_decr_use(session);
	}
	cbt->btree = NULL;
	WT_TRET(__curfile_close(cursor));
	*cursorp = NULL;

	return (ret);
}