Esempio n. 1
0
/*
 * __wt_realloc_aligned --
 *	ANSI realloc function that aligns to buffer boundaries, configured with
 *	the "buffer_alignment" key to wiredtiger_open.
 */
int
__wt_realloc_aligned(WT_SESSION_IMPL *session,
    size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
{
#if defined(HAVE_POSIX_MEMALIGN)
	WT_DECL_RET;

	/*
	 * !!!
	 * This function MUST handle a NULL WT_SESSION_IMPL handle.
	 */
	if (session != NULL && S2C(session)->buffer_alignment > 0) {
		void *p, *newp;
		size_t bytes_allocated;

		/*
		 * Sometimes we're allocating memory and we don't care about the
		 * final length -- bytes_allocated_ret may be NULL.
		 */
		p = *(void **)retp;
		bytes_allocated =
		    (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
		WT_ASSERT(session,
		    (p == NULL && bytes_allocated == 0) ||
		    (p != NULL &&
		    (bytes_allocated_ret == NULL || bytes_allocated != 0)));
		WT_ASSERT(session, bytes_to_allocate != 0);
		WT_ASSERT(session, bytes_allocated < bytes_to_allocate);

		/*
		 * We are going to allocate an aligned buffer.  When we do this
		 * repeatedly, the allocator is expected to start on a boundary
		 * each time, account for that additional space by never asking
		 * for less than a full alignment size.  The primary use case
		 * for aligned buffers is Linux direct I/O, which requires that
		 * the size be a multiple of the alignment anyway.
		 */
		bytes_to_allocate =
		    WT_ALIGN(bytes_to_allocate, S2C(session)->buffer_alignment);

		WT_STAT_FAST_CONN_INCR(session, memory_allocation);

		if ((ret = posix_memalign(&newp,
		    S2C(session)->buffer_alignment,
		    bytes_to_allocate)) != 0)
			WT_RET_MSG(session, ret, "memory allocation");

		if (p != NULL)
			memcpy(newp, p, bytes_allocated);
		__wt_free(session, p);
		p = newp;

		/* Clear the allocated memory (see above). */
		memset((uint8_t *)p + bytes_allocated, 0,
		    bytes_to_allocate - bytes_allocated);

		/* Update caller's bytes allocated value. */
		if (bytes_allocated_ret != NULL)
			*bytes_allocated_ret = bytes_to_allocate;

		*(void **)retp = p;
		return (0);
	}
#endif
	/*
	 * If there is no posix_memalign function, or no alignment configured,
	 * fall back to realloc.
	 *
	 * Windows note: Visual C CRT memalign does not match Posix behavior
	 * and would also double each allocation so it is bad for memory use
	 */
	return (__wt_realloc(
	    session, bytes_allocated_ret, bytes_to_allocate, retp));
}
Esempio n. 2
0
/*
 * __wt_lsm_meta_read --
 *	Read the metadata for an LSM tree.
 */
int
__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_CONFIG cparser, lparser;
	WT_CONFIG_ITEM ck, cv, fileconf, lk, lv, metadata;
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	char *lsmconfig;
	u_int nchunks;

	chunk = NULL;			/* -Wconditional-uninitialized */

	WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconfig));
	WT_ERR(__wt_config_init(session, &cparser, lsmconfig));
	while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
		if (WT_STRING_MATCH("key_format", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->key_format);
			WT_ERR(__wt_strndup(session,
			    cv.str, cv.len, &lsm_tree->key_format));
		} else if (WT_STRING_MATCH("value_format", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->value_format);
			WT_ERR(__wt_strndup(session,
			    cv.str, cv.len, &lsm_tree->value_format));
		} else if (WT_STRING_MATCH("collator", ck.str, ck.len)) {
			if (cv.len == 0 ||
			    WT_STRING_CASE_MATCH("none", cv.str, cv.len))
				continue;
			/*
			 * Extract the application-supplied metadata (if any)
			 * from the file configuration.
			 */
			WT_ERR(__wt_config_getones(
			    session, lsmconfig, "file_config", &fileconf));
			WT_CLEAR(metadata);
			WT_ERR_NOTFOUND_OK(__wt_config_subgets(
			    session, &fileconf, "app_metadata", &metadata));
			WT_ERR(__wt_collator_config(session, lsm_tree->name,
			    &cv, &metadata,
			    &lsm_tree->collator, &lsm_tree->collator_owned));
			WT_ERR(__wt_strndup(session,
			    cv.str, cv.len, &lsm_tree->collator_name));
		} else if (WT_STRING_MATCH("bloom_config", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->bloom_config);
			/* Don't include the brackets. */
			WT_ERR(__wt_strndup(session,
			    cv.str + 1, cv.len - 2, &lsm_tree->bloom_config));
		} else if (WT_STRING_MATCH("file_config", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->file_config);
			/* Don't include the brackets. */
			WT_ERR(__wt_strndup(session,
			    cv.str + 1, cv.len - 2, &lsm_tree->file_config));
		} else if (WT_STRING_MATCH("auto_throttle", ck.str, ck.len)) {
			if (cv.val)
				F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
			else
				F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
		} else if (WT_STRING_MATCH("bloom", ck.str, ck.len))
			lsm_tree->bloom = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("bloom_bit_count", ck.str, ck.len))
			lsm_tree->bloom_bit_count = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("bloom_hash_count", ck.str, ck.len))
			lsm_tree->bloom_hash_count = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("chunk_max", ck.str, ck.len))
			lsm_tree->chunk_max = (uint64_t)cv.val;
		else if (WT_STRING_MATCH("chunk_size", ck.str, ck.len))
			lsm_tree->chunk_size = (uint64_t)cv.val;
		else if (WT_STRING_MATCH("merge_max", ck.str, ck.len))
			lsm_tree->merge_max = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("merge_min", ck.str, ck.len))
			lsm_tree->merge_min = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("last", ck.str, ck.len))
			lsm_tree->last = (u_int)cv.val;
		else if (WT_STRING_MATCH("chunks", ck.str, ck.len)) {
			WT_ERR(__wt_config_subinit(session, &lparser, &cv));
			for (nchunks = 0; (ret =
			    __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
				if (WT_STRING_MATCH("id", lk.str, lk.len)) {
					WT_ERR(__wt_realloc_def(session,
					    &lsm_tree->chunk_alloc,
					    nchunks + 1, &lsm_tree->chunk));
					WT_ERR(
					    __wt_calloc_one(session, &chunk));
					lsm_tree->chunk[nchunks++] = chunk;
					chunk->id = (uint32_t)lv.val;
					WT_ERR(__wt_lsm_tree_chunk_name(session,
					    lsm_tree, chunk->id, &chunk->uri));
					F_SET(chunk,
					    WT_LSM_CHUNK_ONDISK |
					    WT_LSM_CHUNK_STABLE);
				} else if (WT_STRING_MATCH(
				    "bloom", lk.str, lk.len)) {
					WT_ERR(__wt_lsm_tree_bloom_name(
					    session, lsm_tree,
					    chunk->id, &chunk->bloom_uri));
					F_SET(chunk, WT_LSM_CHUNK_BLOOM);
					continue;
				} else if (WT_STRING_MATCH(
				    "chunk_size", lk.str, lk.len)) {
					chunk->size = (uint64_t)lv.val;
					continue;
				} else if (WT_STRING_MATCH(
				    "count", lk.str, lk.len)) {
					chunk->count = (uint64_t)lv.val;
					continue;
				} else if (WT_STRING_MATCH(
				    "generation", lk.str, lk.len)) {
					chunk->generation = (uint32_t)lv.val;
					continue;
				}
			}
			WT_ERR_NOTFOUND_OK(ret);
			lsm_tree->nchunks = nchunks;
		} else if (WT_STRING_MATCH("old_chunks", ck.str, ck.len)) {
			WT_ERR(__wt_config_subinit(session, &lparser, &cv));
			for (nchunks = 0; (ret =
			    __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
				if (WT_STRING_MATCH("bloom", lk.str, lk.len)) {
					WT_ERR(__wt_strndup(session,
					    lv.str, lv.len, &chunk->bloom_uri));
					F_SET(chunk, WT_LSM_CHUNK_BLOOM);
					continue;
				}
				WT_ERR(__wt_realloc_def(session,
				    &lsm_tree->old_alloc, nchunks + 1,
				    &lsm_tree->old_chunks));
				WT_ERR(__wt_calloc_one(session, &chunk));
				lsm_tree->old_chunks[nchunks++] = chunk;
				WT_ERR(__wt_strndup(session,
				    lk.str, lk.len, &chunk->uri));
				F_SET(chunk, WT_LSM_CHUNK_ONDISK);
			}
			WT_ERR_NOTFOUND_OK(ret);
			lsm_tree->nold_chunks = nchunks;
		/* Values included for backward compatibility */
		} else if (WT_STRING_MATCH("merge_threads", ck.str, ck.len)) {
		} else
			WT_ERR(__wt_illegal_value(session, "LSM metadata"));
	}
	WT_ERR_NOTFOUND_OK(ret);

	/*
	 * If the default merge_min was not overridden, calculate it now.  We
	 * do this here so that trees created before merge_min was added get a
	 * sane value.
	 */
	if (lsm_tree->merge_min < 2)
		lsm_tree->merge_min = WT_MAX(2, lsm_tree->merge_max / 2);

err:	__wt_free(session, lsmconfig);
	return (ret);
}
Esempio n. 3
0
/*
 * __wt_lsm_merge --
 *	Merge a set of chunks of an LSM tree.
 */
int
__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
{
	WT_BLOOM *bloom;
	WT_CURSOR *dest, *src;
	WT_DECL_RET;
	WT_ITEM key, value;
	WT_LSM_CHUNK *chunk;
	uint32_t generation;
	uint64_t insert_count, record_count;
	u_int dest_id, end_chunk, i, nchunks, start_chunk, start_id, verb;
	int tret;
	bool created_chunk, create_bloom, locked, in_sync;
	const char *cfg[3];
	const char *drop_cfg[] =
	    { WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL };

	bloom = NULL;
	chunk = NULL;
	dest = src = NULL;
	created_chunk = create_bloom = locked = in_sync = false;

	/* Fast path if it's obvious no merges could be done. */
	if (lsm_tree->nchunks < lsm_tree->merge_min &&
	    lsm_tree->merge_aggressiveness < WT_LSM_AGGRESSIVE_THRESHOLD)
		return (WT_NOTFOUND);

	/*
	 * Use the lsm_tree lock to read the chunks (so no switches occur), but
	 * avoid holding it while the merge is in progress: that may take a
	 * long time.
	 */
	__wt_lsm_tree_writelock(session, lsm_tree);
	locked = true;

	WT_ERR(__lsm_merge_span(session,
	    lsm_tree, id, &start_chunk, &end_chunk, &record_count));
	nchunks = (end_chunk + 1) - start_chunk;

	WT_ASSERT(session, nchunks > 0);
	start_id = lsm_tree->chunk[start_chunk]->id;

	/* Find the merge generation. */
	for (generation = 0, i = 0; i < nchunks; i++)
		generation = WT_MAX(generation,
		    lsm_tree->chunk[start_chunk + i]->generation + 1);

	__wt_lsm_tree_writeunlock(session, lsm_tree);
	locked = false;

	/* Allocate an ID for the merge. */
	dest_id = __wt_atomic_add32(&lsm_tree->last, 1);

	/*
	 * We only want to do the chunk loop if we're running with verbose,
	 * so we wrap these statements in the conditional.  Avoid the loop
	 * in the normal path.
	 */
	if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
		__wt_verbose(session, WT_VERB_LSM,
		    "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
		    ", generation %" PRIu32,
		    lsm_tree->name,
		    start_chunk, end_chunk, dest_id, record_count, generation);
		for (verb = start_chunk; verb < end_chunk + 1; verb++)
			__wt_verbose(session, WT_VERB_LSM,
			    "Merging %s: Chunk[%u] id %" PRIu32
			    ", gen: %" PRIu32
			    ", size: %" PRIu64 ", records: %" PRIu64,
			    lsm_tree->name, verb, lsm_tree->chunk[verb]->id,
			    lsm_tree->chunk[verb]->generation,
			    lsm_tree->chunk[verb]->size,
			    lsm_tree->chunk[verb]->count);
	}

	WT_ERR(__wt_calloc_one(session, &chunk));
	created_chunk = true;
	chunk->id = dest_id;

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
	    (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
	    start_chunk > 0) && record_count > 0)
		create_bloom = true;

	/*
	 * Special setup for the merge cursor:
	 * first, reset to open the dependent cursors;
	 * then restrict the cursor to a specific number of chunks;
	 * then set MERGE so the cursor doesn't track updates to the tree.
	 */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));

	WT_WITH_SCHEMA_LOCK(session, ret,
	    ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
	WT_ERR(ret);
	if (create_bloom) {
		WT_ERR(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

		WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
		    lsm_tree->bloom_config,
		    record_count, lsm_tree->bloom_bit_count,
		    lsm_tree->bloom_hash_count, &bloom));
	}

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
	cfg[1] = "bulk,raw,skip_sort_check";
	cfg[2] = NULL;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));

#define	LSM_MERGE_CHECK_INTERVAL	WT_THOUSAND
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
			if (!lsm_tree->active)
				WT_ERR(EINTR);

			WT_STAT_CONN_INCRV(session,
			    lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
			++lsm_tree->merge_progressing;
		}

		WT_ERR(src->get_key(src, &key));
		dest->set_key(dest, &key);
		WT_ERR(src->get_value(src, &value));
		dest->set_value(dest, &value);
		WT_ERR(dest->insert(dest));
		if (create_bloom)
			__wt_bloom_insert(bloom, &key);
	}
	WT_ERR_NOTFOUND_OK(ret);

	WT_STAT_CONN_INCRV(session,
	    lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
	++lsm_tree->merge_progressing;
	__wt_verbose(session, WT_VERB_LSM,
	    "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted",
	    record_count, insert_count);

	/*
	 * Closing and syncing the files can take a while.  Set the
	 * merge_syncing field so that compact knows it is still in
	 * progress.
	 */
	(void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1);
	in_sync = true;
	/*
	 * We've successfully created the new chunk.  Now install it.  We need
	 * to ensure that the NO_CACHE flag is cleared and the bloom filter
	 * is closed (even if a step fails), so track errors but don't return
	 * until we've cleaned up.
	 */
	WT_TRET(src->close(src));
	WT_TRET(dest->close(dest));
	src = dest = NULL;

	F_CLR(session, WT_SESSION_NO_CACHE);

	/*
	 * We're doing advisory reads to fault the new trees into cache.
	 * Don't block if the cache is full: our next unit of work may be to
	 * discard some trees to free space.
	 */
	F_SET(session, WT_SESSION_NO_EVICTION);

	if (create_bloom) {
		if (ret == 0)
			WT_TRET(__wt_bloom_finalize(bloom));

		/*
		 * Read in a key to make sure the Bloom filters btree handle is
		 * open before it becomes visible to application threads.
		 * Otherwise application threads will stall while it is opened
		 * and internal pages are read into cache.
		 */
		if (ret == 0) {
			WT_CLEAR(key);
			WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
		}

		WT_TRET(__wt_bloom_close(bloom));
		bloom = NULL;
	}
	WT_ERR(ret);

	/*
	 * Open a handle on the new chunk before application threads attempt
	 * to access it, opening it pre-loads internal pages into the file
	 * system cache.
	 */
	cfg[1] = "checkpoint=" WT_CHECKPOINT;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
	WT_TRET(dest->close(dest));
	dest = NULL;
	++lsm_tree->merge_progressing;
	(void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
	in_sync = false;
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
	__wt_lsm_tree_writelock(session, lsm_tree);
	locked = true;

	/*
	 * Check whether we raced with another merge, and adjust the chunk
	 * array offset as necessary.
	 */
	if (start_chunk >= lsm_tree->nchunks ||
	    lsm_tree->chunk[start_chunk]->id != start_id)
		for (start_chunk = 0;
		    start_chunk < lsm_tree->nchunks;
		    start_chunk++)
			if (lsm_tree->chunk[start_chunk]->id == start_id)
				break;

	/*
	 * It is safe to error out here - since the update can only fail
	 * prior to making updates to the tree.
	 */
	WT_ERR(__wt_lsm_merge_update_tree(
	    session, lsm_tree, start_chunk, nchunks, chunk));

	if (create_bloom)
		F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	chunk->count = insert_count;
	chunk->generation = generation;
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);

	/*
	 * We have no current way of continuing if the metadata update fails,
	 * so we will panic in that case.  Put some effort into cleaning up
	 * after ourselves here - so things have a chance of shutting down.
	 *
	 * Any errors that happened after the tree was locked are
	 * fatal - we can't guarantee the state of the tree.
	 */
	if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
		WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");

	lsm_tree->dsk_gen++;

	/* Update the throttling while holding the tree lock. */
	__wt_lsm_tree_throttle(session, lsm_tree, true);

	/* Schedule a pass to discard old chunks */
	WT_ERR(__wt_lsm_manager_push_entry(
	    session, WT_LSM_WORK_DROP, 0, lsm_tree));

err:	if (locked)
		__wt_lsm_tree_writeunlock(session, lsm_tree);
	if (in_sync)
		(void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
	if (src != NULL)
		WT_TRET(src->close(src));
	if (dest != NULL)
		WT_TRET(dest->close(dest));
	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	if (ret != 0 && created_chunk) {
		/* Drop the newly-created files on error. */
		if (chunk->uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session, tret,
			    tret = __wt_schema_drop(
			    session, chunk->uri, drop_cfg));
			WT_TRET(tret);
		}
		if (create_bloom && chunk->bloom_uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session, tret,
			    tret = __wt_schema_drop(
			    session, chunk->bloom_uri, drop_cfg));
			WT_TRET(tret);
		}
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, chunk);

		if (ret == EINTR)
			__wt_verbose(session, WT_VERB_LSM,
			    "Merge aborted due to close");
		else
			__wt_verbose(session, WT_VERB_LSM,
			    "Merge failed with %s",
			   __wt_strerror(session, ret, NULL, 0));
	}
	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	return (ret);
}
Esempio n. 4
0
/*
 * __wt_page_out --
 *	Discard an in-memory page, freeing all memory associated with it.
 */
void
__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
{
	WT_PAGE *page;

	/*
	 * When a page is discarded, it's been disconnected from its parent and
	 * its parent's WT_REF structure may now point to a different page.
	 * Make sure we don't accidentally use the page itself or any other
	 * information.
	 */
	page = *pagep;
	*pagep = NULL;
	page->parent = NULL;
	page->ref = NULL;

	WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));

#ifdef HAVE_DIAGNOSTIC
	{
	WT_HAZARD *hp;
	if ((hp = __wt_page_hazard_check(session, page)) != NULL)
		__wt_errx(session,
		    "discarded page has hazard pointer: (%p: %s, line %d)",
		    hp->page, hp->file, hp->line);
	}
#endif
	/* Update the cache's information. */
	__wt_cache_page_evict(session, page);

	/* Free the page modification information. */
	if (page->modify != NULL)
		__free_page_modify(session, page);

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		break;
	case WT_PAGE_COL_INT:
		__free_page_col_int(session, page);
		break;
	case WT_PAGE_COL_VAR:
		__free_page_col_var(session, page);
		break;
	case WT_PAGE_ROW_INT:
		__free_page_row_int(session, page);
		break;
	case WT_PAGE_ROW_LEAF:
		__free_page_row_leaf(session, page);
		break;
	}

	/* Free any allocated disk image. */
	if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_NOT_ALLOC)) {
		if (page->dsk != NULL)
			__wt_mmap_discard(
			    session, page->dsk, page->dsk->mem_size);
	} else
		__wt_free(session, page->dsk);

	__wt_overwrite_and_free(session, page);
}
Esempio n. 5
0
/*
 * __conn_dhandle_get --
 *	Allocate a new data handle, lock it exclusively, and return it linked
 *	into the connection's list.
 */
static int
__conn_dhandle_get(WT_SESSION_IMPL *session,
    const char *name, const char *ckpt, uint32_t flags)
{
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle;
	WT_DECL_RET;
	uint32_t bucket;

	conn = S2C(session);

	/*
	 * We have the handle lock, check whether we can find the handle we
	 * are looking for.  If we do, and we can lock it in the state we
	 * want, this session will take ownership and we are done.
	 */
	ret = __wt_conn_dhandle_find(session, name, ckpt, flags);
	if (ret == 0) {
		dhandle = session->dhandle;
		WT_RET(__conn_dhandle_open_lock(session, dhandle, flags));
		return (0);
	}
	WT_RET_NOTFOUND_OK(ret);

	/*
	 * If no handle was found, allocate the data handle and a btree handle,
	 * then initialize the data handle.  Exclusively lock the data handle
	 * before inserting it in the list.
	 */
	WT_RET(__wt_calloc_one(session, &dhandle));

	WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle"));

	dhandle->name_hash = __wt_hash_city64(name, strlen(name));
	WT_ERR(__wt_strdup(session, name, &dhandle->name));
	if (ckpt != NULL)
		WT_ERR(__wt_strdup(session, ckpt, &dhandle->checkpoint));

	WT_ERR(__wt_calloc_one(session, &btree));
	dhandle->handle = btree;
	btree->dhandle = dhandle;

	WT_ERR(__wt_spin_init(
	    session, &dhandle->close_lock, "data handle close"));

	F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
	WT_ERR(__wt_writelock(session, dhandle->rwlock));

	/*
	 * Prepend the handle to the connection list, assuming we're likely to
	 * need new files again soon, until they are cached by all sessions.
	 * Find the right hash bucket to insert into as well.
	 */
	WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED));
	bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE;
	WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket);

	session->dhandle = dhandle;
	return (0);

err:	WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock));
	__wt_free(session, dhandle->name);
	__wt_free(session, dhandle->checkpoint);
	__wt_free(session, dhandle->handle);		/* btree free */
	__wt_spin_destroy(session, &dhandle->close_lock);
	__wt_overwrite_and_free(session, dhandle);

	return (ret);
}
Esempio n. 6
0
/*
 * __create_file --
 *	Create a new 'file:' object.
 */
static int
__create_file(WT_SESSION_IMPL *session,
    const char *uri, bool exclusive, const char *config)
{
	WT_DECL_ITEM(val);
	WT_DECL_RET;
	const char *filename, **p, *filecfg[] =
	    { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL };
	char *fileconf;
	uint32_t allocsize;
	bool is_metadata;

	fileconf = NULL;

	is_metadata = strcmp(uri, WT_METAFILE_URI) == 0;

	filename = uri;
	if (!WT_PREFIX_SKIP(filename, "file:"))
		return (__wt_unexpected_object_type(session, uri, "file:"));

	/* Check if the file already exists. */
	if (!is_metadata && (ret =
	    __wt_metadata_search(session, uri, &fileconf)) != WT_NOTFOUND) {
		if (exclusive)
			WT_TRET(EEXIST);
		goto err;
	}

	/* Sanity check the allocation size. */
	WT_ERR(__wt_direct_io_size_check(
	    session, filecfg, "allocation_size", &allocsize));

	/* Create the file. */
	WT_ERR(__wt_block_manager_create(session, filename, allocsize));
	if (WT_META_TRACKING(session))
		WT_ERR(__wt_meta_track_fileop(session, NULL, uri));

	/*
	 * If creating an ordinary file, append the file ID and current version
	 * numbers to the passed-in configuration and insert the resulting
	 * configuration into the metadata.
	 */
	if (!is_metadata) {
		WT_ERR(__wt_scr_alloc(session, 0, &val));
		WT_ERR(__wt_buf_fmt(session, val,
		    "id=%" PRIu32 ",version=(major=%d,minor=%d)",
		    ++S2C(session)->next_file_id,
		    WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));
		for (p = filecfg; *p != NULL; ++p)
			;
		*p = val->data;
		WT_ERR(__wt_config_collapse(session, filecfg, &fileconf));
		WT_ERR(__wt_metadata_insert(session, uri, fileconf));
	}

	/*
	 * Open the file to check that it was setup correctly. We don't need to
	 * pass the configuration, we just wrote the collapsed configuration
	 * into the metadata file, and it's going to be read/used by underlying
	 * functions.
	 *
	 * Keep the handle exclusive until it is released at the end of the
	 * call, otherwise we could race with a drop.
	 */
	WT_ERR(__wt_session_get_btree(
	    session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
	if (WT_META_TRACKING(session))
		WT_ERR(__wt_meta_track_handle_lock(session, true));
	else
		WT_ERR(__wt_session_release_btree(session));

err:	__wt_scr_free(session, &val);
	__wt_free(session, fileconf);
	return (ret);
}
Esempio n. 7
0
/*
 * __ovfl_reuse_wrapup --
 *	Resolve the page's overflow reuse list after a page is written.
 */
static int
__ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BM *bm;
	WT_OVFL_REUSE **e, **head, *reuse;
	size_t decr;
	int i;

	bm = S2BT(session)->bm;
	head = page->modify->ovfl_track->ovfl_reuse;

	/*
	 * Discard any overflow records that aren't in-use, freeing underlying
	 * blocks.
	 *
	 * First, walk the overflow reuse lists (except for the lowest one),
	 * fixing up skiplist links.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
		for (e = &head[i]; (reuse = *e) != NULL;) {
			if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
				e = &reuse->next[i];
				continue;
			}
			*e = reuse->next[i];
		}

	/*
	 * Second, discard any overflow record without an in-use flag, clear
	 * the flags for the next run.
	 *
	 * As part of the pass through the lowest level, figure out how much
	 * space we added/subtracted from the page, and update its footprint.
	 * We don't get it exactly correct because we don't know the depth of
	 * the skiplist here, but it's close enough, and figuring out the
	 * memory footprint change in the reconciliation wrapup code means
	 * fewer atomic updates and less code overall.
	 */
	decr = 0;
	for (e = &head[0]; (reuse = *e) != NULL;) {
		if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
			F_CLR(reuse,
			    WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED);
			e = &reuse->next[0];
			continue;
		}
		*e = reuse->next[0];

		WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED));

		if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
			WT_RET(
			    __ovfl_reuse_verbose(session, page, reuse, "free"));

		WT_RET(bm->free(
		    bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
		decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE);
		__wt_free(session, reuse);
	}

	if (decr != 0)
		__wt_cache_page_inmem_decr(session, page, decr);
	return (0);
}
Esempio n. 8
0
static int
__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll)
{
	WT_BTREE *saved_btree;
	WT_DECL_RET;
	int tret;

	/*
	 * Unlock handles and complete checkpoints regardless of whether we are
	 * unrolling.
	 */
	if (!unroll && trk->op != WT_ST_CHECKPOINT && trk->op != WT_ST_LOCK)
		goto free;

	switch (trk->op) {
	case WT_ST_EMPTY:	/* Unused slot */
		break;
	case WT_ST_CHECKPOINT:	/* Checkpoint, see above */
		saved_btree = session->btree;
		session->btree = trk->btree;
		if (!unroll)
			WT_TRET(__wt_bm_checkpoint_resolve(session));
		session->btree = saved_btree;
		break;
	case WT_ST_LOCK:	/* Handle lock, see above */
		saved_btree = session->btree;
		session->btree = trk->btree;
		if (unroll && trk->created)
			F_SET(session->btree, WT_BTREE_DISCARD);
		WT_TRET(__wt_session_release_btree(session));
		session->btree = saved_btree;
		break;
	case WT_ST_FILEOP:	/* File operation */
		/*
		 * For renames, both a and b are set.
		 * For creates, a is NULL.
		 * For removes, b is NULL.
		 */
		if (trk->a != NULL && trk->b != NULL &&
		    (tret = __wt_rename(session,
		    trk->b + strlen("file:"),
		    trk->a + strlen("file:"))) != 0) {
			__wt_err(session, tret,
			    "metadata unroll rename %s to %s",
			    trk->b, trk->a);
			WT_TRET(tret);
		} else if (trk->a == NULL) {
			if ((tret = __wt_remove(session,
			    trk->b + strlen("file:"))) != 0) {
				__wt_err(session, tret,
				    "metadata unroll create %s",
				    trk->b);
				WT_TRET(tret);
			}
		}
		/*
		 * We can't undo removes yet: that would imply
		 * some kind of temporary rename and remove in
		 * roll forward.
		 */
		break;
	case WT_ST_REMOVE:	/* Remove trk.a */
		if ((tret = __wt_metadata_remove(
		    session, trk->a)) != 0) {
			__wt_err(session, ret,
			    "metadata unroll remove: %s",
			    trk->a);
			WT_TRET(tret);
		}
		break;
	case WT_ST_SET:		/* Set trk.a to trk.b */
		if ((tret = __wt_metadata_update(
		    session, trk->a, trk->b)) != 0) {
			__wt_err(session, ret,
			    "metadata unroll update %s to %s",
			    trk->a, trk->b);
			WT_TRET(tret);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

free:	trk->op = WT_ST_EMPTY;
	__wt_free(session, trk->a);
	__wt_free(session, trk->b);
	trk->btree = NULL;

	return (ret);
}
Esempio n. 9
0
/*
 * __wt_delete_page_rollback --
 *	Abort pages that were deleted without being instantiated.
 */
void
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_UPDATE **upd;
	uint64_t sleep_count, yield_count;

	/*
	 * If the page is still "deleted", it's as we left it, reset the state
	 * to on-disk and we're done.  Otherwise, we expect the page is either
	 * instantiated or being instantiated.  Loop because it's possible for
	 * the page to return to the deleted state if instantiation fails.
	 */
	for (sleep_count = yield_count = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_LOOKASIDE:
		case WT_REF_READING:
			WT_ASSERT(session, 0);		/* Impossible, assert */
			break;
		case WT_REF_DELETED:
			/*
			 * If the page is still "deleted", it's as we left it,
			 * reset the state.
			 */
			if (__wt_atomic_casv32(
			    &ref->state, WT_REF_DELETED, WT_REF_DISK))
				return;
			break;
		case WT_REF_LOCKED:
			/*
			 * A possible state, the page is being instantiated.
			 */
			break;
		case WT_REF_MEM:
		case WT_REF_SPLIT:
			/*
			 * We can't use the normal read path to get a copy of
			 * the page because the session may have closed the
			 * cursor, we no longer have the reference to the tree
			 * required for a hazard pointer.  We're safe because
			 * with unresolved transactions, the page isn't going
			 * anywhere.
			 *
			 * The page is in an in-memory state, walk the list of
			 * update structures and abort them.
			 */
			for (upd =
			    ref->page_del->update_list; *upd != NULL; ++upd)
				(*upd)->txnid = WT_TXN_ABORTED;

			/*
			 * Discard the memory, the transaction can't abort
			 * twice.
			 */
			__wt_free(session, ref->page_del->update_list);
			__wt_free(session, ref->page_del);
			return;
		}
		/*
		 * We wait for the change in page state, yield before retrying,
		 * and if we've yielded enough times, start sleeping so we don't
		 * burn CPU to no purpose.
		 */
		__wt_ref_state_yield_sleep(&yield_count, &sleep_count);
		WT_STAT_CONN_INCRV(session, page_del_rollback_blocked,
		    sleep_count);
	}
}
Esempio n. 10
0
/*
 * __wt_txn_recover --
 *	Run recovery.
 */
int
__wt_txn_recover(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *metac;
	WT_DECL_RET;
	WT_RECOVERY r;
	struct WT_RECOVERY_FILE *metafile;
	char *config;
	int needs_rec, was_backup;

	conn = S2C(session);
	WT_CLEAR(r);
	WT_INIT_LSN(&r.ckpt_lsn);
	was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0;

	/* We need a real session for recovery. */
	WT_RET(__wt_open_session(conn, NULL, NULL, &session));
	F_SET(session, WT_SESSION_NO_LOGGING);
	r.session = session;

	WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
	WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
	WT_ERR(__wt_metadata_cursor(session, NULL, &metac));
	metafile = &r.files[WT_METAFILE_ID];
	metafile->c = metac;

	/*
	 * If no log was found (including if logging is disabled), or if the
	 * last checkpoint was done with logging disabled, recovery should not
	 * run.  Scan the metadata to figure out the largest file ID.
	 */
	if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) ||
	    WT_IS_MAX_LSN(&metafile->ckpt_lsn)) {
		WT_ERR(__recovery_file_scan(&r));
		conn->next_file_id = r.max_fileid;
		goto done;
	}

	/*
	 * First, do a pass through the log to recover the metadata, and
	 * establish the last checkpoint LSN.  Skip this when opening a hot
	 * backup: we already have the correct metadata in that case.
	 */
	if (!was_backup) {
		r.metadata_only = 1;
		if (WT_IS_INIT_LSN(&metafile->ckpt_lsn))
			WT_ERR(__wt_log_scan(session,
			    NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r));
		else {
			/*
			 * Start at the last checkpoint LSN referenced in the
			 * metadata.  If we see the end of a checkpoint while
			 * scanning, we will change the full scan to start from
			 * there.
			 */
			r.ckpt_lsn = metafile->ckpt_lsn;
			WT_ERR(__wt_log_scan(session,
			    &metafile->ckpt_lsn, 0, __txn_log_recover, &r));
		}
	}

	/* Scan the metadata to find the live files and their IDs. */
	WT_ERR(__recovery_file_scan(&r));

	/*
	 * We no longer need the metadata cursor: close it to avoid pinning any
	 * resources that could block eviction during recovery.
	 */
	r.files[0].c = NULL;
	WT_ERR(metac->close(metac));

	/*
	 * Now, recover all the files apart from the metadata.
	 * Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
	 */
	r.metadata_only = 0;
	WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY,
	    "Main recovery loop: starting at %u/%" PRIuMAX,
	    r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset));
	WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec));
	/*
	 * Check if the database was shut down cleanly.  If not
	 * return an error if the user does not want automatic
	 * recovery.
	 */
	if (needs_rec && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR))
		WT_ERR(WT_RUN_RECOVERY);
	/*
	 * Always run recovery even if it was a clean shutdown.
	 * We can consider skipping it in the future.
	 */
	if (WT_IS_INIT_LSN(&r.ckpt_lsn))
		WT_ERR(__wt_log_scan(session, NULL,
		    WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
		    __txn_log_recover, &r));
	else
		WT_ERR(__wt_log_scan(session, &r.ckpt_lsn,
		    WT_LOGSCAN_RECOVER, __txn_log_recover, &r));

	conn->next_file_id = r.max_fileid;

	/*
	 * If recovery ran successfully forcibly log a checkpoint so the next
	 * open is fast and keep the metadata up to date with the checkpoint
	 * LSN and archiving.
	 */
	WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));

done:
err:	WT_TRET(__recovery_free(&r));
	__wt_free(session, config);
	WT_TRET(session->iface.close(&session->iface, NULL));

	return (ret);
}
Esempio n. 11
0
/*
 * __wt_curds_open --
 *	Initialize a data-source cursor.
 */
int
__wt_curds_open(
    WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner,
    const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __curds_compare,			/* compare */
	    __wt_cursor_equals,			/* equals */
	    __curds_next,			/* next */
	    __curds_prev,			/* prev */
	    __curds_reset,			/* reset */
	    __curds_search,			/* search */
	    __curds_search_near,		/* search-near */
	    __curds_insert,			/* insert */
	    __curds_update,			/* update */
	    __curds_remove,			/* remove */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __curds_close);			/* close */
	WT_CONFIG_ITEM cval, metadata;
	WT_CURSOR *cursor, *source;
	WT_CURSOR_DATA_SOURCE *data_source;
	WT_DECL_RET;
	char *metaconf;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_DATA_SOURCE, iface) == 0);

	data_source = NULL;
	metaconf = NULL;

	WT_RET(__wt_calloc_one(session, &data_source));
	cursor = &data_source->iface;
	*cursor = iface;
	cursor->session = &session->iface;

	/*
	 * XXX
	 * The underlying data-source may require the object's key and value
	 * formats.  This isn't a particularly elegant way of getting that
	 * information to the data-source, this feels like a layering problem
	 * to me.
	 */
	WT_ERR(__wt_metadata_search(session, uri, &metaconf));
	WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format));
	WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval));
	WT_ERR(
	    __wt_strndup(session, cval.str, cval.len, &cursor->value_format));

	WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));

	/* Data-source cursors may have a custom collator. */
	WT_ERR(
	    __wt_config_getones(session, metaconf, "app_metadata", &metadata));
	WT_ERR(__wt_config_gets_none(session, cfg, "collator", &cval));
	if (cval.len != 0)
		WT_ERR(__wt_collator_config(session, uri, &cval, &metadata,
		    &data_source->collator, &data_source->collator_owned));

	WT_ERR(dsrc->open_cursor(dsrc,
	    &session->iface, uri, (WT_CONFIG_ARG *)cfg, &data_source->source));
	source = data_source->source;
	source->session = (WT_SESSION *)session;
	memset(&source->q, 0, sizeof(source->q));
	source->recno = WT_RECNO_OOB;
	memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf));
	memset(&source->key, 0, sizeof(source->key));
	memset(&source->value, 0, sizeof(source->value));
	source->saved_err = 0;
	source->flags = 0;

	if (0) {
err:		WT_TRET(__curds_close(cursor));
		*cursorp = NULL;
	}

	__wt_free(session, metaconf);
	return (ret);
}
Esempio n. 12
0
/*
 * __wt_logmgr_destroy --
 *	Destroy the log archiving server thread and logging subsystem.
 */
int
__wt_logmgr_destroy(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION *wt_session;

	conn = S2C(session);

	if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) {
		/*
		 * We always set up the log_path so printlog can work without
		 * recovery. Therefore, always free it, even if logging isn't
		 * on.
		 */
		__wt_free(session, conn->log_path);
		return (0);
	}
	if (conn->log_tid_set) {
		WT_TRET(__wt_cond_auto_signal(session, conn->log_cond));
		WT_TRET(__wt_thread_join(session, conn->log_tid));
		conn->log_tid_set = false;
	}
	if (conn->log_file_tid_set) {
		WT_TRET(__wt_cond_signal(session, conn->log_file_cond));
		WT_TRET(__wt_thread_join(session, conn->log_file_tid));
		conn->log_file_tid_set = false;
	}
	if (conn->log_file_session != NULL) {
		wt_session = &conn->log_file_session->iface;
		WT_TRET(wt_session->close(wt_session, NULL));
		conn->log_file_session = NULL;
	}
	if (conn->log_wrlsn_tid_set) {
		WT_TRET(__wt_cond_auto_signal(session, conn->log_wrlsn_cond));
		WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid));
		conn->log_wrlsn_tid_set = false;
	}
	if (conn->log_wrlsn_session != NULL) {
		wt_session = &conn->log_wrlsn_session->iface;
		WT_TRET(wt_session->close(wt_session, NULL));
		conn->log_wrlsn_session = NULL;
	}

	WT_TRET(__wt_log_slot_destroy(session));
	WT_TRET(__wt_log_close(session));

	/* Close the server thread's session. */
	if (conn->log_session != NULL) {
		wt_session = &conn->log_session->iface;
		WT_TRET(wt_session->close(wt_session, NULL));
		conn->log_session = NULL;
	}

	/* Destroy the condition variables now that all threads are stopped */
	WT_TRET(__wt_cond_auto_destroy(session, &conn->log_cond));
	WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
	WT_TRET(__wt_cond_auto_destroy(session, &conn->log_wrlsn_cond));

	WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
	WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond));
	WT_TRET(__wt_rwlock_destroy(session, &conn->log->log_archive_lock));
	__wt_spin_destroy(session, &conn->log->log_lock);
	__wt_spin_destroy(session, &conn->log->log_slot_lock);
	__wt_spin_destroy(session, &conn->log->log_sync_lock);
	__wt_spin_destroy(session, &conn->log->log_writelsn_lock);
	__wt_free(session, conn->log_path);
	__wt_free(session, conn->log);
	return (ret);
}
Esempio n. 13
0
/*
 * __split_ref_deepen_move --
 *	Move a WT_REF from a parent to a child in service of a split to deepen
 * the tree, including updating the accounting information.
 */
static int
__split_ref_deepen_move(WT_SESSION_IMPL *session,
    WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
{
	WT_ADDR *addr;
	WT_CELL_UNPACK unpack;
	WT_DECL_RET;
	WT_IKEY *ikey;
	size_t size;
	void *key;

	/*
	 * Instantiate row-store keys, and column- and row-store addresses in
	 * the WT_REF structures referenced by a page that's being split (and
	 * deepening the tree).  The WT_REF structures aren't moving, but the
	 * index references are moving from the page we're splitting to a set
	 * of child pages, and so we can no longer reference the block image
	 * that remains with the page being split.
	 *
	 * No locking is required to update the WT_REF structure because we're
	 * the only thread splitting the parent page, and there's no way for
	 * readers to race with our updates of single pointers.  The changes
	 * have to be written before the page goes away, of course, our caller
	 * owns that problem.
	 *
	 * Row-store keys, first.
	 */
	if (parent->type == WT_PAGE_ROW_INT) {
		if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) {
			__wt_ref_key(parent, ref, &key, &size);
			WT_RET(__wt_row_ikey(session, 0, key, size, ref));
			ikey = ref->key.ikey;
		} else {
			WT_RET(__split_ovfl_key_cleanup(session, parent, ref));
			*parent_decrp += sizeof(WT_IKEY) + ikey->size;
		}
		*child_incrp += sizeof(WT_IKEY) + ikey->size;
	}

	/*
	 * If there's no address (the page has never been written), or the
	 * address has been instantiated, there's no work to do.  Otherwise,
	 * get the address from the on-page cell.
	 */
	addr = ref->addr;
	if (addr != NULL && !__wt_off_page(parent, addr)) {
		__wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
		WT_RET(__wt_calloc_one(session, &addr));
		if ((ret = __wt_strndup(
		    session, unpack.data, unpack.size, &addr->addr)) != 0) {
			__wt_free(session, addr);
			return (ret);
		}
		addr->size = (uint8_t)unpack.size;
		addr->type =
		    unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF;
		ref->addr = addr;
	}

	/* And finally, the WT_REF itself. */
	WT_MEM_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF));

	return (0);
}
Esempio n. 14
0
/*
 * __wt_realloc_aligned --
 *	ANSI realloc function that aligns to buffer boundaries, configured with
 *	the "buffer_alignment" key to wiredtiger_open.
 */
int
__wt_realloc_aligned(WT_SESSION_IMPL *session,
    size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp)
{
#if defined(HAVE_POSIX_MEMALIGN)
	WT_DECL_RET;

	/*
	 * !!!
	 * This function MUST handle a NULL WT_SESSION_IMPL handle.
	 */
	if (session != NULL && S2C(session)->buffer_alignment > 0) {
		void *p, *newp;
		size_t bytes_allocated;

		/*
		 * Sometimes we're allocating memory and we don't care about the
		 * final length -- bytes_allocated_ret may be NULL.
		 */
		p = *(void **)retp;
		bytes_allocated =
		    (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
		WT_ASSERT(session,
		    (p == NULL && bytes_allocated == 0) ||
		    (p != NULL &&
		    (bytes_allocated_ret == NULL || bytes_allocated != 0)));
		WT_ASSERT(session, bytes_to_allocate != 0);
		WT_ASSERT(session, bytes_allocated < bytes_to_allocate);

		if (session != NULL)
			WT_STAT_FAST_CONN_INCR(session, memory_allocation);

		if ((ret = posix_memalign(&newp,
		    S2C(session)->buffer_alignment,
		    bytes_to_allocate)) != 0)
			WT_RET_MSG(session, ret, "memory allocation");

		if (p != NULL)
			memcpy(newp, p, bytes_allocated);
		__wt_free(session, p);
		p = newp;

		/* Clear the allocated memory (see above). */
		memset((uint8_t *)p + bytes_allocated, 0,
		    bytes_to_allocate - bytes_allocated);

		/* Update caller's bytes allocated value. */
		if (bytes_allocated_ret != NULL)
			*bytes_allocated_ret = bytes_to_allocate;

		*(void **)retp = p;
		return (0);
	}
#endif
	/*
	 * If there is no posix_memalign function, or no alignment configured,
	 * fall back to realloc.
	 */
	return (__wt_realloc(
	    session, bytes_allocated_ret, bytes_to_allocate, retp));
}
Esempio n. 15
0
/*
 * __create_colgroup --
 *	Create a column group.
 */
static int
__create_colgroup(WT_SESSION_IMPL *session,
    const char *name, bool exclusive, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	WT_ITEM confbuf, fmt, namebuf;
	WT_TABLE *table;
	size_t tlen;
	const char **cfgp, *cfg[4] =
	    { WT_CONFIG_BASE(session, colgroup_meta), config, NULL, NULL };
	const char *sourcecfg[] = { config, NULL, NULL };
	const char *cgname, *source, *sourceconf, *tablename;
	char *cgconf, *origconf;
	bool exists;

	sourceconf = NULL;
	cgconf = origconf = NULL;
	WT_CLEAR(fmt);
	WT_CLEAR(confbuf);
	WT_CLEAR(namebuf);
	exists = false;

	tablename = name;
	if (!WT_PREFIX_SKIP(tablename, "colgroup:"))
		return (
		    __wt_unexpected_object_type(session, name, "colgroup:"));
	cgname = strchr(tablename, ':');
	if (cgname != NULL) {
		tlen = (size_t)(cgname - tablename);
		++cgname;
	} else
		tlen = strlen(tablename);

	if ((ret =
	    __wt_schema_get_table(session, tablename, tlen, true, &table)) != 0)
		WT_RET_MSG(session, (ret == WT_NOTFOUND) ? ENOENT : ret,
		    "Can't create '%s' for non-existent table '%.*s'",
		    name, (int)tlen, tablename);

	/* Make sure the column group is referenced from the table. */
	if (cgname != NULL && (ret =
	    __wt_config_subgets(session, &table->cgconf, cgname, &cval)) != 0)
		WT_ERR_MSG(session, EINVAL,
		    "Column group '%s' not found in table '%.*s'",
		    cgname, (int)tlen, tablename);

	/* Check if the column group already exists. */
	if ((ret = __wt_metadata_search(session, name, &origconf)) == 0) {
		if (exclusive)
			WT_ERR(EEXIST);
		exists = true;
	}
	WT_ERR_NOTFOUND_OK(ret);

	/* Find the first NULL entry in the cfg stack. */
	for (cfgp = &cfg[1]; *cfgp; cfgp++)
		;

	/* Add the source to the colgroup config before collapsing. */
	if (__wt_config_getones(
	    session, config, "source", &cval) == 0 && cval.len != 0) {
		WT_ERR(__wt_buf_fmt(
		    session, &namebuf, "%.*s", (int)cval.len, cval.str));
		source = namebuf.data;
	} else {
		WT_ERR(__wt_schema_colgroup_source(
		    session, table, cgname, config, &namebuf));
		source = namebuf.data;
		WT_ERR(__wt_buf_fmt(
		    session, &confbuf, "source=\"%s\"", source));
		*cfgp++ = confbuf.data;
	}

	/* Calculate the key/value formats: these go into the source config. */
	WT_ERR(__wt_buf_fmt(session, &fmt, "key_format=%s", table->key_format));
	if (cgname == NULL)
		WT_ERR(__wt_buf_catfmt
		    (session, &fmt, ",value_format=%s", table->value_format));
	else {
		if (__wt_config_getones(session, config, "columns", &cval) != 0)
			WT_ERR_MSG(session, EINVAL,
			    "No 'columns' configuration for '%s'", name);
		WT_ERR(__wt_buf_catfmt(session, &fmt, ",value_format="));
		WT_ERR(__wt_struct_reformat(session,
		    table, cval.str, cval.len, NULL, true, &fmt));
	}
	sourcecfg[1] = fmt.data;
	WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf));
	WT_ERR(__wt_schema_create(session, source, sourceconf));

	WT_ERR(__wt_config_collapse(session, cfg, &cgconf));

	if (!exists) {
		WT_ERR(__wt_metadata_insert(session, name, cgconf));
		WT_ERR(__wt_schema_open_colgroups(session, table));
	}

err:	__wt_free(session, cgconf);
	__wt_free(session, sourceconf);
	__wt_free(session, origconf);
	__wt_buf_free(session, &confbuf);
	__wt_buf_free(session, &fmt);
	__wt_buf_free(session, &namebuf);

	__wt_schema_release_table(session, table);
	return (ret);
}
Esempio n. 16
0
/*
 * __wt_delete_page_instantiate --
 *	Instantiate an entirely deleted row-store leaf page.
 */
int
__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_DELETED *page_del;
	WT_UPDATE **upd_array, *upd;
	size_t size;
	uint32_t i;

	btree = S2BT(session);
	page = ref->page;
	page_del = ref->page_del;

	/*
	 * Give the page a modify structure.
	 *
	 * If the tree is already dirty and so will be written, mark the page
	 * dirty.  (We'd like to free the deleted pages, but if the handle is
	 * read-only or if the application never modifies the tree, we're not
	 * able to do so.)
	 */
	WT_RET(__wt_page_modify_init(session, page));
	if (btree->modified)
		__wt_page_modify_set(session, page);

	/*
	 * An operation is accessing a "deleted" page, and we're building an
	 * in-memory version of the page (making it look like all entries in
	 * the page were individually updated by a remove operation).  There
	 * are two cases where we end up here:
	 *
	 * First, a running transaction used a truncate call to delete the page
	 * without reading it, in which case the page reference includes a
	 * structure with a transaction ID; the page we're building might split
	 * in the future, so we update that structure to include references to
	 * all of the update structures we create, so the transaction can abort.
	 *
	 * Second, a truncate call deleted a page and the truncate committed,
	 * but an older transaction in the system forced us to keep the old
	 * version of the page around, then we crashed and recovered, and now
	 * we're being forced to read that page.
	 *
	 * In the first case, we have a page reference structure, in the second
	 * second, we don't.
	 *
	 * Allocate the per-reference update array; in the case of instantiating
	 * a page, deleted by a running transaction that might eventually abort,
	 * we need a list of the update structures so we can do that abort.  The
	 * hard case is if a page splits: the update structures might be moved
	 * to different pages, and we still have to find them all for an abort.
	 */
	if (page_del != NULL)
		WT_RET(__wt_calloc_def(
		    session, page->entries + 1, &page_del->update_list));

	/* Allocate the per-page update array. */
	WT_ERR(__wt_calloc_def(session, page->entries, &upd_array));
	page->modify->mod_row_update = upd_array;

	/*
	 * Fill in the per-reference update array with references to update
	 * structures, fill in the per-page update array with references to
	 * deleted items.
	 */
	for (i = 0, size = 0; i < page->entries; ++i) {
		WT_ERR(__wt_calloc_one(session, &upd));
		upd->type = WT_UPDATE_DELETED;

		if (page_del == NULL)
			upd->txnid = WT_TXN_NONE;	/* Globally visible */
		else {
			upd->txnid = page_del->txnid;
			page_del->update_list[i] = upd;
		}

		upd->next = upd_array[i];
		upd_array[i] = upd;

		size += sizeof(WT_UPDATE *) + WT_UPDATE_MEMSIZE(upd);
	}

	__wt_cache_page_inmem_incr(session, page, size);

	return (0);

err:	/*
	 * There's no need to free the page update structures on error, our
	 * caller will discard the page and do that work for us.  We could
	 * similarly leave the per-reference update array alone because it
	 * won't ever be used by any page that's not in-memory, but cleaning
	 * it up makes sense, especially if we come back in to this function
	 * attempting to instantiate this page again.
	 */
	if (page_del != NULL)
		__wt_free(session, page_del->update_list);
	return (ret);
}
Esempio n. 17
0
/*
 * __create_index --
 *	Create an index.
 */
static int
__create_index(WT_SESSION_IMPL *session,
    const char *name, bool exclusive, const char *config)
{
	WT_CONFIG kcols, pkcols;
	WT_CONFIG_ITEM ckey, cval, icols, kval;
	WT_DECL_PACK_VALUE(pv);
	WT_DECL_RET;
	WT_INDEX *idx;
	WT_ITEM confbuf, extra_cols, fmt, namebuf;
	WT_PACK pack;
	WT_TABLE *table;
	const char *cfg[4] =
	    { WT_CONFIG_BASE(session, index_meta), NULL, NULL, NULL };
	const char *sourcecfg[] = { config, NULL, NULL };
	const char *source, *sourceconf, *idxname, *tablename;
	char *idxconf, *origconf;
	size_t tlen;
	bool exists, have_extractor;
	u_int i, npublic_cols;

	sourceconf = NULL;
	idxconf = origconf = NULL;
	WT_CLEAR(confbuf);
	WT_CLEAR(fmt);
	WT_CLEAR(extra_cols);
	WT_CLEAR(namebuf);
	exists = have_extractor = false;

	tablename = name;
	if (!WT_PREFIX_SKIP(tablename, "index:"))
		return (__wt_unexpected_object_type(session, name, "index:"));
	idxname = strchr(tablename, ':');
	if (idxname == NULL)
		WT_RET_MSG(session, EINVAL, "Invalid index name, "
		    "should be <table name>:<index name>: %s", name);

	tlen = (size_t)(idxname++ - tablename);
	if ((ret =
	    __wt_schema_get_table(session, tablename, tlen, true, &table)) != 0)
		WT_RET_MSG(session, ret,
		    "Can't create an index for a non-existent table: %.*s",
		    (int)tlen, tablename);

	if (table->is_simple)
		WT_ERR_MSG(session, EINVAL,
		    "%s requires a table with named columns", name);

	/* Check if the index already exists. */
	if ((ret = __wt_metadata_search(session, name, &origconf)) == 0) {
		if (exclusive)
			WT_ERR(EEXIST);
		exists = true;
	}
	WT_ERR_NOTFOUND_OK(ret);

	if (__wt_config_getones(session, config, "source", &cval) == 0) {
		WT_ERR(__wt_buf_fmt(session, &namebuf,
		    "%.*s", (int)cval.len, cval.str));
		source = namebuf.data;
	} else {
		WT_ERR(__wt_schema_index_source(
		    session, table, idxname, config, &namebuf));
		source = namebuf.data;

		/* Add the source name to the index config before collapsing. */
		WT_ERR(__wt_buf_catfmt(session, &confbuf,
		    ",source=\"%s\"", source));
	}

	if (__wt_config_getones_none(
	    session, config, "extractor", &cval) == 0 && cval.len != 0) {
		have_extractor = true;
		/* Custom extractors must supply a key format. */
		if ((ret = __wt_config_getones(
		    session, config, "key_format", &kval)) != 0)
			WT_ERR_MSG(session, EINVAL,
			    "%s: custom extractors require a key_format", name);
	}

	/* Calculate the key/value formats. */
	WT_CLEAR(icols);
	if (__wt_config_getones(session, config, "columns", &icols) != 0 &&
	    !have_extractor)
		WT_ERR_MSG(session, EINVAL,
		    "%s: requires 'columns' configuration", name);

	/*
	 * Count the public columns using the declared columns for normal
	 * indices or the key format for custom extractors.
	 */
	npublic_cols = 0;
	if (!have_extractor) {
		__wt_config_subinit(session, &kcols, &icols);
		while ((ret = __wt_config_next(&kcols, &ckey, &cval)) == 0)
			++npublic_cols;
		WT_ERR_NOTFOUND_OK(ret);
	} else {
		WT_ERR(__pack_initn(session, &pack, kval.str, kval.len));
		while ((ret = __pack_next(&pack, &pv)) == 0)
			++npublic_cols;
		WT_ERR_NOTFOUND_OK(ret);
	}

	/*
	 * The key format for an index is somewhat subtle: the application
	 * specifies a set of columns that it will use for the key, but the
	 * engine usually adds some hidden columns in order to derive the
	 * primary key.  These hidden columns are part of the source's
	 * key_format, which we are calculating now, but not part of an index
	 * cursor's key_format.
	 */
	__wt_config_subinit(session, &pkcols, &table->colconf);
	for (i = 0; i < table->nkey_columns &&
	    (ret = __wt_config_next(&pkcols, &ckey, &cval)) == 0;
	    i++) {
		/*
		 * If the primary key column is already in the secondary key,
		 * don't add it again.
		 */
		if (__wt_config_subgetraw(session, &icols, &ckey, &cval) == 0) {
			if (have_extractor)
				WT_ERR_MSG(session, EINVAL,
				    "an index with a custom extractor may not "
				    "include primary key columns");
			continue;
		}
		WT_ERR(__wt_buf_catfmt(
		    session, &extra_cols, "%.*s,", (int)ckey.len, ckey.str));
	}
	WT_ERR_NOTFOUND_OK(ret);

	/* Index values are empty: all columns are packed into the index key. */
	WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,key_format="));

	if (have_extractor) {
		WT_ERR(__wt_buf_catfmt(session, &fmt, "%.*s",
		    (int)kval.len, kval.str));
		WT_CLEAR(icols);
	}

	/*
	 * Construct the index key format, or append the primary key columns
	 * for custom extractors.
	 */
	WT_ERR(__wt_struct_reformat(session, table,
	    icols.str, icols.len, (const char *)extra_cols.data, false, &fmt));

	/* Check for a record number index key, which makes no sense. */
	WT_ERR(__wt_config_getones(session, fmt.data, "key_format", &cval));
	if (cval.len == 1 && cval.str[0] == 'r')
		WT_ERR_MSG(session, EINVAL,
		    "column-store index may not use the record number as its "
		    "index key");

	WT_ERR(__wt_buf_catfmt(
	    session, &fmt, ",index_key_columns=%u", npublic_cols));

	sourcecfg[1] = fmt.data;
	WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf));

	WT_ERR(__wt_schema_create(session, source, sourceconf));

	cfg[1] = sourceconf;
	cfg[2] = confbuf.data;
	WT_ERR(__wt_config_collapse(session, cfg, &idxconf));

	if (!exists) {
		WT_ERR(__wt_metadata_insert(session, name, idxconf));

		/* Make sure that the configuration is valid. */
		WT_ERR(__wt_schema_open_index(
		    session, table, idxname, strlen(idxname), &idx));

		/* If there is data in the table, fill the index. */
		WT_ERR(__fill_index(session, table, idx));
	}

err:	__wt_free(session, idxconf);
	__wt_free(session, origconf);
	__wt_free(session, sourceconf);
	__wt_buf_free(session, &confbuf);
	__wt_buf_free(session, &extra_cols);
	__wt_buf_free(session, &fmt);
	__wt_buf_free(session, &namebuf);

	__wt_schema_release_table(session, table);
	return (ret);
}
Esempio n. 18
0
/*
 * __wt_delete_page --
 *	If deleting a range, try to delete the page without instantiating it.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_DECL_RET;
	WT_PAGE *parent;

	*skipp = false;

	/* If we have a clean page in memory, attempt to evict it. */
	if (ref->state == WT_REF_MEM &&
	    __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
		if (__wt_page_is_modified(ref->page)) {
			WT_PUBLISH(ref->state, WT_REF_MEM);
			return (0);
		}

		(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
		ret = __wt_evict(session, ref, false);
		(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
		WT_RET_BUSY_OK(ret);
	}

	/*
	 * Atomically switch the page's state to lock it.  If the page is not
	 * on-disk, other threads may be using it, no fast delete.
	 */
	if (ref->state != WT_REF_DISK ||
	    !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * We cannot fast-delete pages that have overflow key/value items as
	 * the overflow blocks have to be discarded.  The way we figure that
	 * out is to check the page's cell type, cells for leaf pages without
	 * overflow items are special.
	 *
	 * To look at an on-page cell, we need to look at the parent page, and
	 * that's dangerous, our parent page could change without warning if
	 * the parent page were to split, deepening the tree.  It's safe: the
	 * page's reference will always point to some valid page, and if we find
	 * any problems we simply fail the fast-delete optimization.
	 */
	parent = ref->home;
	if (__wt_off_page(parent, ref->addr) ?
	    ((WT_ADDR *)ref->addr)->type != WT_ADDR_LEAF_NO :
	    __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
		goto err;

	/*
	 * This action dirties the parent page: mark it dirty now, there's no
	 * future reconciliation of the child leaf page that will dirty it as
	 * we write the tree.
	 */
	WT_ERR(__wt_page_parent_modify_set(session, ref, false));

	/*
	 * Record the change in the transaction structure and set the change's
	 * transaction ID.
	 */
	WT_ERR(__wt_calloc_one(session, &ref->page_del));
	ref->page_del->txnid = session->txn.id;

	WT_ERR(__wt_txn_modify_ref(session, ref));

	*skipp = true;
	WT_STAT_CONN_INCR(session, rec_page_delete_fast);
	WT_STAT_DATA_INCR(session, rec_page_delete_fast);
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (0);

err:	__wt_free(session, ref->page_del);

	/*
	 * Restore the page to on-disk status, we'll have to instantiate it.
	 */
	WT_PUBLISH(ref->state, WT_REF_DISK);
	return (ret);
}
Esempio n. 19
0
/*
 * __create_table --
 *	Create a table.
 */
static int
__create_table(WT_SESSION_IMPL *session,
    const char *name, bool exclusive, const char *config)
{
	WT_CONFIG conf;
	WT_CONFIG_ITEM cgkey, cgval, cval;
	WT_DECL_RET;
	WT_TABLE *table;
	const char *cfg[4] =
	    { WT_CONFIG_BASE(session, table_meta), config, NULL, NULL };
	const char *tablename;
	char *tableconf, *cgname;
	size_t cgsize;
	int ncolgroups;
	bool exists;

	cgname = NULL;
	table = NULL;
	tableconf = NULL;
	exists = false;

	tablename = name;
	if (!WT_PREFIX_SKIP(tablename, "table:"))
		return (__wt_unexpected_object_type(session, name, "table:"));

	if ((ret = __wt_schema_get_table(session,
	    tablename, strlen(tablename), false, &table)) == 0) {
		if (exclusive)
			WT_ERR(EEXIST);
		exists = true;
	}
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_config_gets(session, cfg, "colgroups", &cval));
	__wt_config_subinit(session, &conf, &cval);
	for (ncolgroups = 0;
	    (ret = __wt_config_next(&conf, &cgkey, &cgval)) == 0;
	    ncolgroups++)
		;
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_config_collapse(session, cfg, &tableconf));

	if (!exists) {
		WT_ERR(__wt_metadata_insert(session, name, tableconf));

		/* Attempt to open the table now to catch any errors. */
		WT_ERR(__wt_schema_get_table(
		    session, tablename, strlen(tablename), true, &table));

		if (ncolgroups == 0) {
			cgsize = strlen("colgroup:") + strlen(tablename) + 1;
			WT_ERR(__wt_calloc_def(session, cgsize, &cgname));
			snprintf(cgname, cgsize, "colgroup:%s", tablename);
			WT_ERR(__create_colgroup(
			    session, cgname, exclusive, config));
		}
	}

	if (0) {
err:		if (table != NULL) {
			WT_TRET(__wt_schema_remove_table(session, table));
			table = NULL;
		}
	}
	if (table != NULL)
		__wt_schema_release_table(session, table);
	__wt_free(session, cgname);
	__wt_free(session, tableconf);
	return (ret);
}
Esempio n. 20
0
/*
 * __wt_curlog_open --
 *	Initialize a log cursor.
 */
int
__wt_curlog_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,	/* get-key */
	    __wt_cursor_get_value,	/* get-value */
	    __wt_cursor_set_key,	/* set-key */
	    __wt_cursor_set_value,	/* set-value */
	    __curlog_compare,		/* compare */
	    __wt_cursor_equals,		/* equals */
	    __curlog_next,		/* next */
	    __wt_cursor_notsup,		/* prev */
	    __curlog_reset,		/* reset */
	    __curlog_search,		/* search */
	    __wt_cursor_notsup,		/* search-near */
	    __wt_cursor_notsup,		/* insert */
	    __wt_cursor_notsup,		/* update */
	    __wt_cursor_notsup,		/* remove */
	    __wt_cursor_notsup,		/* reconfigure */
	    __curlog_close);		/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_LOG *cl;
	WT_DECL_RET;
	WT_LOG *log;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_LOG, iface) == 0);
	conn = S2C(session);
	if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
		WT_RET_MSG(session, EINVAL,
		    "Cannot open a log cursor without logging enabled");

	log = conn->log;
	cl = NULL;
	WT_RET(__wt_calloc_one(session, &cl));
	cursor = &cl->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	WT_ERR(__wt_calloc_one(session, &cl->cur_lsn));
	WT_ERR(__wt_calloc_one(session, &cl->next_lsn));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->logrec));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->opkey));
	WT_ERR(__wt_scr_alloc(session, 0, &cl->opvalue));
	cursor->key_format = LOGC_KEY_FORMAT;
	cursor->value_format = LOGC_VALUE_FORMAT;

	WT_INIT_LSN(cl->cur_lsn);
	WT_INIT_LSN(cl->next_lsn);

	WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));

	/* Log cursors block archiving. */
	WT_ERR(__wt_readlock(session, log->log_archive_lock));

	if (0) {
err:		if (F_ISSET(cursor, WT_CURSTD_OPEN))
			WT_TRET(cursor->close(cursor));
		else {
			__wt_free(session, cl->cur_lsn);
			__wt_free(session, cl->next_lsn);
			__wt_scr_free(session, &cl->logrec);
			__wt_scr_free(session, &cl->opkey);
			__wt_scr_free(session, &cl->opvalue);
			/*
			 * NOTE:  We cannot get on the error path with the
			 * readlock held.  No need to unlock it unless that
			 * changes above.
			 */
			__wt_free(session, cl);
		}
		*cursorp = NULL;
	}

	return (ret);
}
Esempio n. 21
0
/*
 * __free_page_col_var --
 *	Discard a WT_PAGE_COL_VAR page.
 */
static void
__free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	/* Free the RLE lookup array. */
	__wt_free(session, page->u.col_var.repeats);
}
Esempio n. 22
0
/*
 * __wt_dirlist --
 *	Get a list of files from a directory, optionally filtered by
 *	a given prefix.
 */
int
__wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix,
    uint32_t flags, char ***dirlist, u_int *countp)
{
	struct dirent *dp;
	DIR *dirp;
	WT_DECL_RET;
	size_t dirallocsz;
	u_int count, dirsz;
	bool match;
	char **entries, *path;

	*dirlist = NULL;
	*countp = 0;

	WT_RET(__wt_filename(session, dir, &path));

	dirp = NULL;
	dirallocsz = 0;
	dirsz = 0;
	entries = NULL;
	if (flags == 0)
		LF_SET(WT_DIRLIST_INCLUDE);

	WT_ERR(__wt_verbose(session, WT_VERB_FILEOPS,
	    "wt_dirlist of %s %s prefix %s",
	    path, LF_ISSET(WT_DIRLIST_INCLUDE) ? "include" : "exclude",
	    prefix == NULL ? "all" : prefix));

	WT_SYSCALL_RETRY(((dirp = opendir(path)) == NULL ? 1 : 0), ret);
	if (ret != 0)
		WT_ERR_MSG(session, ret, "%s: opendir", path);
	for (dirsz = 0, count = 0; (dp = readdir(dirp)) != NULL;) {
		/*
		 * Skip . and ..
		 */
		if (strcmp(dp->d_name, ".") == 0 ||
		    strcmp(dp->d_name, "..") == 0)
			continue;
		match = false;
		if (prefix != NULL &&
		    ((LF_ISSET(WT_DIRLIST_INCLUDE) &&
		    WT_PREFIX_MATCH(dp->d_name, prefix)) ||
		    (LF_ISSET(WT_DIRLIST_EXCLUDE) &&
		    !WT_PREFIX_MATCH(dp->d_name, prefix))))
			match = true;
		if (prefix == NULL || match) {
			/*
			 * We have a file name we want to return.
			 */
			count++;
			if (count > dirsz) {
				dirsz += WT_DIR_ENTRY;
				WT_ERR(__wt_realloc_def(
				    session, &dirallocsz, dirsz, &entries));
			}
			WT_ERR(__wt_strdup(
			    session, dp->d_name, &entries[count-1]));
		}
	}
	if (count > 0)
		*dirlist = entries;
	*countp = count;
err:
	if (dirp != NULL)
		(void)closedir(dirp);
	__wt_free(session, path);

	if (ret == 0)
		return (0);

	if (*dirlist != NULL) {
		for (count = dirsz; count > 0; count--)
			__wt_free(session, entries[count]);
		__wt_free(session, entries);
	}
	WT_RET_MSG(session, ret, "dirlist %s prefix %s", dir, prefix);
}
Esempio n. 23
0
/*
 * __wt_txn_recover --
 *	Run recovery.
 */
int
__wt_txn_recover(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *metac;
	WT_DECL_RET;
	WT_RECOVERY r;
	WT_RECOVERY_FILE *metafile;
	char *config;
	bool do_checkpoint, eviction_started, needs_rec, was_backup;

	conn = S2C(session);
	WT_CLEAR(r);
	WT_INIT_LSN(&r.ckpt_lsn);
	config = NULL;
	do_checkpoint = true;
	eviction_started = false;
	was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP);

	/* We need a real session for recovery. */
	WT_RET(__wt_open_internal_session(conn, "txn-recover",
	    false, WT_SESSION_NO_LOGGING, &session));
	r.session = session;
	WT_MAX_LSN(&r.max_ckpt_lsn);
	WT_MAX_LSN(&r.max_rec_lsn);
	conn->txn_global.recovery_timestamp =
	    conn->txn_global.meta_ckpt_timestamp = 0;

	F_SET(conn, WT_CONN_RECOVERING);
	WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
	WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
	WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac));
	metafile = &r.files[WT_METAFILE_ID];
	metafile->c = metac;

	/*
	 * If no log was found (including if logging is disabled), or if the
	 * last checkpoint was done with logging disabled, recovery should not
	 * run.  Scan the metadata to figure out the largest file ID.
	 */
	if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_EXISTED) ||
	    WT_IS_MAX_LSN(&metafile->ckpt_lsn)) {
		/*
		 * Detect if we're going from logging disabled to enabled.
		 * We need to know this to verify LSNs and start at the correct
		 * log file later.  If someone ran with logging, then disabled
		 * it and removed all the log files and then turned logging back
		 * on, we have to start logging in the log file number that is
		 * larger than any checkpoint LSN we have from the earlier time.
		 */
		WT_ERR(__recovery_file_scan(&r));
		/*
		 * The array can be re-allocated in recovery_file_scan.  Reset
		 * our pointer after scanning all the files.
		 */
		metafile = &r.files[WT_METAFILE_ID];
		conn->next_file_id = r.max_fileid;

		if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
		    WT_IS_MAX_LSN(&metafile->ckpt_lsn) &&
		    !WT_IS_MAX_LSN(&r.max_ckpt_lsn))
			WT_ERR(__wt_log_reset(session, r.max_ckpt_lsn.l.file));
		else
			do_checkpoint = false;
		goto done;
	}

	/*
	 * First, do a pass through the log to recover the metadata, and
	 * establish the last checkpoint LSN.  Skip this when opening a hot
	 * backup: we already have the correct metadata in that case.
	 *
	 * If we're running with salvage and we hit an error, we ignore it
	 * and continue. In salvage we want to recover whatever part of the
	 * data we can from the last checkpoint up until whatever problem we
	 * detect in the log file. In salvage, we ignore errors from scanning
	 * the log so recovery can continue. Other errors remain errors.
	 */
	if (!was_backup) {
		r.metadata_only = true;
		/*
		 * If this is a read-only connection, check if the checkpoint
		 * LSN in the metadata file is up to date, indicating a clean
		 * shutdown.
		 */
		if (F_ISSET(conn, WT_CONN_READONLY)) {
			WT_ERR(__wt_log_needs_recovery(
			    session, &metafile->ckpt_lsn, &needs_rec));
			if (needs_rec)
				WT_ERR_MSG(session, WT_RUN_RECOVERY,
				    "Read-only database needs recovery");
		}
		if (WT_IS_INIT_LSN(&metafile->ckpt_lsn))
			ret = __wt_log_scan(session,
			    NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r);
		else {
			/*
			 * Start at the last checkpoint LSN referenced in the
			 * metadata.  If we see the end of a checkpoint while
			 * scanning, we will change the full scan to start from
			 * there.
			 */
			r.ckpt_lsn = metafile->ckpt_lsn;
			ret = __wt_log_scan(session,
			    &metafile->ckpt_lsn, 0, __txn_log_recover, &r);
		}
		if (F_ISSET(conn, WT_CONN_SALVAGE))
			ret = 0;
		/*
		 * If log scan couldn't find a file we expected to be around,
		 * this indicates a corruption of some sort.
		 */
		if (ret == ENOENT) {
			F_SET(conn, WT_CONN_DATA_CORRUPTION);
			ret = WT_ERROR;
		}

		WT_ERR(ret);
	}

	/* Scan the metadata to find the live files and their IDs. */
	WT_ERR(__recovery_file_scan(&r));
	/*
	 * Clear this out.  We no longer need it and it could have been
	 * re-allocated when scanning the files.
	 */
	WT_NOT_READ(metafile, NULL);

	/*
	 * We no longer need the metadata cursor: close it to avoid pinning any
	 * resources that could block eviction during recovery.
	 */
	r.files[0].c = NULL;
	WT_ERR(metac->close(metac));

	/*
	 * Now, recover all the files apart from the metadata.
	 * Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
	 */
	r.metadata_only = false;
	__wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS,
	    "Main recovery loop: starting at %" PRIu32 "/%" PRIu32
	    " to %" PRIu32 "/%" PRIu32, r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset,
	    r.max_rec_lsn.l.file, r.max_rec_lsn.l.offset);
	WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec));
	/*
	 * Check if the database was shut down cleanly.  If not
	 * return an error if the user does not want automatic
	 * recovery.
	 */
	if (needs_rec &&
	    (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) ||
	     F_ISSET(conn, WT_CONN_READONLY))) {
		if (F_ISSET(conn, WT_CONN_READONLY))
			WT_ERR_MSG(session, WT_RUN_RECOVERY,
			    "Read-only database needs recovery");
		WT_ERR_MSG(session, WT_RUN_RECOVERY, "Database needs recovery");
	}

	if (F_ISSET(conn, WT_CONN_READONLY)) {
		do_checkpoint = false;
		goto done;
	}

	/*
	 * Recovery can touch more data than fits in cache, so it relies on
	 * regular eviction to manage paging.  Start eviction threads for
	 * recovery without LAS cursors.
	 */
	WT_ERR(__wt_evict_create(session));
	eviction_started = true;

	/*
	 * Always run recovery even if it was a clean shutdown only if
	 * this is not a read-only connection.
	 * We can consider skipping it in the future.
	 */
	if (needs_rec)
		FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY);
	if (WT_IS_INIT_LSN(&r.ckpt_lsn))
		ret = __wt_log_scan(session, NULL,
		    WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
		    __txn_log_recover, &r);
	else
		ret = __wt_log_scan(session, &r.ckpt_lsn,
		    WT_LOGSCAN_RECOVER, __txn_log_recover, &r);
	if (F_ISSET(conn, WT_CONN_SALVAGE))
		ret = 0;
	WT_ERR(ret);

	conn->next_file_id = r.max_fileid;

done:	WT_ERR(__recovery_set_checkpoint_timestamp(&r));
	if (do_checkpoint)
		/*
		 * Forcibly log a checkpoint so the next open is fast and keep
		 * the metadata up to date with the checkpoint LSN and
		 * archiving.
		 */
		WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));

	/*
	 * If we're downgrading and have newer log files, force an archive,
	 * no matter what the archive setting is.
	 */
	if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE))
		WT_ERR(__wt_log_truncate_files(session, NULL, true));
	FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE);

err:	WT_TRET(__recovery_free(&r));
	__wt_free(session, config);
	FLD_CLR(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY);

	if (ret != 0) {
		FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_FAILED);
		__wt_err(session, ret, "Recovery failed");
	}

	/*
	 * Destroy the eviction threads that were started in support of
	 * recovery.  They will be restarted once the lookaside table is
	 * created.
	 */
	if (eviction_started)
		WT_TRET(__wt_evict_destroy(session));

	WT_TRET(session->iface.close(&session->iface, NULL));
	F_CLR(conn, WT_CONN_RECOVERING);

	return (ret);
}
Esempio n. 24
0
/*
 * __wt_page_alloc --
 *	Create or read a page into the cache.
 */
int
__wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type,
    uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep)
{
	WT_CACHE *cache;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_INDEX *pindex;
	size_t size;
	uint32_t i;
	void *p;

	*pagep = NULL;

	cache = S2C(session)->cache;
	page = NULL;

	size = sizeof(WT_PAGE);
	switch (type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		break;
	case WT_PAGE_COL_VAR:
		/*
		 * Variable-length column-store leaf page: allocate memory to
		 * describe the page's contents with the initial allocation.
		 */
		size += alloc_entries * sizeof(WT_COL);
		break;
	case WT_PAGE_ROW_LEAF:
		/*
		 * Row-store leaf page: allocate memory to describe the page's
		 * contents with the initial allocation.
		 */
		size += alloc_entries * sizeof(WT_ROW);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	WT_RET(__wt_calloc(session, 1, size, &page));

	page->type = type;
	page->read_gen = WT_READGEN_NOTSET;

	switch (type) {
	case WT_PAGE_COL_FIX:
		page->pg_fix_recno = recno;
		page->pg_fix_entries = alloc_entries;
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		page->pg_intl_recno = recno;

		/*
		 * Internal pages have an array of references to objects so they
		 * can split.  Allocate the array of references and optionally,
		 * the objects to which they point.
		 */
		WT_ERR(__wt_calloc(session, 1,
		    sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *),
		    &p));
		size +=
		    sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *);
		pindex = p;
		pindex->index = (WT_REF **)((WT_PAGE_INDEX *)p + 1);
		pindex->entries = alloc_entries;
		WT_INTL_INDEX_SET(page, pindex);
		if (alloc_refs)
			for (i = 0; i < pindex->entries; ++i) {
				WT_ERR(__wt_calloc_one(
				    session, &pindex->index[i]));
				size += sizeof(WT_REF);
			}
		if (0) {
err:			if ((pindex = WT_INTL_INDEX_COPY(page)) != NULL) {
				for (i = 0; i < pindex->entries; ++i)
					__wt_free(session, pindex->index[i]);
				__wt_free(session, pindex);
			}
			__wt_free(session, page);
			return (ret);
		}
		break;
	case WT_PAGE_COL_VAR:
		page->pg_var_recno = recno;
		page->pg_var_d = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE));
		page->pg_var_entries = alloc_entries;
		break;
	case WT_PAGE_ROW_LEAF:
		page->pg_row_d = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE));
		page->pg_row_entries = alloc_entries;
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Increment the cache statistics. */
	__wt_cache_page_inmem_incr(session, page, size);
	(void)WT_ATOMIC_ADD8(cache->bytes_read, size);
	(void)WT_ATOMIC_ADD8(cache->pages_inmem, 1);

	*pagep = page;
	return (0);
}
Esempio n. 25
0
/*
 * __wt_curfile_create --
 *	Open a cursor for a given btree handle.
 */
int
__wt_curfile_create(WT_SESSION_IMPL *session,
    WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap,
    WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,	/* get-key */
	    __wt_cursor_get_value,	/* get-value */
	    __wt_cursor_set_key,	/* set-key */
	    __wt_cursor_set_value,	/* set-value */
	    __curfile_compare,		/* compare */
	    __curfile_equals,		/* equals */
	    __curfile_next,		/* next */
	    __curfile_prev,		/* prev */
	    __curfile_reset,		/* reset */
	    __curfile_search,		/* search */
	    __curfile_search_near,	/* search-near */
	    __curfile_insert,		/* insert */
	    __curfile_update,		/* update */
	    __curfile_remove,		/* remove */
	    __wt_cursor_reconfigure,	/* reconfigure */
	    __curfile_close);		/* close */
	WT_BTREE *btree;
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE *cbt;
	WT_CURSOR_BULK *cbulk;
	WT_DECL_RET;
	size_t csize;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);

	cbt = NULL;

	btree = S2BT(session);
	WT_ASSERT(session, btree != NULL);

	csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
	WT_RET(__wt_calloc(session, 1, csize, &cbt));

	cursor = &cbt->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->internal_uri = btree->dhandle->name;
	cursor->key_format = btree->key_format;
	cursor->value_format = btree->value_format;
	cbt->btree = btree;

	if (bulk) {
		F_SET(cursor, WT_CURSTD_BULK);

		cbulk = (WT_CURSOR_BULK *)cbt;

		/* Optionally skip the validation of each bulk-loaded key. */
		WT_ERR(__wt_config_gets_def(
		    session, cfg, "skip_sort_check", 0, &cval));
		WT_ERR(__wt_curbulk_init(
		    session, cbulk, bitmap, cval.val == 0 ? 0 : 1));
	}

	/*
	 * random_retrieval
	 * Random retrieval cursors only support next, reset and close.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
	if (cval.val != 0) {
		__wt_cursor_set_notsup(cursor);
		cursor->next = __curfile_next_random;
		cursor->reset = __curfile_reset;
	}

	/* Underlying btree initialization. */
	__wt_btcur_open(cbt);

	/* __wt_cursor_init is last so we don't have to clean up on error. */
	WT_ERR(__wt_cursor_init(
	    cursor, cursor->internal_uri, owner, cfg, cursorp));

	WT_STAT_FAST_CONN_INCR(session, cursor_create);
	WT_STAT_FAST_DATA_INCR(session, cursor_create);

	if (0) {
err:		__wt_free(session, cbt);
	}

	return (ret);
}
Esempio n. 26
0
/*
 * __las_page_instantiate --
 *	Instantiate lookaside update records in a recently read page.
 */
static int
__las_page_instantiate(WT_SESSION_IMPL *session,
    WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size)
{
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE cbt;
	WT_DECL_ITEM(current_key);
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_ITEM(las_value);
	WT_DECL_RET;
	WT_PAGE *page;
	WT_UPDATE *first_upd, *last_upd, *upd;
	size_t incr, total_incr;
	uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid;
	uint32_t las_id, upd_size, session_flags;
	int exact;
	const uint8_t *p;

	cursor = NULL;
	page = ref->page;
	first_upd = last_upd = upd = NULL;
	total_incr = 0;
	current_recno = recno = WT_RECNO_OOB;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	__wt_btcur_init(session, &cbt);
	__wt_btcur_open(&cbt);

	WT_ERR(__wt_scr_alloc(session, 0, &current_key));
	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));
	WT_ERR(__wt_scr_alloc(session, 0, &las_value));

	/* Open a lookaside table cursor. */
	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * The lookaside records are in key and update order, that is, there
	 * will be a set of in-order updates for a key, then another set of
	 * in-order updates for a subsequent key. We process all of the updates
	 * for a key and then insert those updates into the page, then all the
	 * updates for the next key, and so on.
	 *
	 * Search for the block's unique prefix, stepping through any matching
	 * records.
	 */
	las_addr->data = addr;
	las_addr->size = addr_size;
	las_key->size = 0;
	cursor->set_key(
	    cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
	if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
		ret = cursor->next(cursor);
	for (; ret == 0; ret = cursor->next(cursor)) {
		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * Confirm the search using the unique prefix; if not a match,
		 * we're done searching for records for this page.
		 */
		if (las_id != read_id ||
		    las_addr->size != addr_size ||
		    memcmp(las_addr->data, addr, addr_size) != 0)
			break;

		/*
		 * If the on-page value has become globally visible, this record
		 * is no longer needed.
		 */
		if (__wt_txn_visible_all(session, las_txnid))
			continue;

		/* Allocate the WT_UPDATE structure. */
		WT_ERR(cursor->get_value(
		    cursor, &upd_txnid, &upd_size, las_value));
		WT_ERR(__wt_update_alloc(session,
		    (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value,
		    &upd, &incr));
		total_incr += incr;
		upd->txnid = upd_txnid;

		switch (page->type) {
		case WT_PAGE_COL_FIX:
		case WT_PAGE_COL_VAR:
			p = las_key->data;
			WT_ERR(__wt_vunpack_uint(&p, 0, &recno));
			if (current_recno == recno)
				break;
			WT_ASSERT(session, current_recno < recno);

			if (first_upd != NULL) {
				WT_ERR(__col_instantiate(session,
				    current_recno, ref, &cbt, first_upd));
				first_upd = NULL;
			}
			current_recno = recno;
			break;
		case WT_PAGE_ROW_LEAF:
			if (current_key->size == las_key->size &&
			    memcmp(current_key->data,
			    las_key->data, las_key->size) == 0)
				break;

			if (first_upd != NULL) {
				WT_ERR(__row_instantiate(session,
				    current_key, ref, &cbt, first_upd));
				first_upd = NULL;
			}
			WT_ERR(__wt_buf_set(session,
			    current_key, las_key->data, las_key->size));
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}

		/* Append the latest update to the list. */
		if (first_upd == NULL)
			first_upd = last_upd = upd;
		else {
			last_upd->next = upd;
			last_upd = upd;
		}
		upd = NULL;
	}
	WT_ERR_NOTFOUND_OK(ret);

	/* Insert the last set of updates, if any. */
	if (first_upd != NULL)
		switch (page->type) {
		case WT_PAGE_COL_FIX:
		case WT_PAGE_COL_VAR:
			WT_ERR(__col_instantiate(session,
			    current_recno, ref, &cbt, first_upd));
			first_upd = NULL;
			break;
		case WT_PAGE_ROW_LEAF:
			WT_ERR(__row_instantiate(session,
			    current_key, ref, &cbt, first_upd));
			first_upd = NULL;
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}

	/* Discard the cursor. */
	WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags));

	if (total_incr != 0) {
		__wt_cache_page_inmem_incr(session, page, total_incr);

		/*
		 * We've modified/dirtied the page, but that's not necessary and
		 * if we keep the page clean, it's easier to evict. We leave the
		 * lookaside table updates in place, so if we evict this page
		 * without dirtying it, any future instantiation of it will find
		 * the records it needs. If the page is dirtied before eviction,
		 * then we'll write any needed lookaside table records for the
		 * new location of the page.
		 */
		__wt_page_modify_clear(session, page);
	}

err:	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
	WT_TRET(__wt_btcur_close(&cbt, true));

	/*
	 * On error, upd points to a single unlinked WT_UPDATE structure,
	 * first_upd points to a list.
	 */
	if (upd != NULL)
		__wt_free(session, upd);
	if (first_upd != NULL)
		__wt_free_update_list(session, first_upd);

	__wt_scr_free(session, &current_key);
	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);
	__wt_scr_free(session, &las_value);

	return (ret);
}
Esempio n. 27
0
/*
 * __wt_session_compact --
 *	WT_SESSION.compact method.
 */
int
__wt_session_compact(
    WT_SESSION *wt_session, const char *uri, const char *config)
{
	WT_COMPACT_STATE compact;
	WT_CONFIG_ITEM cval;
	WT_DATA_SOURCE *dsrc;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	u_int i;
	bool ignore_cache_size_set;

	ignore_cache_size_set = false;

	session = (WT_SESSION_IMPL *)wt_session;
	SESSION_API_CALL(session, compact, config, cfg);

	/*
	 * The compaction thread should not block when the cache is full: it is
	 * holding locks blocking checkpoints and once the cache is full, it can
	 * spend a long time doing eviction.
	 */
	if (!F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE)) {
		ignore_cache_size_set = true;
		F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE);
	}

	/* In-memory ignores compaction operations. */
	if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
		goto err;

	/*
	 * Non-LSM object compaction requires checkpoints, which are impossible
	 * in transactional contexts. Disallow in all contexts (there's no
	 * reason for LSM to allow this, possible or not), and check now so the
	 * error message isn't confusing.
	 */
	WT_ERR(__wt_txn_context_check(session, false));

	/* Disallow objects in the WiredTiger name space. */
	WT_ERR(__wt_str_name_check(session, uri));

	if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
	    !WT_PREFIX_MATCH(uri, "file:") &&
	    !WT_PREFIX_MATCH(uri, "index:") &&
	    !WT_PREFIX_MATCH(uri, "lsm:") &&
	    !WT_PREFIX_MATCH(uri, "table:")) {
		if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
			ret = dsrc->compact == NULL ?
			    __wt_object_unsupported(session, uri) :
			    dsrc->compact(
			    dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg);
		else
			ret = __wt_bad_object_type(session, uri);
		goto err;
	}

	/* Setup the session handle's compaction state structure. */
	memset(&compact, 0, sizeof(WT_COMPACT_STATE));
	session->compact = &compact;

	/* Compaction can be time-limited. */
	WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval));
	session->compact->max_time = (uint64_t)cval.val;
	__wt_epoch(session, &session->compact->begin);

	/* Find the types of data sources being compacted. */
	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_schema_worker(session, uri,
	    __compact_handle_append, __compact_uri_analyze, cfg, 0));
	WT_ERR(ret);

	if (session->compact->lsm_count != 0)
		WT_ERR(__wt_schema_worker(
		    session, uri, NULL, __wt_lsm_compact, cfg, 0));
	if (session->compact->file_count != 0)
		WT_ERR(__compact_worker(session));

err:	session->compact = NULL;

	for (i = 0; i < session->op_handle_next; ++i) {
		WT_WITH_DHANDLE(session, session->op_handle[i],
		    WT_TRET(__compact_end(session)));
		WT_WITH_DHANDLE(session, session->op_handle[i],
		    WT_TRET(__wt_session_release_dhandle(session)));
	}

	__wt_free(session, session->op_handle);
	session->op_handle_allocated = session->op_handle_next = 0;

	/*
	 * Release common session resources (for example, checkpoint may acquire
	 * significant reconciliation structures/memory).
	 */
	WT_TRET(__wt_session_release_resources(session));

	if (ignore_cache_size_set)
		F_CLR(session, WT_SESSION_IGNORE_CACHE_SIZE);

	if (ret != 0)
		WT_STAT_CONN_INCR(session, session_table_compact_fail);
	else
		WT_STAT_CONN_INCR(session, session_table_compact_success);
	API_END_RET_NOTFOUND_MAP(session, ret);
}
Esempio n. 28
0
/*
 * __wt_col_modify --
 *	Column-store delete, insert, and update.
 */
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_INSERT *ins, *ins_copy;
	WT_INSERT_HEAD **inshead, *new_inshead, **new_inslist;
	WT_ITEM *value, _value;
	WT_PAGE *page;
	WT_UPDATE *old_upd, *upd, *upd_obsolete;
	size_t ins_size, new_inshead_size, new_inslist_size, upd_size;
	uint64_t recno;
	u_int skipdepth;
	int i, logged;

	btree = cbt->btree;
	page = cbt->page;
	recno = cbt->iface.recno;
	logged = 0;

	WT_ASSERT(session, op != 1);

	switch (op) {
	case 2:						/* Remove */
		if (btree->type == BTREE_COL_FIX) {
			value = &_value;
			value->data = "";
			value->size = 1;
		} else
			value = NULL;
		break;
	case 3:						/* Insert/Update */
	default:
		value = &cbt->iface.value;

		/*
		 * There's some chance the application specified a record past
		 * the last record on the page.  If that's the case, and we're
		 * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
		 * append list, not the update list.
		 */
		if (recno == 0 || recno > __col_last_recno(page))
			op = 1;
		break;
	}

	/* If we don't yet have a modify structure, we'll need one. */
	WT_RET(__wt_page_modify_init(session, page));

	ins = NULL;
	new_inshead = NULL;
	new_inslist = NULL;
	upd = NULL;

	/*
	 * Delete, insert or update a column-store entry.
	 *
	 * If modifying a previously modified record, create a new WT_UPDATE
	 * entry and have a serialized function link it into an existing
	 * WT_INSERT entry's WT_UPDATE list.
	 *
	 * Else, allocate an insert array as necessary, build a WT_INSERT and
	 * WT_UPDATE structure pair, and call a serialized function to insert
	 * the WT_INSERT structure.
	 */
	if (cbt->compare == 0 && cbt->ins != NULL) {
		/* Make sure the update can proceed. */
		WT_ERR(
		    __wt_update_check(session, page, old_upd = cbt->ins->upd));

		/* Allocate the WT_UPDATE structure and transaction ID. */
		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
		WT_ERR(__wt_txn_modify(session, &upd->txnid));
		logged = 1;

		/* Serialize the update. */
		WT_ERR(__wt_update_serial(session, page,
		    cbt->write_gen, &cbt->ins->upd, old_upd,
		    NULL, 0, &upd, upd_size, &upd_obsolete));

		/* Discard any obsolete WT_UPDATE structures. */
		if (upd_obsolete != NULL)
			__wt_update_obsolete_free(session, page, upd_obsolete);
	} else {
		/* Make sure the update can proceed. */
		WT_ERR(__wt_update_check(session, page, NULL));

		/* There may be no insert list, allocate as necessary. */
		new_inshead_size = new_inslist_size = 0;
		if (op == 1) {
			if (page->modify->append == NULL) {
				new_inslist_size = 1 * sizeof(WT_INSERT_HEAD *);
				WT_ERR(
				    __wt_calloc_def(session, 1, &new_inslist));
				inshead = &new_inslist[0];
			} else
				inshead = &page->modify->append[0];
			cbt->ins_head = *inshead;
		} else if (page->type == WT_PAGE_COL_FIX) {
			if (page->modify->update == NULL) {
				new_inslist_size = 1 * sizeof(WT_INSERT_HEAD *);
				WT_ERR(
				    __wt_calloc_def(session, 1, &new_inslist));
				inshead = &new_inslist[0];
			} else
				inshead = &page->modify->update[0];
		} else {
			if (page->modify->update == NULL) {
				new_inslist_size =
				    page->entries * sizeof(WT_INSERT_HEAD *);
				WT_ERR(__wt_calloc_def(
				    session, page->entries, &new_inslist));
				inshead = &new_inslist[cbt->slot];
			} else
				inshead = &page->modify->update[cbt->slot];
		}

		/* There may be no WT_INSERT list, allocate as necessary. */
		if (*inshead == NULL) {
			new_inshead_size = sizeof(WT_INSERT_HEAD);
			WT_ERR(__wt_calloc_def(session, 1, &new_inshead));
			for (i = 0; i < WT_SKIP_MAXDEPTH; i++) {
				cbt->ins_stack[i] = &new_inshead->head[i];
				cbt->next_stack[i] = NULL;
			}
			cbt->ins_head = new_inshead;
		}

		/* Choose a skiplist depth for this insert. */
		skipdepth = __wt_skip_choose_depth();

		/*
		 * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
		 * update the cursor to reference it.
		 */
		WT_ERR(__col_insert_alloc(
		    session, recno, skipdepth, &ins, &ins_size));
		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
		WT_ERR(__wt_txn_modify(session, &upd->txnid));
		logged = 1;
		ins->upd = upd;
		ins_size += upd_size;
		cbt->ins = ins;

		/* Insert or append the WT_INSERT structure. */
		if (op == 1) {
			/*
			 * The serialized function clears ins: take a copy of
			 * the pointer so we can look up the record number.
			 */
			ins_copy = ins;

			WT_ERR(__wt_col_append_serial(session,
			    page, cbt->write_gen, inshead,
			    cbt->ins_stack, cbt->next_stack,
			    &new_inslist, new_inslist_size,
			    &new_inshead, new_inshead_size,
			    &ins, ins_size, skipdepth));

			/* Put the new recno into the cursor. */
			cbt->recno = WT_INSERT_RECNO(ins_copy);
		} else
			WT_ERR(__wt_insert_serial(session,
			    page, cbt->write_gen, inshead,
			    cbt->ins_stack, cbt->next_stack,
			    &new_inslist, new_inslist_size,
			    &new_inshead, new_inshead_size,
			    &ins, ins_size, skipdepth));
	}

	if (0) {
err:		/*
		 * Remove the update from the current transaction, so we don't
		 * try to modify it on rollback.
		 */
		if (logged)
			__wt_txn_unmodify(session);
		__wt_free(session, ins);
		__wt_free(session, upd);
	}

	__wt_free(session, new_inslist);
	__wt_free(session, new_inshead);

	return (ret);
}
Esempio n. 29
0
/*
 * __rec_page_dirty_update --
 *	Update a dirty page's reference on eviction.
 */
static int
__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_ADDR *addr;
	WT_PAGE_MODIFY *mod;
	WT_REF *parent_ref;

	mod = page->modify;
	parent_ref = page->ref;

	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
	case WT_PM_REC_EMPTY:				/* Page is empty */
		if (parent_ref->addr != NULL &&
		    __wt_off_page(page->parent, parent_ref->addr)) {
			__wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
			__wt_free(session, parent_ref->addr);
		}

		/*
		 * Update the parent to reference an empty page.
		 *
		 * Set the transaction ID to WT_TXN_NONE because the fact that
		 * reconciliation left the page "empty" means there's no older
		 * transaction in the system that might need to see an earlier
		 * version of the page.  It isn't necessary (WT_TXN_NONE is 0),
		 * but it's the right thing to do.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		parent_ref->page = NULL;
		parent_ref->addr = NULL;
		parent_ref->txnid = WT_TXN_NONE;
		WT_PUBLISH(parent_ref->state, WT_REF_DELETED);
		break;
	case WT_PM_REC_REPLACE: 			/* 1-for-1 page swap */
		if (parent_ref->addr != NULL &&
		    __wt_off_page(page->parent, parent_ref->addr)) {
			__wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
			__wt_free(session, parent_ref->addr);
		}

		/*
		 * Update the parent to reference the replacement page.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
		*addr = mod->u.replace;
		mod->u.replace.addr = NULL;
		mod->u.replace.size = 0;

		parent_ref->page = NULL;
		parent_ref->addr = addr;
		WT_PUBLISH(parent_ref->state, WT_REF_DISK);
		break;
	case WT_PM_REC_SPLIT:				/* Page split */
		/*
		 * Update the parent to reference new internal page(s).
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		parent_ref->page = mod->u.split;
		WT_PUBLISH(parent_ref->state, WT_REF_MEM);

		/* Clear the reference else discarding the page will free it. */
		mod->u.split = NULL;
		F_CLR(mod, WT_PM_REC_SPLIT);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
Esempio n. 30
0
/*
 * __wt_col_modify --
 *	Column-store delete, insert, and update.
 */
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
    uint64_t recno, const WT_ITEM *value,
    WT_UPDATE *upd_arg, u_int modify_type, bool exclusive)
{
	static const WT_ITEM col_fix_remove = { "", 1, NULL, 0, 0 };
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_INSERT *ins;
	WT_INSERT_HEAD *ins_head, **ins_headp;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_UPDATE *old_upd, *upd;
	size_t ins_size, upd_size;
	u_int i, skipdepth;
	bool append, logged;

	btree = cbt->btree;
	ins = NULL;
	page = cbt->ref->page;
	upd = upd_arg;
	append = logged = false;

	if (upd_arg == NULL) {
		if (modify_type == WT_UPDATE_RESERVE ||
		    modify_type == WT_UPDATE_TOMBSTONE) {
			/*
			 * Fixed-size column-store doesn't have on-page deleted
			 * values, it's a nul byte.
			 */
			if (modify_type == WT_UPDATE_TOMBSTONE &&
			    btree->type == BTREE_COL_FIX) {
				modify_type = WT_UPDATE_STANDARD;
				value = &col_fix_remove;
			}
		}

		/*
		 * There's a chance the application specified a record past the
		 * last record on the page. If that's the case and we're
		 * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
		 * append list, not the update list. Also, an out-of-band recno
		 * implies an append operation, we're allocating a new row.
		 * Ignore any information obtained from the search.
		 */
		WT_ASSERT(session, recno != WT_RECNO_OOB || cbt->compare != 0);
		if (cbt->compare != 0 &&
		    (recno == WT_RECNO_OOB ||
		    recno > (btree->type == BTREE_COL_VAR ?
		    __col_var_last_recno(cbt->ref) :
		    __col_fix_last_recno(cbt->ref)))) {
			append = true;
			cbt->ins = NULL;
			cbt->ins_head = NULL;
		}
	}

	/* We're going to modify the page, we should have loaded history. */
	WT_ASSERT(session, cbt->ref->state != WT_REF_LIMBO);

	/* If we don't yet have a modify structure, we'll need one. */
	WT_RET(__wt_page_modify_init(session, page));
	mod = page->modify;

	/*
	 * If modifying a record not previously modified, but which is in the
	 * same update slot as a previously modified record, cursor.ins will
	 * not be set because there's no list of update records for this recno,
	 * but cursor.ins_head will be set to point to the correct update slot.
	 * Acquire the necessary insert information, then create a new update
	 * entry and link it into the existing list. We get here if a page has
	 * a single cell representing multiple records (the records have the
	 * same value), and then a record in the cell is updated or removed,
	 * creating the update list for the cell, and then a cursor iterates
	 * into that same cell to update/remove a different record. We find the
	 * correct slot in the update array, but we don't find an update list
	 * (because it doesn't exist), and don't have the information we need
	 * to do the insert. Normally, we wouldn't care (we could fail and do
	 * a search for the record which would configure everything for the
	 * insert), but range truncation does this pattern for every record in
	 * the cell, and the performance is terrible. For that reason, catch it
	 * here.
	 */
	if (cbt->ins == NULL && cbt->ins_head != NULL) {
		cbt->ins = __col_insert_search(
		    cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
		if (cbt->ins != NULL) {
			if (WT_INSERT_RECNO(cbt->ins) == recno)
				cbt->compare = 0;
			else {
				/*
				 * The test below is for cursor.compare set to 0
				 * and cursor.ins set: cursor.compare wasn't set
				 * by the search we just did, and has an unknown
				 * value. Clear cursor.ins to avoid the test.
				 */
				cbt->ins = NULL;
			}
		}
	}

	/*
	 * Delete, insert or update a column-store entry.
	 *
	 * If modifying a previously modified record, cursor.ins will be set to
	 * point to the correct update list. Create a new update entry and link
	 * it into the existing list.
	 *
	 * Else, allocate an insert array as necessary, build an insert/update
	 * structure pair, and link it into place.
	 */
	if (cbt->compare == 0 && cbt->ins != NULL) {
		/*
		 * If we are restoring updates that couldn't be evicted, the
		 * key must not exist on the new page.
		 */
		WT_ASSERT(session, upd_arg == NULL);

		/* Make sure the update can proceed. */
		WT_ERR(__wt_txn_update_check(session, old_upd = cbt->ins->upd));

		/* Allocate a WT_UPDATE structure and transaction ID. */
		WT_ERR(__wt_update_alloc(session,
		    value, &upd, &upd_size, modify_type));
		WT_ERR(__wt_txn_modify(session, upd));
		logged = true;

		/* Avoid a data copy in WT_CURSOR.update. */
		cbt->modify_update = upd;

		/*
		 * Point the new WT_UPDATE item to the next element in the list.
		 * If we get it right, the serialization function lock acts as
		 * our memory barrier to flush this write.
		 */
		upd->next = old_upd;

		/* Serialize the update. */
		WT_ERR(__wt_update_serial(
		    session, page, &cbt->ins->upd, &upd, upd_size, false));
	} else {
		/* Allocate the append/update list reference as necessary. */
		if (append) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, mod->mod_col_append, ins_headp, 1);
			ins_headp = &mod->mod_col_append[0];
		} else if (page->type == WT_PAGE_COL_FIX) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, mod->mod_col_update, ins_headp, 1);
			ins_headp = &mod->mod_col_update[0];
		} else {
			WT_PAGE_ALLOC_AND_SWAP(session, page,
			    mod->mod_col_update, ins_headp, page->entries);
			ins_headp = &mod->mod_col_update[cbt->slot];
		}

		/* Allocate the WT_INSERT_HEAD structure as necessary. */
		WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
		ins_head = *ins_headp;

		/* Choose a skiplist depth for this insert. */
		skipdepth = __wt_skip_choose_depth(session);

		/*
		 * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
		 * update the cursor to reference it (the WT_INSERT_HEAD might
		 * be allocated, the WT_INSERT was allocated).
		 */
		WT_ERR(__col_insert_alloc(
		    session, recno, skipdepth, &ins, &ins_size));
		cbt->ins_head = ins_head;
		cbt->ins = ins;

		/*
		 * Check for insert split and checkpoint races in column-store:
		 * it's easy (as opposed to in row-store) and a difficult bug to
		 * otherwise diagnose.
		 */
		WT_ASSERT(session, mod->mod_col_split_recno == WT_RECNO_OOB ||
		    (recno != WT_RECNO_OOB &&
		    mod->mod_col_split_recno > recno));

		if (upd_arg == NULL) {
			WT_ERR(__wt_update_alloc(session,
			    value, &upd, &upd_size, modify_type));
			WT_ERR(__wt_txn_modify(session, upd));
			logged = true;

			/* Avoid a data copy in WT_CURSOR.update. */
			cbt->modify_update = upd;
		} else
			upd_size = __wt_update_list_memsize(upd);
		ins->upd = upd;
		ins_size += upd_size;

		/*
		 * If there was no insert list during the search, or there was
		 * no search because the record number has not been allocated
		 * yet, the cursor's information cannot be correct, search
		 * couldn't have initialized it.
		 *
		 * Otherwise, point the new WT_INSERT item's skiplist to the
		 * next elements in the insert list (which we will check are
		 * still valid inside the serialization function).
		 *
		 * The serial mutex acts as our memory barrier to flush these
		 * writes before inserting them into the list.
		 */
		if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB)
			for (i = 0; i < skipdepth; i++) {
				cbt->ins_stack[i] = &ins_head->head[i];
				ins->next[i] = cbt->next_stack[i] = NULL;
			}
		else
			for (i = 0; i < skipdepth; i++)
				ins->next[i] = cbt->next_stack[i];

		/* Append or insert the WT_INSERT structure. */
		if (append)
			WT_ERR(__wt_col_append_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, &cbt->recno, skipdepth, exclusive));
		else
			WT_ERR(__wt_insert_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, skipdepth, exclusive));

	}

	/* If the update was successful, add it to the in-memory log. */
	if (logged && modify_type != WT_UPDATE_RESERVE) {
		WT_ERR(__wt_txn_log_op(session, cbt));

		/*
		 * In case of append, the recno (key) for the value is assigned
		 * now. Set the recno in the transaction operation to be used
		 * incase this transaction is prepared to retrieve the update
		 * corresponding to this operation.
		 */
		__wt_txn_op_set_recno(session, cbt->recno);
	}

	if (0) {
err:		/*
		 * Remove the update from the current transaction, so we don't
		 * try to modify it on rollback.
		 */
		if (logged)
			__wt_txn_unmodify(session);
		__wt_free(session, ins);
		if (upd_arg == NULL)
			__wt_free(session, upd);
	}

	return (ret);
}