Code example #1
/*
 * __wt_bm_read --
 *	Map or read address cookie referenced block into a buffer.
 */
int
__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	WT_BLOCK *block;
	WT_DECL_RET;
	WT_FILE_HANDLE *handle;
	wt_off_t offset;
	uint32_t checksum, size;
	bool mapped;

	WT_UNUSED(addr_size);
	block = bm->block;

	/* Crack the cookie. */
	WT_RET(
	    __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

	/*
	 * Map the block if it's possible.
	 */
	handle = block->fh->handle;
	mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
	if (mapped && handle->fh_map_preload != NULL) {
		buf->data = (uint8_t *)bm->map + offset;
		buf->size = size;
		ret = handle->fh_map_preload(handle, (WT_SESSION *)session,
		    buf->data, buf->size, bm->mapped_cookie);

		WT_STAT_CONN_INCR(session, block_map_read);
		WT_STAT_CONN_INCRV(session, block_byte_map_read, size);
		return (ret);
	}

#ifdef HAVE_DIAGNOSTIC
	/*
	 * In diagnostic mode, verify the block we're about to read isn't on
	 * the available list, or for live systems, the discard list.
	 */
	WT_RET(__wt_block_misplaced(session,
	    block, "read", offset, size, bm->is_live, __func__, __LINE__));
#endif
	/* Read the block. */
	__wt_capacity_throttle(session, size, WT_THROTTLE_READ);
	WT_RET(
	    __wt_block_read_off(session, block, buf, offset, size, checksum));

	/* Optionally discard blocks from the system's buffer cache. */
	WT_RET(__wt_block_discard(session, block, (size_t)size));

	return (0);
}
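
__wt_bm_read cracks the address cookie into an (offset, size, checksum) triple, then serves the read either straight out of the memory-mapped file or through the buffered read path. Below is a minimal, self-contained sketch of that "use the mapping when the range fits, otherwise fall back to a regular read" decision; struct region and region_read are illustrative names, not WiredTiger APIs.

#include <sys/types.h>
#include <stddef.h>
#include <stdint.h>
#include <unistd.h>

/* Illustrative only; not a WiredTiger structure. */
struct region {
	uint8_t *map;		/* base of the memory-mapped file, or NULL */
	size_t maplen;		/* length of the mapping */
	int fd;			/* file descriptor for the fallback path */
};

/*
 * region_read --
 *	Return a pointer into the mapping when [offset, offset + size) fits,
 * otherwise copy the range into the caller's buffer with pread().
 */
static int
region_read(struct region *r, off_t offset, size_t size,
    void *fallback_buf, const void **datap)
{
	if (r->map != NULL &&
	    offset >= 0 && (size_t)offset + size <= r->maplen) {
		*datap = r->map + offset;	/* zero-copy, mapped path */
		return (0);
	}
	if (pread(r->fd, fallback_buf, size, offset) != (ssize_t)size)
		return (-1);			/* short read or I/O error */
	*datap = fallback_buf;
	return (0);
}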
Code example #2
File: bt_walk.c  Project: adityavs/wiredtiger
/*
 * __ref_descend_prev --
 *	Descend the tree one level, during a previous-cursor walk.
 */
static inline void
__ref_descend_prev(
    WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
{
	WT_PAGE_INDEX *pindex;
	uint64_t yield_count;

	/*
	 * We're passed a child page into which we're descending, and on which
	 * we have a hazard pointer.
	 */
	for (yield_count = 0;; yield_count++, __wt_yield()) {
		/*
		 * There's a split race when a cursor moving backwards through
		 * the tree descends the tree. If we're splitting an internal
		 * page into its parent, we move the WT_REF structures and
		 * update the parent's page index before updating the split
		 * page's page index, and it's not an atomic update. A thread
		 * can read the parent page's replacement page index and then
		 * read the split page's original index.
		 *
		 * This can create a race for previous-cursor movements.
		 *
		 * For example, imagine an internal page with 3 child pages,
		 * with the namespaces a-f, g-h and i-j; the first child page
		 * splits. The parent starts out with the following page-index:
		 *
		 *	| ... | a | g | i | ... |
		 *
		 * The split page starts out with the following page-index:
		 *
		 *	| a | b | c | d | e | f |
		 *
		 * The first step is to move the c-f ranges into a new subtree,
		 * so, for example we might have two new internal pages 'c' and
		 * 'e', where the new 'c' page references the c-d namespace and
		 * the new 'e' page references the e-f namespace. The top of the
		 * subtree references the parent page, but until the parent's
		 * page index is updated, any threads in the subtree won't be
		 * able to ascend out of the subtree. However, once the parent
		 * page's page index is updated to this:
		 *
		 *	| ... | a | c | e | g | i | ... |
		 *
		 * threads in the subtree can ascend into the parent. Imagine a
		 * cursor in the c-d part of the namespace that ascends to the
		 * parent's 'c' slot. It would then decrement to the slot before
		 * the 'c' slot, the 'a' slot.
		 *
		 * The previous-cursor movement selects the last slot in the 'a'
		 * page; if the split page's page-index hasn't been updated yet,
		 * it will select the 'f' slot, which is incorrect. Once the
		 * split page's page index is updated to this:
		 *
		 *	| a | b |
		 *
		 * the previous-cursor movement will select the 'b' slot, which
		 * is correct.
		 *
		 * This function takes an argument which is the internal page
		 * from which we're descending. If the last slot on the page no
		 * longer points to the current page as its "home", the page is
		 * being split and part of its namespace moved. We have the
		 * correct page and we don't have to move, all we have to do is
		 * wait until the split page's page index is updated.
		 */
		WT_INTL_INDEX_GET(session, ref->page, pindex);
		if (pindex->index[pindex->entries - 1]->home == ref->page)
			break;
	}
	*pindexp = pindex;
	WT_STAT_CONN_INCRV(session, tree_descend_blocked, yield_count);
}
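
The loop above does nothing but wait: it re-reads the child's page index and yields until the last slot's home pointer shows the split has finished, counting the yields for a statistic. A stripped-down, self-contained sketch of that yield-and-count wait, where the atomic flag stands in for the page-index check:

#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>

/*
 * wait_until_set --
 *	Spin until *flag becomes nonzero, yielding the CPU on every retry;
 * return how many times we yielded so the caller can record a statistic.
 */
static uint64_t
wait_until_set(_Atomic int *flag)
{
	uint64_t yield_count;

	for (yield_count = 0; !atomic_load(flag); yield_count++)
		sched_yield();
	return (yield_count);
}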
Code example #3
File: bt_walk.c  Project: adityavs/wiredtiger
/*
 * __ref_index_slot --
 *      Return the page's index and slot for a reference.
 */
static inline void
__ref_index_slot(WT_SESSION_IMPL *session,
    WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
{
	WT_PAGE_INDEX *pindex;
	WT_REF **start, **stop, **p, **t;
	uint64_t sleep_count, yield_count;
	uint32_t entries, slot;

	/*
	 * If we don't find our reference, the page split and our home
	 * pointer references the wrong page. When internal pages
	 * split, their WT_REF structure home values are updated; yield
	 * and wait for that to happen.
	 */
	for (sleep_count = yield_count = 0;;) {
		/*
		 * Copy the parent page's index value: the page can split at
		 * any time, but the index's value is always valid, even if
		 * it's not up-to-date.
		 */
		WT_INTL_INDEX_GET(session, ref->home, pindex);
		entries = pindex->entries;

		/*
		 * Use the page's reference hint: it should be correct unless
		 * there was a split or delete in the parent before our slot.
		 * If the hint is wrong, it can be either too big or too small,
		 * but often only by a small amount.  Search up and down the
		 * index starting from the hint.
		 *
		 * It's not an error for the reference hint to be wrong; it
		 * just means the first retrieval (which sets the hint for
		 * subsequent retrievals) is slower.
		 */
		slot = ref->pindex_hint;
		if (slot >= entries)
			slot = entries - 1;
		if (pindex->index[slot] == ref)
			goto found;
		for (start = &pindex->index[0],
		    stop = &pindex->index[entries - 1],
		    p = t = &pindex->index[slot];
		    p > start || t < stop;) {
			if (p > start && *--p == ref) {
				slot = (uint32_t)(p - start);
				goto found;
			}
			if (t < stop && *++t == ref) {
				slot = (uint32_t)(t - start);
				goto found;
			}
		}
		/*
		 * We failed to get the page index and slot reference, yield
		 * before retrying, and if we've yielded enough times, start
		 * sleeping so we don't burn CPU to no purpose.
		 */
		__wt_ref_state_yield_sleep(&yield_count, &sleep_count);
		WT_STAT_CONN_INCRV(session, page_index_slot_ref_blocked,
		    sleep_count);
	}

found:	WT_ASSERT(session, pindex->index[slot] == ref);
	*pindexp = pindex;
	*slotp = slot;
}
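
The search starts at the cached pindex_hint and fans out in both directions at once, so a nearly-correct hint is resolved in a step or two while a badly wrong hint still degrades to a full scan. A small, self-contained sketch of the same outward search over a plain integer array; find_from_hint is an illustrative name, not a WiredTiger function:

#include <stddef.h>
#include <stdint.h>

/*
 * find_from_hint --
 *	Locate target in array[0..entries-1], starting from a hint slot and
 * fanning out in both directions; return the slot, or -1 if not found.
 */
static int64_t
find_from_hint(const int *array, uint32_t entries, uint32_t hint, int target)
{
	const int *start, *stop, *p, *t;

	if (entries == 0)
		return (-1);
	if (hint >= entries)
		hint = entries - 1;
	if (array[hint] == target)
		return ((int64_t)hint);

	for (start = &array[0], stop = &array[entries - 1],
	    p = t = &array[hint]; p > start || t < stop;) {
		if (p > start && *--p == target)
			return ((int64_t)(p - start));
		if (t < stop && *++t == target)
			return ((int64_t)(t - start));
	}
	return (-1);
}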
Code example #4
File: block_read.c  Project: DINKIN/mongo
/*
 * __wt_block_read_off --
 *	Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t checksum)
{
	WT_BLOCK_HEADER *blk, swap;
	size_t bufsize;
	uint32_t page_checksum;

	__wt_verbose(session, WT_VERB_READ,
	    "off %" PRIuMAX ", size %" PRIu32 ", checksum %" PRIu32,
	    (uintmax_t)offset, size, checksum);

	WT_STAT_CONN_INCR(session, block_read);
	WT_STAT_CONN_INCRV(session, block_byte_read, size);

	/*
	 * Grow the buffer as necessary and read the block.  Buffers should be
	 * aligned for reading, but there are lots of buffers (for example, file
	 * cursors have two buffers each, key and value), and it's difficult to
	 * be sure we've found all of them.  If the buffer isn't aligned, it's
	 * an easy fix: set the flag and guarantee we reallocate it.  (Most of
	 * the time on reads, the buffer memory has not yet been allocated, so
	 * we're not adding any additional processing time.)
	 */
	if (F_ISSET(buf, WT_ITEM_ALIGNED))
		bufsize = size;
	else {
		F_SET(buf, WT_ITEM_ALIGNED);
		bufsize = WT_MAX(size, buf->memsize + 10);
	}
	WT_RET(__wt_buf_init(session, buf, bufsize));
	WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
	buf->size = size;

	/*
	 * We incrementally read through the structure before doing a checksum,
	 * do little- to big-endian handling early on, and then select from the
	 * original or swapped structure as needed.
	 */
	blk = WT_BLOCK_HEADER_REF(buf->mem);
	__wt_block_header_byteswap_copy(blk, &swap);
	if (swap.checksum == checksum) {
		blk->checksum = 0;
		page_checksum = __wt_checksum(buf->mem,
		    F_ISSET(&swap, WT_BLOCK_DATA_CKSUM) ?
		    size : WT_BLOCK_COMPRESS_SKIP);
		if (page_checksum == checksum) {
			/*
			 * Swap the page-header as needed; this doesn't belong
			 * here, but it's the best place to catch all callers.
			 */
			__wt_page_header_byteswap(buf->mem);
			return (0);
		}

		if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
			__wt_errx(session,
			    "read checksum error for %" PRIu32 "B block at "
			    "offset %" PRIuMAX ": calculated block checksum "
			    "of %" PRIu32 " doesn't match expected checksum "
			    "of %" PRIu32,
			    size, (uintmax_t)offset, page_checksum, checksum);
	} else
		if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
			__wt_errx(session,
			    "read checksum error for %" PRIu32 "B block at "
			    "offset %" PRIuMAX ": block header checksum "
			    "of %" PRIu32 " doesn't match expected checksum "
			    "of %" PRIu32,
			    size, (uintmax_t)offset, swap.checksum, checksum);

	/* Panic if a checksum fails during an ordinary read. */
	return (block->verify ||
	    F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE) ?
	    WT_ERROR : __wt_illegal_value(session, block->name));
}
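
The verification step relies on the convention that the header's checksum field is zero at the time the checksum is computed: the reader zeroes the field, recomputes over the block, and compares against the value carried in the address cookie. A simplified, self-contained sketch of that step, using a toy additive checksum in place of WiredTiger's CRC32-C and a cut-down header:

#include <stddef.h>
#include <stdint.h>

/* Toy header; the real WT_BLOCK_HEADER has more fields. */
struct blk_header {
	uint32_t disk_size;
	uint32_t checksum;
};

/* Trivial rolling checksum, standing in for WiredTiger's CRC32-C. */
static uint32_t
toy_checksum(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint32_t sum = 0;

	while (len-- > 0)
		sum = sum * 31 + *p++;
	return (sum);
}

/*
 * block_verify --
 *	Check the stored header checksum against the expected value, then
 * zero the field, recompute over the block and compare again.
 */
static int
block_verify(void *block_mem, size_t size, uint32_t expected)
{
	struct blk_header *blk = block_mem;
	uint32_t actual, saved;

	saved = blk->checksum;
	if (saved != expected)
		return (-1);		/* header checksum mismatch */
	blk->checksum = 0;		/* field is zero when checksummed */
	actual = toy_checksum(block_mem, size);
	blk->checksum = saved;		/* restore for the caller */
	return (actual == expected ? 0 : -1);
}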
Code example #5
File: lsm_merge.c  Project: Machyne/mongo
/*
 * __wt_lsm_merge --
 *	Merge a set of chunks of an LSM tree.
 */
int
__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
{
	WT_BLOOM *bloom;
	WT_CURSOR *dest, *src;
	WT_DECL_RET;
	WT_ITEM key, value;
	WT_LSM_CHUNK *chunk;
	uint32_t generation;
	uint64_t insert_count, record_count;
	u_int dest_id, end_chunk, i, nchunks, start_chunk, start_id, verb;
	int tret;
	bool created_chunk, create_bloom, locked, in_sync;
	const char *cfg[3];
	const char *drop_cfg[] =
	    { WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL };

	bloom = NULL;
	chunk = NULL;
	dest = src = NULL;
	created_chunk = create_bloom = locked = in_sync = false;

	/* Fast path if it's obvious no merges could be done. */
	if (lsm_tree->nchunks < lsm_tree->merge_min &&
	    lsm_tree->merge_aggressiveness < WT_LSM_AGGRESSIVE_THRESHOLD)
		return (WT_NOTFOUND);

	/*
	 * Use the lsm_tree lock to read the chunks (so no switches occur), but
	 * avoid holding it while the merge is in progress: that may take a
	 * long time.
	 */
	__wt_lsm_tree_writelock(session, lsm_tree);
	locked = true;

	WT_ERR(__lsm_merge_span(session,
	    lsm_tree, id, &start_chunk, &end_chunk, &record_count));
	nchunks = (end_chunk + 1) - start_chunk;

	WT_ASSERT(session, nchunks > 0);
	start_id = lsm_tree->chunk[start_chunk]->id;

	/* Find the merge generation. */
	for (generation = 0, i = 0; i < nchunks; i++)
		generation = WT_MAX(generation,
		    lsm_tree->chunk[start_chunk + i]->generation + 1);

	__wt_lsm_tree_writeunlock(session, lsm_tree);
	locked = false;

	/* Allocate an ID for the merge. */
	dest_id = __wt_atomic_add32(&lsm_tree->last, 1);

	/*
	 * We only want to do the chunk loop if we're running with verbose,
	 * so we wrap these statements in the conditional.  Avoid the loop
	 * in the normal path.
	 */
	if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
		__wt_verbose(session, WT_VERB_LSM,
		    "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
		    ", generation %" PRIu32,
		    lsm_tree->name,
		    start_chunk, end_chunk, dest_id, record_count, generation);
		for (verb = start_chunk; verb < end_chunk + 1; verb++)
			__wt_verbose(session, WT_VERB_LSM,
			    "Merging %s: Chunk[%u] id %" PRIu32
			    ", gen: %" PRIu32
			    ", size: %" PRIu64 ", records: %" PRIu64,
			    lsm_tree->name, verb, lsm_tree->chunk[verb]->id,
			    lsm_tree->chunk[verb]->generation,
			    lsm_tree->chunk[verb]->size,
			    lsm_tree->chunk[verb]->count);
	}

	WT_ERR(__wt_calloc_one(session, &chunk));
	created_chunk = true;
	chunk->id = dest_id;

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
	    (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
	    start_chunk > 0) && record_count > 0)
		create_bloom = true;

	/*
	 * Special setup for the merge cursor:
	 * first, reset to open the dependent cursors;
	 * then restrict the cursor to a specific number of chunks;
	 * then set MERGE so the cursor doesn't track updates to the tree.
	 */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));

	WT_WITH_SCHEMA_LOCK(session, ret,
	    ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
	WT_ERR(ret);
	if (create_bloom) {
		WT_ERR(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

		WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
		    lsm_tree->bloom_config,
		    record_count, lsm_tree->bloom_bit_count,
		    lsm_tree->bloom_hash_count, &bloom));
	}

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
	cfg[1] = "bulk,raw,skip_sort_check";
	cfg[2] = NULL;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));

#define	LSM_MERGE_CHECK_INTERVAL	WT_THOUSAND
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
			if (!lsm_tree->active)
				WT_ERR(EINTR);

			WT_STAT_CONN_INCRV(session,
			    lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
			++lsm_tree->merge_progressing;
		}

		WT_ERR(src->get_key(src, &key));
		dest->set_key(dest, &key);
		WT_ERR(src->get_value(src, &value));
		dest->set_value(dest, &value);
		WT_ERR(dest->insert(dest));
		if (create_bloom)
			__wt_bloom_insert(bloom, &key);
	}
	WT_ERR_NOTFOUND_OK(ret);

	WT_STAT_CONN_INCRV(session,
	    lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
	++lsm_tree->merge_progressing;
	__wt_verbose(session, WT_VERB_LSM,
	    "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted",
	    record_count, insert_count);

	/*
	 * Closing and syncing the files can take a while.  Set the
	 * merge_syncing field so that compact knows it is still in
	 * progress.
	 */
	(void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1);
	in_sync = true;
	/*
	 * We've successfully created the new chunk.  Now install it.  We need
	 * to ensure that the NO_CACHE flag is cleared and the bloom filter
	 * is closed (even if a step fails), so track errors but don't return
	 * until we've cleaned up.
	 */
	WT_TRET(src->close(src));
	WT_TRET(dest->close(dest));
	src = dest = NULL;

	F_CLR(session, WT_SESSION_NO_CACHE);

	/*
	 * We're doing advisory reads to fault the new trees into cache.
	 * Don't block if the cache is full: our next unit of work may be to
	 * discard some trees to free space.
	 */
	F_SET(session, WT_SESSION_NO_EVICTION);

	if (create_bloom) {
		if (ret == 0)
			WT_TRET(__wt_bloom_finalize(bloom));

		/*
		 * Read in a key to make sure the Bloom filters btree handle is
		 * open before it becomes visible to application threads.
		 * Otherwise application threads will stall while it is opened
		 * and internal pages are read into cache.
		 */
		if (ret == 0) {
			WT_CLEAR(key);
			WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
		}

		WT_TRET(__wt_bloom_close(bloom));
		bloom = NULL;
	}
	WT_ERR(ret);

	/*
	 * Open a handle on the new chunk before application threads attempt
	 * to access it, opening it pre-loads internal pages into the file
	 * system cache.
	 */
	cfg[1] = "checkpoint=" WT_CHECKPOINT;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
	WT_TRET(dest->close(dest));
	dest = NULL;
	++lsm_tree->merge_progressing;
	(void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
	in_sync = false;
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
	__wt_lsm_tree_writelock(session, lsm_tree);
	locked = true;

	/*
	 * Check whether we raced with another merge, and adjust the chunk
	 * array offset as necessary.
	 */
	if (start_chunk >= lsm_tree->nchunks ||
	    lsm_tree->chunk[start_chunk]->id != start_id)
		for (start_chunk = 0;
		    start_chunk < lsm_tree->nchunks;
		    start_chunk++)
			if (lsm_tree->chunk[start_chunk]->id == start_id)
				break;

	/*
	 * It is safe to error out here - since the update can only fail
	 * prior to making updates to the tree.
	 */
	WT_ERR(__wt_lsm_merge_update_tree(
	    session, lsm_tree, start_chunk, nchunks, chunk));

	if (create_bloom)
		F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	chunk->count = insert_count;
	chunk->generation = generation;
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);

	/*
	 * We have no current way of continuing if the metadata update fails,
	 * so we will panic in that case.  Put some effort into cleaning up
	 * after ourselves here - so things have a chance of shutting down.
	 *
	 * Any errors that happened after the tree was locked are
	 * fatal - we can't guarantee the state of the tree.
	 */
	if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
		WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");

	lsm_tree->dsk_gen++;

	/* Update the throttling while holding the tree lock. */
	__wt_lsm_tree_throttle(session, lsm_tree, true);

	/* Schedule a pass to discard old chunks */
	WT_ERR(__wt_lsm_manager_push_entry(
	    session, WT_LSM_WORK_DROP, 0, lsm_tree));

err:	if (locked)
		__wt_lsm_tree_writeunlock(session, lsm_tree);
	if (in_sync)
		(void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
	if (src != NULL)
		WT_TRET(src->close(src));
	if (dest != NULL)
		WT_TRET(dest->close(dest));
	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	if (ret != 0 && created_chunk) {
		/* Drop the newly-created files on error. */
		if (chunk->uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session, tret,
			    tret = __wt_schema_drop(
			    session, chunk->uri, drop_cfg));
			WT_TRET(tret);
		}
		if (create_bloom && chunk->bloom_uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session, tret,
			    tret = __wt_schema_drop(
			    session, chunk->bloom_uri, drop_cfg));
			WT_TRET(tret);
		}
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, chunk);

		if (ret == EINTR)
			__wt_verbose(session, WT_VERB_LSM,
			    "Merge aborted due to close");
		else
			__wt_verbose(session, WT_VERB_LSM,
			    "Merge failed with %s",
			   __wt_strerror(session, ret, NULL, 0));
	}
	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	return (ret);
}
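
Much of the tail of __wt_lsm_merge is about reaching the err: label with enough state to clean up: once the merge is under way, individual failures are accumulated rather than returned immediately, so the cursors, the Bloom filter and the new chunk's files are always released or dropped. A compact, self-contained sketch of that "remember the first error, keep cleaning up" idiom; KEEP_FIRST_ERR only mimics the spirit of WiredTiger's WT_TRET and is not the real macro:

/* Keep the first error seen while still running later calls; illustrative. */
#define	KEEP_FIRST_ERR(ret, call) do {					\
	int __r = (call);						\
	if ((ret) == 0)							\
		(ret) = __r;						\
} while (0)

static int step_one(void)  { return (0); }
static int step_two(void)  { return (-1); }	/* simulated failure */
static int cleanup_a(void) { return (0); }
static int cleanup_b(void) { return (0); }

static int
do_work(void)
{
	int ret = 0;

	KEEP_FIRST_ERR(ret, step_one());
	KEEP_FIRST_ERR(ret, step_two());

	/* Cleanup always runs; a failure here can't hide the first error. */
	KEEP_FIRST_ERR(ret, cleanup_a());
	KEEP_FIRST_ERR(ret, cleanup_b());
	return (ret);
}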
Code example #6
File: bt_delete.c  Project: adityavs/wiredtiger
/*
 * __wt_delete_page_rollback --
 *	Abort pages that were deleted without being instantiated.
 */
void
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_UPDATE **upd;
	uint64_t sleep_count, yield_count;

	/*
	 * If the page is still "deleted", it's as we left it, reset the state
	 * to on-disk and we're done.  Otherwise, we expect the page is either
	 * instantiated or being instantiated.  Loop because it's possible for
	 * the page to return to the deleted state if instantiation fails.
	 */
	for (sleep_count = yield_count = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_LOOKASIDE:
		case WT_REF_READING:
			WT_ASSERT(session, 0);		/* Impossible, assert */
			break;
		case WT_REF_DELETED:
			/*
			 * If the page is still "deleted", it's as we left it,
			 * reset the state.
			 */
			if (__wt_atomic_casv32(
			    &ref->state, WT_REF_DELETED, WT_REF_DISK))
				return;
			break;
		case WT_REF_LOCKED:
			/*
			 * A possible state, the page is being instantiated.
			 */
			break;
		case WT_REF_MEM:
		case WT_REF_SPLIT:
			/*
			 * We can't use the normal read path to get a copy of
			 * the page because the session may have closed the
			 * cursor, we no longer have the reference to the tree
			 * required for a hazard pointer.  We're safe because
			 * with unresolved transactions, the page isn't going
			 * anywhere.
			 *
			 * The page is in an in-memory state, walk the list of
			 * update structures and abort them.
			 */
			for (upd =
			    ref->page_del->update_list; *upd != NULL; ++upd)
				(*upd)->txnid = WT_TXN_ABORTED;

			/*
			 * Discard the memory, the transaction can't abort
			 * twice.
			 */
			__wt_free(session, ref->page_del->update_list);
			__wt_free(session, ref->page_del);
			return;
		}
		/*
		 * We wait for the change in page state, yield before retrying,
		 * and if we've yielded enough times, start sleeping so we don't
		 * burn CPU to no purpose.
		 */
		__wt_ref_state_yield_sleep(&yield_count, &sleep_count);
		WT_STAT_CONN_INCRV(session, page_del_rollback_blocked,
		    sleep_count);
	}
}
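
The WT_REF_DELETED arm flips the reference back to on-disk with a single compare-and-swap, so it loses cleanly if a concurrent reader has already started instantiating the page. A self-contained sketch of that CAS-guarded state transition using C11 atomics; the state names are illustrative, not WiredTiger's WT_REF states:

#include <stdatomic.h>

/* Illustrative states, not WiredTiger's WT_REF state values. */
enum ref_state { STATE_DISK, STATE_DELETED, STATE_LOCKED, STATE_MEM };

/*
 * rollback_delete --
 *	If the reference is still "deleted", atomically reset it to the
 * on-disk state; return 1 on success, 0 if another thread got there first.
 */
static int
rollback_delete(_Atomic int *state)
{
	int expected = STATE_DELETED;

	/*
	 * The CAS fails if the state changed underneath us, for example if
	 * another thread started instantiating the page; the caller loops
	 * and re-examines the new state.
	 */
	return (atomic_compare_exchange_strong(state, &expected, STATE_DISK));
}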
Code example #7
/*
 * __log_slot_close --
 *	Close out the slot the caller is using.  The slot may already be
 *	closed or freed by another thread.
 */
static int
__log_slot_close(
    WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *releasep, bool forced)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	int64_t end_offset, new_state, old_state;
#ifdef	HAVE_DIAGNOSTIC
	uint64_t time_start, time_stop;
	int count;
#endif

	*releasep = false;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
	conn = S2C(session);
	log = conn->log;
	if (slot == NULL)
		return (WT_NOTFOUND);
retry:
	old_state = slot->slot_state;
	/*
	 * If this close is coming from a forced close and a thread is in
	 * the middle of using the slot, return EBUSY.  The caller can
	 * decide if retrying is necessary or not.
	 */
	if (forced && WT_LOG_SLOT_INPROGRESS(old_state))
		return (__wt_set_return(session, EBUSY));
	/*
	 * If someone else is switching out this slot we lost.  Nothing to
	 * do but return.  Return WT_NOTFOUND anytime the given slot was
	 * processed by another closing thread.  Only return 0 when we
	 * actually closed the slot.
	 */
	if (WT_LOG_SLOT_CLOSED(old_state)) {
		WT_STAT_CONN_INCR(session, log_slot_close_race);
		return (WT_NOTFOUND);
	}
	/*
	 * If someone completely processed this slot, we're done.
	 */
	if (FLD_LOG_SLOT_ISSET(
	    (uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED)) {
		WT_STAT_CONN_INCR(session, log_slot_close_race);
		return (WT_NOTFOUND);
	}
	new_state = (old_state | WT_LOG_SLOT_CLOSE);
	/*
	 * Close this slot.  If we lose the race retry.
	 */
	if (!__wt_atomic_casiv64(&slot->slot_state, old_state, new_state))
		goto retry;
	/*
	 * We own the slot now.  No one else can join.
	 * Set the end LSN.
	 */
	WT_STAT_CONN_INCR(session, log_slot_closes);
	if (WT_LOG_SLOT_DONE(new_state))
		*releasep = true;
	slot->slot_end_lsn = slot->slot_start_lsn;
	/*
	 * A thread setting the unbuffered flag sets the unbuffered size after
	 * setting the flag.  There could be a delay between a thread setting
	 * the flag, a thread closing the slot, and the original thread setting
	 * that value.  If the state is unbuffered, wait for the unbuffered
	 * size to be set.
	 */
#ifdef	HAVE_DIAGNOSTIC
	count = 0;
	time_start = __wt_clock(session);
#endif
	if (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state)) {
		while (slot->slot_unbuffered == 0) {
			WT_STAT_CONN_INCR(session, log_slot_close_unbuf);
			__wt_yield();
#ifdef	HAVE_DIAGNOSTIC
			++count;
			if (count > WT_MILLION) {
				time_stop = __wt_clock(session);
				if (WT_CLOCKDIFF_SEC(
				    time_stop, time_start) > 10) {
					__wt_errx(session, "SLOT_CLOSE: Slot %"
					PRIu32 " Timeout unbuffered, state 0x%"
					PRIx64 " unbuffered %" PRId64,
					(uint32_t)(slot - &log->slot_pool[0]),
					(uint64_t)slot->slot_state,
					slot->slot_unbuffered);
					__log_slot_dump(session);
					__wt_abort(session);
				}
				count = 0;
			}
#endif
		}
	}

	end_offset =
	    WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered;
	slot->slot_end_lsn.l.offset += (uint32_t)end_offset;
	WT_STAT_CONN_INCRV(session, log_slot_consolidated, end_offset);
	/*
	 * XXX Would like to change so one piece of code advances the LSN.
	 */
	log->alloc_lsn = slot->slot_end_lsn;
	WT_ASSERT(session, log->alloc_lsn.l.file >= log->write_lsn.l.file);
	return (0);
}
Code example #8
/*
 * __wt_log_slot_join --
 *	Join a consolidated logging slot.
 */
void
__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
    uint32_t flags, WT_MYSLOT *myslot)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	WT_LOGSLOT *slot;
	uint64_t time_start, time_stop, usecs;
	int64_t flag_state, new_state, old_state, released;
	int32_t join_offset, new_join, wait_cnt;
	bool closed, diag_yield, raced, slept, unbuffered, yielded;

	conn = S2C(session);
	log = conn->log;
	time_start = time_stop = 0;

	WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT));
	WT_ASSERT(session, mysize != 0);

	/*
	 * There should almost always be a slot open.
	 */
	unbuffered = yielded = false;
	closed = raced = slept = false;
	wait_cnt = 0;
#ifdef	HAVE_DIAGNOSTIC
	diag_yield = (++log->write_calls % 7) == 0;
	if ((log->write_calls % WT_THOUSAND) == 0 ||
	    mysize > WT_LOG_SLOT_BUF_MAX) {
#else
	diag_yield = false;
	if (mysize > WT_LOG_SLOT_BUF_MAX) {
#endif
		unbuffered = true;
		F_SET(myslot, WT_MYSLOT_UNBUFFERED);
	}
	for (;;) {
		WT_BARRIER();
		slot = log->active_slot;
		old_state = slot->slot_state;
		if (WT_LOG_SLOT_OPEN(old_state)) {
			/*
			 * Try to join our size into the existing size and
			 * atomically write it back into the state.
			 */
			flag_state = WT_LOG_SLOT_FLAGS(old_state);
			released = WT_LOG_SLOT_RELEASED(old_state);
			join_offset = WT_LOG_SLOT_JOINED(old_state);
			if (unbuffered)
				new_join = join_offset + WT_LOG_SLOT_UNBUFFERED;
			else
				new_join = join_offset + (int32_t)mysize;
			new_state = (int64_t)WT_LOG_SLOT_JOIN_REL(
			    (int64_t)new_join, (int64_t)released,
			    (int64_t)flag_state);

			/*
			 * Braces used due to potential empty body warning.
			 */
			if (diag_yield) {
				WT_DIAGNOSTIC_YIELD;
			}
			/*
			 * Attempt to swap our size into the state.
			 */
			if (__wt_atomic_casiv64(
			    &slot->slot_state, old_state, new_state))
				break;
			WT_STAT_CONN_INCR(session, log_slot_races);
			raced = true;
		} else {
			WT_STAT_CONN_INCR(session, log_slot_active_closed);
			closed = true;
			++wait_cnt;
		}
		if (!yielded)
			time_start = __wt_clock(session);
		yielded = true;
		/*
		 * The slot is no longer open or we lost the race to
		 * update it.  Yield and try again.
		 */
		if (wait_cnt < WT_THOUSAND)
			__wt_yield();
		else {
			__wt_sleep(0, WT_THOUSAND);
			slept = true;
		}
	}
	/*
	 * We joined this slot.  Fill in our information to return to
	 * the caller.
	 */
	if (!yielded)
		WT_STAT_CONN_INCR(session, log_slot_immediate);
	else {
		WT_STAT_CONN_INCR(session, log_slot_yield);
		time_stop = __wt_clock(session);
		usecs = WT_CLOCKDIFF_US(time_stop, time_start);
		WT_STAT_CONN_INCRV(session, log_slot_yield_duration, usecs);
		if (closed)
			WT_STAT_CONN_INCR(session, log_slot_yield_close);
		if (raced)
			WT_STAT_CONN_INCR(session, log_slot_yield_race);
		if (slept)
			WT_STAT_CONN_INCR(session, log_slot_yield_sleep);
	}
	if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
		F_SET(slot, WT_SLOT_SYNC_DIR);
	if (LF_ISSET(WT_LOG_FLUSH))
		F_SET(slot, WT_SLOT_FLUSH);
	if (LF_ISSET(WT_LOG_FSYNC))
		F_SET(slot, WT_SLOT_SYNC);
	if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) {
		WT_ASSERT(session, slot->slot_unbuffered == 0);
		WT_STAT_CONN_INCR(session, log_slot_unbuffered);
		slot->slot_unbuffered = (int64_t)mysize;
	}
	myslot->slot = slot;
	myslot->offset = join_offset;
	myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize);
}

/*
 * __wt_log_slot_release --
 *	Each thread in a consolidated group releases its portion to
 *	signal it has completed copying its piece of the log into
 *	the memory buffer.
 */
int64_t
__wt_log_slot_release(WT_MYSLOT *myslot, int64_t size)
{
	WT_LOGSLOT *slot;
	wt_off_t cur_offset, my_start;
	int64_t my_size, rel_size;

	slot = myslot->slot;
	my_start = slot->slot_start_offset + myslot->offset;
	/*
	 * We maintain the last starting offset within this slot.
	 * This is used to know the offset of the last record that
	 * was written rather than the beginning record of the slot.
	 */
	while ((cur_offset = slot->slot_last_offset) < my_start) {
		/*
		 * Set our offset if we are larger.
		 */
		if (__wt_atomic_casiv64(
		    &slot->slot_last_offset, cur_offset, my_start))
			break;
		/*
		 * If we raced another thread updating this, try again.
		 */
		WT_BARRIER();
	}
	/*
	 * Add my size into the state and return the new size.
	 */
	rel_size = size;
	if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
		rel_size = WT_LOG_SLOT_UNBUFFERED;
	my_size = (int64_t)WT_LOG_SLOT_JOIN_REL((int64_t)0, rel_size, 0);
	return (__wt_atomic_addiv64(&slot->slot_state, my_size));
}
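
Both the join and release paths work on a single 64-bit slot_state word that packs the joined and released byte counts (plus flags) together, so each update is one atomic operation and the slot is known to be done when the two counts agree. A self-contained sketch of packing two 32-bit counters into one atomic word; the field layout here is illustrative and is not WiredTiger's actual WT_LOG_SLOT_* encoding:

#include <stdatomic.h>
#include <stdint.h>

/* Illustrative packing: high 32 bits = joined, low 32 bits = released. */
#define	PACK(joined, released)						\
	(((uint64_t)(joined) << 32) | (uint32_t)(released))
#define	JOINED(state)	((uint32_t)((state) >> 32))
#define	RELEASED(state)	((uint32_t)(state))

/* Add to the joined count with a CAS loop, like a thread joining a slot. */
static uint64_t
slot_join(_Atomic uint64_t *state, uint32_t mysize)
{
	uint64_t new_state, old_state;

	do {
		old_state = atomic_load(state);
		new_state = PACK(JOINED(old_state) + mysize,
		    RELEASED(old_state));
	} while (!atomic_compare_exchange_weak(state, &old_state, new_state));
	return (new_state);
}

/* Add to the released count; a plain atomic add suffices for the low half. */
static uint64_t
slot_release(_Atomic uint64_t *state, uint32_t mysize)
{
	return (atomic_fetch_add(state, (uint64_t)mysize) + mysize);
}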
Code example #9
File: conn_log.c  Project: mpobrien/mongo
/*
 * __log_file_server --
 *	The log file server thread.  This worker thread manages
 *	log file operations such as closing and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *close_fh;
	WT_LOG *log;
	WT_LSN close_end_lsn, min_lsn;
	WT_SESSION_IMPL *session;
	uint64_t yield_count;
	uint32_t filenum;
	bool locked;

	session = arg;
	conn = S2C(session);
	log = conn->log;
	locked = false;
	yield_count = 0;
	while (F_ISSET(conn, WT_CONN_SERVER_LOG)) {
		/*
		 * If there is a log file to close, make sure any outstanding
		 * write operations have completed, then fsync and close it.
		 */
		if ((close_fh = log->log_close_fh) != NULL) {
			WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
			    &filenum));
			/*
			 * The closing file handle should have a correct close
			 * LSN.
			 */
			WT_ASSERT(session,
			    log->log_close_lsn.l.file == filenum);

			if (__wt_log_cmp(
			    &log->write_lsn, &log->log_close_lsn) >= 0) {
				/*
				 * We've copied the file handle, clear out the
				 * one in the log structure to allow it to be
				 * set again.  Copy the LSN before clearing
				 * the file handle.
				 * Use a barrier to make sure the compiler does
				 * not reorder the following two statements.
				 */
				close_end_lsn = log->log_close_lsn;
				WT_FULL_BARRIER();
				log->log_close_fh = NULL;
				/*
				 * Set the close_end_lsn to the LSN immediately
				 * after ours.  That is, the beginning of the
				 * next log file.   We need to know the LSN
				 * file number of our own close in case earlier
				 * calls are still in progress and the next one
				 * to move the sync_lsn into the next file for
				 * later syncs.
				 */
				WT_ERR(__wt_fsync(session, close_fh, true));

				/*
				 * We want to have the file size reflect actual
				 * data with minimal pre-allocated zeroed space.
				 * We can't truncate the file during hot backup,
				 * or the underlying file system may not support
				 * truncate: both are OK, it's just more work
				 * during cursor traversal.
				 */
				if (!conn->hot_backup) {
					__wt_readlock(
					    session, &conn->hot_backup_lock);
					if (!conn->hot_backup)
						WT_ERR_ERROR_OK(
						    __wt_ftruncate(session,
						    close_fh,
						    close_end_lsn.l.offset),
						    ENOTSUP);
					__wt_readunlock(
					    session, &conn->hot_backup_lock);
				}
				WT_SET_LSN(&close_end_lsn,
				    close_end_lsn.l.file + 1, 0);
				__wt_spin_lock(session, &log->log_sync_lock);
				locked = true;
				WT_ERR(__wt_close(session, &close_fh));
				WT_ASSERT(session, __wt_log_cmp(
				    &close_end_lsn, &log->sync_lsn) >= 0);
				log->sync_lsn = close_end_lsn;
				__wt_cond_signal(session, log->log_sync_cond);
				locked = false;
				__wt_spin_unlock(session, &log->log_sync_lock);
			}
		}
		/*
		 * If a later thread asked for a background sync, do it now.
		 */
		if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
			/*
			 * Save the latest write LSN which is the minimum
			 * we will have written to disk.
			 */
			min_lsn = log->write_lsn;
			/*
			 * We have to wait until the LSN we asked for is
			 * written.  If it isn't, signal the wrlsn thread
			 * to get it written.
			 *
			 * We also have to wait for the written LSN and the
			 * sync LSN to be in the same file so that we know we
			 * have synchronized all earlier log files.
			 */
			if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
				/*
				 * If the sync file is behind either the one
				 * wanted for a background sync or the write LSN
				 * has moved to another file continue to let
				 * this worker thread process that older file
				 * immediately.
				 */
				if ((log->sync_lsn.l.file <
				    log->bg_sync_lsn.l.file) ||
				    (log->sync_lsn.l.file < min_lsn.l.file))
					continue;
				WT_ERR(__wt_fsync(session, log->log_fh, true));
				__wt_spin_lock(session, &log->log_sync_lock);
				locked = true;
				/*
				 * The sync LSN could have advanced while we
				 * were writing to disk.
				 */
				if (__wt_log_cmp(
				    &log->sync_lsn, &min_lsn) <= 0) {
					WT_ASSERT(session,
					    min_lsn.l.file ==
					    log->sync_lsn.l.file);
					log->sync_lsn = min_lsn;
					__wt_cond_signal(
					    session, log->log_sync_cond);
				}
				locked = false;
				__wt_spin_unlock(session, &log->log_sync_lock);
			} else {
				__wt_cond_signal(session, conn->log_wrlsn_cond);
				/*
				 * We do not want to wait potentially a second
				 * to process this.  Yield to give the wrlsn
				 * thread a chance to run and try again in
				 * this case.
				 */
				yield_count++;
				__wt_yield();
				continue;
			}
		}

		/* Wait until the next event. */
		__wt_cond_wait(session, conn->log_file_cond, 100000, NULL);
	}

	if (0) {
err:		WT_PANIC_MSG(session, ret, "log close server error");
	}
	WT_STAT_CONN_INCRV(session, log_server_sync_blocked, yield_count);
	if (locked)
		__wt_spin_unlock(session, &log->log_sync_lock);
	return (WT_THREAD_RET_VALUE);
}
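
Every decision in the server thread comes down to comparing LSNs, which are (file number, offset) pairs: a later file always wins, and within the same file the larger offset wins. A minimal sketch of that comparison; the struct is illustrative, since WiredTiger's WT_LSN also packs the fields for atomic access:

#include <stdint.h>

/* Illustrative LSN: which log file, and the byte offset within it. */
struct lsn {
	uint32_t file;
	uint32_t offset;
};

/*
 * lsn_cmp --
 *	Return < 0, 0 or > 0 as lsn1 is before, equal to or after lsn2.
 */
static int
lsn_cmp(const struct lsn *lsn1, const struct lsn *lsn2)
{
	if (lsn1->file != lsn2->file)
		return (lsn1->file < lsn2->file ? -1 : 1);
	if (lsn1->offset != lsn2->offset)
		return (lsn1->offset < lsn2->offset ? -1 : 1);
	return (0);
}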
Code example #10
File: block_write.c  Project: mongodb/mongo
/*
 * __block_write_off --
 *	Write a buffer into a block, returning the block's offset, size and
 * checksum.
 */
static int
__block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump,
    bool data_checksum, bool checkpoint_io, bool caller_locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	wt_off_t offset;
	size_t align_size;
	uint32_t checksum;
	bool local_locked;

	*offsetp = 0;			/* -Werror=maybe-uninitialized */
	*sizep = 0;			/* -Werror=maybe-uninitialized */
	*checksump = 0;			/* -Werror=maybe-uninitialized */

	fh = block->fh;

	/*
	 * Clear the block header to ensure all of it is initialized, even the
	 * unused fields.
	 */
	blk = WT_BLOCK_HEADER_REF(buf->mem);
	memset(blk, 0, sizeof(*blk));

	/* Buffers should be aligned for writing. */
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL,
		    "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Align the size to an allocation unit.
	 *
	 * The buffer must be big enough for us to zero to the next allocsize
	 * boundary, this is one of the reasons the btree layer must find out
	 * from the block-manager layer the maximum size of the eventual write.
	 */
	align_size = WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer incorrectly allocated");
	}
	if (align_size > UINT32_MAX) {
		WT_ASSERT(session, align_size <= UINT32_MAX);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer too large to write");
	}

	/* Zero out any unused bytes at the end of the buffer. */
	memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

	/*
	 * Set the disk size so we don't have to incrementally read blocks
	 * during salvage.
	 */
	blk->disk_size = WT_STORE_SIZE(align_size);

	/*
	 * Update the block's checksum: if our caller specifies, checksum the
	 * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP
	 * bytes.  The assumption is applications with good compression support
	 * turn off checksums and assume corrupted blocks won't decompress
	 * correctly.  However, if compression failed to shrink the block, the
	 * block wasn't compressed, in which case our caller will tell us to
	 * checksum the data to detect corruption. If compression succeeded,
	 * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes
	 * because they're not compressed, both to give salvage a quick test
	 * of whether a block is useful and to give us a test so we don't lose
	 * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
	 *
	 * Checksum a little-endian version of the header, and write everything
	 * in little-endian format. The checksum is (potentially) returned in a
	 * big-endian format, swap it into place in a separate step.
	 */
	blk->flags = 0;
	if (data_checksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);
	blk->checksum = 0;
	__wt_block_header_byteswap(blk);
	blk->checksum = checksum = __wt_checksum(
	    buf->mem, data_checksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
#ifdef WORDS_BIGENDIAN
	blk->checksum = __wt_bswap32(blk->checksum);
#endif

	/* Pre-allocate some number of extension structures. */
	WT_RET(__wt_block_ext_prealloc(session, 5));

	/*
	 * Acquire a lock, if we don't already hold one.
	 * Allocate space for the write, and optionally extend the file (note
	 * the block-extend function may release the lock).
	 * Release any locally acquired lock.
	 */
	local_locked = false;
	if (!caller_locked) {
		__wt_spin_lock(session, &block->live_lock);
		local_locked = true;
	}
	ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);
	if (ret == 0)
		ret = __wt_block_extend(
		    session, block, fh, offset, align_size, &local_locked);
	if (local_locked)
		__wt_spin_unlock(session, &block->live_lock);
	WT_RET(ret);

	/* Write the block. */
	if ((ret =
	    __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
		if (!caller_locked)
			__wt_spin_lock(session, &block->live_lock);
		WT_TRET(__wt_block_off_free(
		    session, block, offset, (wt_off_t)align_size));
		if (!caller_locked)
			__wt_spin_unlock(session, &block->live_lock);
		WT_RET(ret);
	}

	/*
	 * Optionally schedule writes for dirty pages in the system buffer
	 * cache, but only if the current session can wait.
	 */
	if (block->os_cache_dirty_max != 0 &&
	    fh->written > block->os_cache_dirty_max &&
	    __wt_session_can_wait(session)) {
		fh->written = 0;
		if ((ret = __wt_fsync(session, fh, false)) != 0) {
			 /*
			  * Ignore ENOTSUP, but don't try again.
			  */
			if (ret != ENOTSUP)
				return (ret);
			block->os_cache_dirty_max = 0;
		}
	}

	/* Optionally discard blocks from the buffer cache. */
	WT_RET(__wt_block_discard(session, block, align_size));

	WT_STAT_CONN_INCR(session, block_write);
	WT_STAT_CONN_INCRV(session, block_byte_write, align_size);
	if (checkpoint_io)
		WT_STAT_CONN_INCRV(
		    session, block_byte_write_checkpoint, align_size);

	__wt_verbose(session, WT_VERB_WRITE,
	    "off %" PRIuMAX ", size %" PRIuMAX ", checksum %#" PRIx32,
	    (uintmax_t)offset, (uintmax_t)align_size, checksum);

	*offsetp = offset;
	*sizep = WT_STORE_SIZE(align_size);
	*checksump = checksum;

	return (0);
}
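
Before the write, the buffer is padded out to the block manager's allocation unit and the padding is zeroed so no stray memory contents reach disk, which is also why the btree layer must ask the block manager up front for the maximum size of the eventual write. A minimal sketch of the round-up-and-zero step; align_up and pad_for_write are illustrative stand-ins for WiredTiger's WT_ALIGN macro and the surrounding code:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Round size up to the next multiple of align (align is a power of two). */
static size_t
align_up(size_t size, size_t align)
{
	return ((size + align - 1) & ~(align - 1));
}

/*
 * pad_for_write --
 *	Grow the logical size to an allocation-unit boundary and zero the
 * trailing bytes; the caller must have allocated at least the aligned size.
 */
static size_t
pad_for_write(void *mem, size_t size, size_t allocsize)
{
	size_t align_size = align_up(size, allocsize);

	memset((uint8_t *)mem + size, 0, align_size - size);
	return (align_size);
}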