Example 1
/*
 * __wt_cond_signal --
 *	Signal a waiting thread.
 */
int
__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
{
	WT_DECL_RET;
	int locked;

	locked = 0;

	/*
	 * !!!
	 * This function MUST handle a NULL session handle.
	 */
	if (session != NULL && WT_VERBOSE_ISSET(session, mutex))
		WT_RET(__wt_verbose(
		    session, "signal %s cond (%p)", cond->name, cond));

	/* Fast path if already signalled. */
	if (cond->waiters == -1)
		return (0);

	if (cond->waiters > 0 || !WT_ATOMIC_CAS(cond->waiters, 0, -1)) {
		WT_ERR(pthread_mutex_lock(&cond->mtx));
		locked = 1;
		WT_ERR(pthread_cond_broadcast(&cond->cond));
	}

err:	if (locked)
		WT_TRET(pthread_mutex_unlock(&cond->mtx));
	if (ret == 0)
		return (0);
	WT_RET_MSG(session, ret, "pthread_cond_broadcast");
}
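All of the examples on this page share the same pattern: check WT_VERBOSE_ISSET() first so the cost of building a verbose message is only paid when that category is enabled, and propagate any error from the message call. Below is a minimal, self-contained sketch of that pattern in plain C; the session structure, flag name, and helper are hypothetical stand-ins, not WiredTiger code.

#include <stdio.h>

/* Hypothetical stand-ins for the session and its verbose-flag word. */
#define	VERB_MUTEX	0x1u
#define	VERBOSE_ISSET(s, f)						\
	((s) != NULL && ((s)->verbose_flags & (f)) != 0)

struct session {
	unsigned verbose_flags;
};

/*
 * cond_signal_verbose --
 *	Emit a trace message only when the "mutex" category is enabled,
 * returning non-zero on error, as the guarded calls above do.
 */
static int
cond_signal_verbose(struct session *s, const char *name)
{
	if (VERBOSE_ISSET(s, VERB_MUTEX) &&
	    printf("signal %s cond\n", name) < 0)
		return (1);
	return (0);
}

int
main(void)
{
	struct session s = { VERB_MUTEX };

	return (cond_signal_verbose(&s, "example"));
}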
Example 2
/*
 * __wt_rec_track_init --
 *	Initialize the page's list of tracked objects when reconciliation
 * starts.
 */
int
__wt_rec_track_init(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	if (WT_VERBOSE_ISSET(session, reconcile))
		WT_RET(__track_dump(session, page, "reconcile init"));
	return (0);
}
Example 3
/*
 * __wt_ovfl_reuse_search --
 *	Search the page's list of overflow records for a match.
 */
int
__wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page,
    uint8_t **addrp, size_t *addr_sizep,
    const void *value, size_t value_size)
{
	WT_OVFL_REUSE **head, *reuse;

	*addrp = NULL;
	*addr_sizep = 0;

	if (page->modify->ovfl_track == NULL)
		return (0);

	head = page->modify->ovfl_track->ovfl_reuse;

	/*
	 * The search function returns the first matching record in the list
	 * which does not have the in-use flag set, or NULL.
	 */
	if ((reuse = __ovfl_reuse_skip_search(head, value, value_size)) == NULL)
		return (0);

	*addrp = WT_OVFL_REUSE_ADDR(reuse);
	*addr_sizep = reuse->addr_size;
	F_SET(reuse, WT_OVFL_REUSE_INUSE);

	if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
		WT_RET(__ovfl_reuse_verbose(session, page, reuse, "reclaim"));
	return (1);
}
Example 4
/*
 * __wt_cond_signal --
 *	Signal a waiting thread.
 */
int
__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
{
	WT_DECL_RET;
	int locked;

	locked = 0;

	/*
	 * !!!
	 * This function MUST handle a NULL session handle.
	 */
	if (session != NULL && WT_VERBOSE_ISSET(session, mutex))
		WT_RET(__wt_verbose(
		    session, "signal %s cond (%p)", cond->name, cond));

	WT_ERR(pthread_mutex_lock(&cond->mtx));
	locked = 1;

	if (!cond->signalled) {
		cond->signalled = 1;
		WT_ERR(pthread_cond_signal(&cond->cond));
	}

err:	if (locked)
		WT_TRET(pthread_mutex_unlock(&cond->mtx));
	if (ret == 0)
		return (0);
	WT_RET_MSG(session, ret, "pthread_cond_signal");
}
Example 5
/*
 * __wt_ovfl_txnc_add --
 *	Add a new entry to the page's list of transaction-cached overflow
 * records.
 */
int
__wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page,
    const uint8_t *addr, size_t addr_size,
    const void *value, size_t value_size)
{
	WT_OVFL_TXNC **head, **stack[WT_SKIP_MAXDEPTH], *txnc;
	size_t size;
	u_int i, skipdepth;
	uint8_t *p;

	if (page->modify->ovfl_track == NULL)
		WT_RET(__ovfl_track_init(session, page));

	head = page->modify->ovfl_track->ovfl_txnc;

	/* Choose a skiplist depth for this insert. */
	skipdepth = __wt_skip_choose_depth(session);

	/*
	 * Allocate the WT_OVFL_TXNC structure, next pointers for the skip
	 * list, room for the address and value, then copy everything into
	 * place.
	 *
	 * To minimize the WT_OVFL_TXNC structure size, the address offset
	 * and size are single bytes: that's safe because the address follows
	 * the structure (which can't be more than about 100B), and address
	 * cookies are limited to 255B.
	 */
	size = sizeof(WT_OVFL_TXNC) +
	    skipdepth * sizeof(WT_OVFL_TXNC *) + addr_size + value_size;
	WT_RET(__wt_calloc(session, 1, size, &txnc));
	p = (uint8_t *)txnc +
	    sizeof(WT_OVFL_TXNC) + skipdepth * sizeof(WT_OVFL_TXNC *);
	txnc->addr_offset = (uint8_t)WT_PTRDIFF(p, txnc);
	txnc->addr_size = (uint8_t)addr_size;
	memcpy(p, addr, addr_size);
	p += addr_size;
	txnc->value_offset = WT_PTRDIFF32(p, txnc);
	txnc->value_size = WT_STORE_SIZE(value_size);
	memcpy(p, value, value_size);
	txnc->current = __wt_txn_new_id(session);

	__wt_cache_page_inmem_incr(
	    session, page, WT_OVFL_SIZE(txnc, WT_OVFL_TXNC));

	/* Insert the new entry into the skiplist. */
	__ovfl_txnc_skip_search_stack(head, stack, addr, addr_size);
	for (i = 0; i < skipdepth; ++i) {
		txnc->next[i] = *stack[i];
		*stack[i] = txnc;
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
		WT_RET(__ovfl_txnc_verbose(session, page, txnc, "add"));

	return (0);
}
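In the insert above, the structure, its variable-depth skiplist pointers, the address cookie, and the value are all carved out of a single calloc'd block, and the address and value are found again through byte offsets stored in the header. The following is a minimal, self-contained sketch of that layout; the struct entry type, its field names, and the sizes are hypothetical, chosen only to illustrate the offset arithmetic.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* A hypothetical header; the address and value live in the same allocation. */
struct entry {
	uint8_t addr_offset;		/* Byte offset of the address cookie. */
	uint8_t addr_size;
	uint32_t value_offset;		/* Byte offset of the value. */
	uint32_t value_size;
	struct entry *next[1];		/* Variable-depth skiplist pointers. */
};

#define	ENTRY_ADDR(e)	((uint8_t *)(e) + (e)->addr_offset)
#define	ENTRY_VALUE(e)	((uint8_t *)(e) + (e)->value_offset)

int
main(void)
{
	struct entry *e;
	size_t depth, hdr, size;
	const char addr[] = "\x01\x02\x03", value[] = "hello";
	uint8_t *p;

	depth = 3;			/* Chosen skiplist depth. */
	hdr = sizeof(struct entry) + (depth - 1) * sizeof(struct entry *);
	size = hdr + sizeof(addr) + sizeof(value);
	if ((e = calloc(1, size)) == NULL)
		return (1);

	/* Copy the address cookie and the value in after the header. */
	p = (uint8_t *)e + hdr;
	e->addr_offset = (uint8_t)(p - (uint8_t *)e);
	e->addr_size = (uint8_t)sizeof(addr);
	memcpy(p, addr, sizeof(addr));
	p += sizeof(addr);
	e->value_offset = (uint32_t)(p - (uint8_t *)e);
	e->value_size = (uint32_t)sizeof(value);
	memcpy(p, value, sizeof(value));

	printf("addr bytes %u, value \"%s\"\n",
	    (unsigned)e->addr_size, (char *)ENTRY_VALUE(e));
	free(e);
	return (0);
}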
Example 6
/*
 * __recovery_set_checkpoint_timestamp --
 *	Set the checkpoint timestamp as retrieved from the metadata file.
 */
static int
__recovery_set_checkpoint_timestamp(WT_RECOVERY *r)
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	wt_timestamp_t ckpt_timestamp;
	char ts_string[WT_TS_INT_STRING_SIZE], *sys_config;

	sys_config = NULL;

	session = r->session;
	conn = S2C(session);
	/*
	 * Read the system checkpoint information from the metadata file and
	 * save the stable timestamp of the last checkpoint for later query.
	 * This gets saved in the connection.
	 */
	ckpt_timestamp = 0;

	/* Search in the metadata for the system information. */
	WT_ERR_NOTFOUND_OK(
	    __wt_metadata_search(session, WT_SYSTEM_CKPT_URI, &sys_config));
	if (sys_config != NULL) {
		WT_CLEAR(cval);
		WT_ERR_NOTFOUND_OK(__wt_config_getones(
		    session, sys_config, "checkpoint_timestamp", &cval));
		if (cval.len != 0) {
			__wt_verbose(session, WT_VERB_RECOVERY,
			    "Recovery timestamp %.*s",
			    (int)cval.len, cval.str);
			WT_ERR(__wt_txn_parse_timestamp_raw(session,
			    "recovery", &ckpt_timestamp, &cval));
		}
	}

	/*
	 * Set the recovery checkpoint timestamp and the metadata checkpoint
	 * timestamp so that the checkpoint after recovery writes the correct
	 * value into the metadata.
	 */
	conn->txn_global.meta_ckpt_timestamp =
	    conn->txn_global.recovery_timestamp = ckpt_timestamp;

	if (WT_VERBOSE_ISSET(session,
	    WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS)) {
		__wt_timestamp_to_string(
		    conn->txn_global.recovery_timestamp, ts_string);
		__wt_verbose(session,
		    WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS,
		    "Set global recovery timestamp: %s", ts_string);
	}
err:	__wt_free(session, sys_config);
	return (ret);
}
Example 7
/*
 * __wt_block_compact_skip --
 *	Return if compaction will shrink the file.
 */
int
__wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp)
{
	WT_DECL_RET;
	WT_EXT *ext;
	WT_EXTLIST *el;
	WT_FH *fh;
	off_t avail, ninety;

	*skipp = 1;				/* Return a default skip. */

	fh = block->fh;

	/*
	 * We do compaction by copying blocks from the end of the file to the
	 * beginning of the file, and we need some metrics to decide if it's
	 * worth doing.  Ignore small files, and files where we are unlikely
	 * to recover 10% of the file.
	 */
	if (fh->size <= 10 * 1024)
		return (0);

	__wt_spin_lock(session, &block->live_lock);

	if (WT_VERBOSE_ISSET(session, compact))
		WT_ERR(__block_dump_avail(session, block));

	/* Sum the number of available bytes in the first 90% of the file. */
	avail = 0;
	ninety = fh->size - fh->size / 10;

	el = &block->live.avail;
	WT_EXT_FOREACH(ext, el->off)
		if (ext->off < ninety)
			avail += ext->size;

	/*
	 * If at least 10% of the total file is available and in the first 90%
	 * of the file, we'll try compaction.
	 */
	if (avail >= fh->size / 10)
		*skipp = 0;

	WT_VERBOSE_ERR(session, compact,
	    "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first "
	    "90%% of the file, require 10%% or %" PRIuMAX "MB (%" PRIuMAX
	    ") to perform compaction, compaction %s",
	    block->name,
	    (uintmax_t)avail / WT_MEGABYTE, (uintmax_t)avail,
	    (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10,
	    *skipp ? "skipped" : "proceeding");

err:	__wt_spin_unlock(session, &block->live_lock);

	return (ret);
}
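The decision above boils down to simple arithmetic over the avail extent list: sum the available bytes whose offset falls in the first 90% of the file and compare the total against 10% of the file size. A minimal, self-contained sketch of that check follows; the ext array and the function name are hypothetical, not the WiredTiger types.

#include <stdint.h>
#include <stdio.h>

struct ext {
	int64_t off, size;		/* Available extent: offset, length. */
};

/*
 * compact_would_skip --
 *	Return non-zero if compaction should be skipped: the file is small,
 * or less than 10% of it is available in the first 90% of the file.
 */
static int
compact_would_skip(int64_t file_size, const struct ext *avail, int n)
{
	int64_t ninety, total;
	int i;

	if (file_size <= 10 * 1024)
		return (1);

	ninety = file_size - file_size / 10;
	for (total = 0, i = 0; i < n; ++i)
		if (avail[i].off < ninety)
			total += avail[i].size;

	return (total >= file_size / 10 ? 0 : 1);
}

int
main(void)
{
	struct ext avail[] = { { 4096, 65536 }, { 262144, 131072 } };

	printf("skip=%d\n",
	    compact_would_skip(1024 * 1024, avail, 2));
	return (0);
}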
Example 8
/*
 * __wt_rec_track_ovfl_reuse --
 *	Search for a matching overflow record and reactivate it.
 */
int
__wt_rec_track_ovfl_reuse(
    WT_SESSION_IMPL *session, WT_PAGE *page,
    const void *data, uint32_t data_size,
    uint8_t **addrp, uint32_t *addr_sizep, int *foundp)
{
	WT_PAGE_MODIFY *mod;
	WT_PAGE_TRACK *track;
	uint32_t i;

	*foundp = 0;

	mod = page->modify;
	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i) {
		/* Ignore empty slots */
		if (!F_ISSET(track, WT_TRK_OBJECT))
			continue;

		/*
		 * Ignore discarded objects, objects already in-use, or cached
		 * overflow values.  We don't care about whether or not the
		 * object came from a page, we can re-use objects from the page
		 * or objects created in a previous reconciliation.
		 */
		if (F_ISSET(track,
		    WT_TRK_DISCARD | WT_TRK_INUSE | WT_TRK_OVFL_VALUE))
			continue;

		/*
		 * Ignore objects without data (must be block objects).  This is
		 * not really necessary (presumably, our caller is matching on a
		 * non-zero-length data item), but paranoia is healthy.
		 */
		if (track->data == NULL)
			continue;

		/* Check to see if the data matches. */
		if (track->size != data_size ||
		    memcmp(data, track->data, data_size) != 0)
			continue;

		/*
		 * Reactivate the record.
		 * Return the block addr/size pair to our caller.
		 */
		F_SET(track, WT_TRK_INUSE);
		*addrp = track->addr.addr;
		*addr_sizep = track->addr.size;
		*foundp = 1;
		if (WT_VERBOSE_ISSET(session, reconcile))
			WT_RET(__track_msg(
			    session, page, "reactivate overflow", track));
		return (0);
	}
	return (0);
}
Example 9
/*
 * __ovfl_txnc_wrapup --
 *	Resolve the page's transaction-cache list.
 */
static int
__ovfl_txnc_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_OVFL_TXNC **e, **head, *txnc;
	uint64_t oldest_txn;
	size_t decr;
	int i;

	head = page->modify->ovfl_track->ovfl_txnc;

	/*
	 * Take a snapshot of the oldest transaction ID we need to keep alive.
	 * Since we do two passes through entries in the structure, the normal
	 * visibility check could give different results as the global ID moves
	 * forward.
	 */
	oldest_txn = __wt_txn_oldest_id(session);

	/*
	 * Discard any transaction-cache records with transaction IDs earlier
	 * than any in the system.
	 *
	 * First, walk the overflow transaction-cache skip lists (except for
	 * the lowest level), fixing up links.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
		for (e = &head[i]; (txnc = *e) != NULL;) {
			if (WT_TXNID_LE(oldest_txn, txnc->current)) {
				e = &txnc->next[i];
				continue;
			}
			*e = txnc->next[i];
		}

	/* Second, discard any no longer needed transaction-cache records. */
	decr = 0;
	for (e = &head[0]; (txnc = *e) != NULL;) {
		if (WT_TXNID_LE(oldest_txn, txnc->current)) {
			e = &txnc->next[0];
			continue;
		}
		*e = txnc->next[0];

		if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
			WT_RET(
			    __ovfl_txnc_verbose(session, page, txnc, "free"));

		decr += WT_OVFL_SIZE(txnc, WT_OVFL_TXNC);
		__wt_free(session, txnc);
	}

	if (decr != 0)
		__wt_cache_page_inmem_decr(session, page, decr);
	return (0);
}
Example 10
/*
 * __ovfl_reuse_wrapup_err --
 *	Resolve the page's overflow reuse list after an error occurs.
 */
static int
__ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_OVFL_REUSE **e, **head, *reuse;
	size_t decr;
	int i;

	bm = S2BT(session)->bm;
	head = page->modify->ovfl_track->ovfl_reuse;

	/*
	 * Discard any overflow records that were just added, freeing underlying
	 * blocks.
	 *
	 * First, walk the overflow reuse lists (except for the lowest one),
	 * fixing up skiplist links.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
		for (e = &head[i]; (reuse = *e) != NULL;) {
			if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) {
				e = &reuse->next[i];
				continue;
			}
			*e = reuse->next[i];
		}

	/*
	 * Second, discard any overflow record with a just-added flag, clear the
	 * flags for the next run.
	 */
	decr = 0;
	for (e = &head[0]; (reuse = *e) != NULL;) {
		if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) {
			F_CLR(reuse, WT_OVFL_REUSE_INUSE);
			e = &reuse->next[0];
			continue;
		}
		*e = reuse->next[0];

		if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
			WT_RET(
			    __ovfl_reuse_verbose(session, page, reuse, "free"));

		WT_TRET(bm->free(
		    bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
		decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE);
		__wt_free(session, reuse);
	}

	if (decr != 0)
		__wt_cache_page_inmem_decr(session, page, decr);
	return (0);
}
Example 11
/*
 * __ovfl_txnc_wrapup --
 *	Resolve the page's transaction-cache list.
 */
static int
__ovfl_txnc_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_OVFL_TXNC **e, **head, *txnc;
	size_t decr;
	int i;

	head = page->modify->ovfl_track->ovfl_txnc;

	/*
	 * Discard any transaction-cache records with transaction IDs earlier
	 * than any in the system.
	 *
	 * First, walk the overflow transaction-cache skip lists (except for
	 * the lowest level), fixing up links.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
		for (e = &head[i]; *e != NULL;) {
			if (!__wt_txn_visible_all(session, (*e)->current)) {
				e = &(*e)->next[i];
				continue;
			}
			*e = (*e)->next[i];
		}

	/* Second, discard any no longer needed transaction-cache records. */
	decr = 0;
	for (e = &head[0]; (txnc = *e) != NULL;) {
		if (!__wt_txn_visible_all(session, txnc->current)) {
			e = &(*e)->next[0];
			continue;
		}
		*e = (*e)->next[0];

		decr += WT_OVFL_SIZE(WT_OVFL_TXNC) +
		    txnc->addr_size + txnc->value_size;

		if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
			WT_RET(
			    __ovfl_txnc_verbose(session, page, txnc, "free"));
		__wt_free(session, txnc);
	}

	if (decr != 0)
		__wt_cache_page_inmem_decr(session, page, decr);
	return (0);
}
Example 12
/*
 * __wt_ovfl_discard_add --
 *	Add a new entry to the page's list of overflow records that have been
 * discarded.
 */
int
__wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell)
{
	WT_OVFL_TRACK *track;

	if (page->modify->ovfl_track == NULL)
		WT_RET(__ovfl_track_init(session, page));

	track = page->modify->ovfl_track;
	WT_RET(__wt_realloc_def(session, &track->discard_allocated,
	    track->discard_entries + 1, &track->discard));
	track->discard[track->discard_entries++] = cell;

	if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
		WT_RET(__ovfl_discard_verbose(session, page, cell, "add"));

	return (0);
}
Example 13
/*
 * __ovfl_discard_wrapup --
 *	Resolve the page's overflow discard list after a page is written.
 */
static int
__ovfl_discard_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_CELL **cellp;
	WT_DECL_RET;
	WT_OVFL_TRACK *track;
	uint32_t i;

	track = page->modify->ovfl_track;
	for (i = 0, cellp = track->discard;
	    i < track->discard_entries; ++i, ++cellp) {
		if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
			WT_RET(__ovfl_discard_verbose(
			    session, page, *cellp, "free"));

		/* Discard each cell's overflow item. */
		WT_RET(__wt_ovfl_discard(session, *cellp));
	}

	__wt_free(session, track->discard);
	track->discard_entries = track->discard_allocated = 0;

	return (ret);
}
Example 14
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	struct timespec end, start;
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
	uint64_t oldest_id, saved_snap_min;
	uint32_t flags;

	conn = S2C(session);
	btree = S2BT(session);
	walk = NULL;
	txn = &session->txn;
	saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min;
	flags = WT_READ_CACHE | WT_READ_NO_GEN;

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
		WT_RET(__wt_epoch(session, &start));

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		/*
		 * Save the oldest transaction ID we need to keep around.
		 * Otherwise, in a busy system, we could be updating pages so
		 * fast that write leaves never catches up.  We deliberately
		 * have no transaction running at this point that would keep
		 * the oldest ID from moving forwards as we walk the tree.
		 */
		oldest_id = __wt_txn_oldest_id(session);

		flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write hot pages (defined as pages that have
			 * been updated since the write phase leaves started):
			 * checkpoint will have to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session, walk, NULL, 0));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * If we are flushing a file at read-committed isolation, which
		 * is of particular interest for flushing the metadata to make
		 * schema-changing operation durable, get a transactional
		 * snapshot now.
		 *
		 * All changes committed up to this point should be included.
		 * We don't update the snapshot in between pages because (a)
		 * the metadata shouldn't be that big, and (b) if we do ever
		 */
		if (txn->isolation == WT_ISO_READ_COMMITTED)
			__wt_txn_get_snapshot(session);

		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.  We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * In the final checkpoint pass, child pages cannot be evicted
		 * from underneath internal pages nor can underlying blocks be
		 * freed until the checkpoint's block lists are stable. Also,
		 * we cannot split child pages into parents unless we know the
		 * final pass will write a consistent view of that namespace.
		 * Set the checkpointing flag to block such actions and wait for
		 * any problematic eviction or page splits to complete.
		 */
		WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE);

		WT_ERR(__wt_evict_file_exclusive_on(session));
		__wt_evict_file_exclusive_off(session);

		WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING);

		/* Write all dirty in-cache pages. */
		flags |= WT_READ_NO_EVICT;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/* Skip clean pages. */
			if (!__wt_page_is_modified(walk->page))
				continue;

			/*
			 * Take a local reference to the page modify structure
			 * now that we know the page is dirty. It needs to be
			 * done in this order otherwise the page modify
			 * structure could have been created between taking the
			 * reference and checking modified.
			 */
			page = walk->page;
			mod = page->modify;

			/*
			 * Write dirty pages, unless we can be sure they only
			 * became dirty after the checkpoint started.
			 *
			 * We can skip dirty pages if:
			 * (1) they are leaf pages;
			 * (2) there is a snapshot transaction active (which
			 *     is the case in ordinary application checkpoints
			 *     but not all internal cases); and
			 * (3) the first dirty update on the page is
			 *     sufficiently recent that the checkpoint
			 *     transaction would skip them.
			 *
			 * Mark the tree dirty: the checkpoint marked it clean
			 * and we can't skip future checkpoints until this page
			 * is written.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
			    WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
				__wt_page_modify_set(session, page);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}
			WT_ERR(__wt_reconcile(session, walk, NULL, 0));
		}
		break;
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
	WT_ILLEGAL_VALUE_ERR(session);
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_ERR(__wt_epoch(session, &end));
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of internal\n\t"
		    "Took: %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_bytes, leaf_pages, internal_bytes, internal_pages,
		    WT_TIMEDIFF_MS(end, start)));
	}

err:	/* On error, clear any left-over tree walk. */
	if (walk != NULL)
		WT_TRET(__wt_page_release(session, walk, flags));

	/*
	 * If we got a snapshot in order to write pages, and there was no
	 * snapshot active when we started, release it.
	 */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    saved_snap_min == WT_TXN_NONE)
		__wt_txn_release_snapshot(session);

	if (btree->checkpointing != WT_CKPT_OFF) {
		/*
		 * Update the checkpoint generation for this handle so visible
		 * updates newer than the checkpoint can be evicted.
		 *
		 * This has to be published before eviction is enabled again,
		 * so that eviction knows that the checkpoint has completed.
		 */
		WT_PUBLISH(btree->checkpoint_gen,
		    conn->txn_global.checkpoint_gen);
		WT_STAT_FAST_DATA_SET(session,
		    btree_checkpoint_generation, btree->checkpoint_gen);

		/*
		 * Clear the checkpoint flag and push the change; not required,
		 * but publishing the change means stalled eviction gets moving
		 * as soon as possible.
		 */
		btree->checkpointing = WT_CKPT_OFF;
		WT_FULL_BARRIER();

		/*
		 * If this tree was being skipped by the eviction server during
		 * the checkpoint, clear the wait.
		 */
		btree->evict_walk_period = 0;

		/*
		 * Wake the eviction server, in case application threads have
		 * stalled while the eviction server decided it couldn't make
		 * progress.  Without this, application threads will be stalled
		 * until the eviction server next wakes.
		 */
		WT_TRET(__wt_evict_server_wake(session));
	}

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 &&
	    syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_RET(btree->bm->sync(btree->bm, session, true));

	return (ret);
}
Example 15
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, int syncop)
{
	struct timespec end, start;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, leaf_bytes;
	uint64_t internal_pages, leaf_pages;
	uint32_t flags;
	bool evict_reset;

	btree = S2BT(session);

	flags = WT_READ_CACHE | WT_READ_NO_GEN;
	walk = NULL;
	txn = &session->txn;

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
		WT_RET(__wt_epoch(session, &start));

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write the hottest pages: checkpoint will have
			 * to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    __wt_txn_visible_all(
			    session, page->modify->update_txn)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session, walk, NULL, 0));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.  We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * When internal pages are being reconciled by checkpoint their
		 * child pages cannot disappear from underneath them or be split
		 * into them, nor can underlying blocks be freed until the block
		 * lists for the checkpoint are stable.  Set the checkpointing
		 * flag to block eviction of dirty pages until the checkpoint's
		 * internal page pass is complete, then wait for any existing
		 * eviction to complete.
		 */
		btree->checkpointing = 1;
		WT_FULL_BARRIER();

		WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
		if (evict_reset)
			__wt_evict_file_exclusive_off(session);

		/* Write all dirty in-cache pages. */
		flags |= WT_READ_NO_EVICT;
		for (walk = NULL;;) {
			/*
			 * If we have a page, and it was ever modified, track
			 * the highest transaction ID in the tree.  We do this
			 * here because we want the value after reconciling
			 * dirty pages.
			 */
			if (walk != NULL && walk->page != NULL &&
			    (mod = walk->page->modify) != NULL &&
			    WT_TXNID_LT(btree->rec_max_txn, mod->rec_max_txn))
				btree->rec_max_txn = mod->rec_max_txn;

			WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
			if (walk == NULL)
				break;

			page = walk->page;
			mod = page->modify;

			/* Skip clean pages. */
			if (!__wt_page_is_modified(page))
				continue;

			/*
			 * Write dirty pages, unless we can be sure they only
			 * became dirty after the checkpoint started.
			 *
			 * We can skip dirty pages if:
			 * (1) they are leaf pages;
			 * (2) there is a snapshot transaction active (which
			 *     is the case in ordinary application checkpoints
			 *     but not all internal cases); and
			 * (3) the first dirty update on the page is
			 *     sufficiently recent that the checkpoint
			 *     transaction would skip them.
			 *
			 * Mark the tree dirty: the checkpoint marked it clean
			 * and we can't skip future checkpoints until this page
			 * is written.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
			    WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn) &&
			    mod->rec_result != WT_PM_REC_REWRITE) {
				__wt_page_modify_set(session, page);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}
			WT_ERR(__wt_reconcile(session, walk, NULL, 0));
		}
		break;
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_ERR(__wt_epoch(session, &end));
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of internal\n\t"
		    "Took: %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_bytes, leaf_pages, internal_bytes, internal_pages,
		    WT_TIMEDIFF(end, start) / WT_MILLION));
	}

err:	/* On error, clear any left-over tree walk. */
	if (walk != NULL)
		WT_TRET(__wt_page_release(session, walk, flags));

	if (txn->isolation == WT_ISO_READ_COMMITTED && session->ncursors == 0)
		__wt_txn_release_snapshot(session);

	if (btree->checkpointing) {
		/*
		 * Update the checkpoint generation for this handle so visible
		 * updates newer than the checkpoint can be evicted.
		 *
		 * This has to be published before eviction is enabled again,
		 * so that eviction knows that the checkpoint has completed.
		 */
		WT_PUBLISH(btree->checkpoint_gen,
		    S2C(session)->txn_global.checkpoint_gen);
		WT_STAT_FAST_DATA_SET(session,
		    btree_checkpoint_generation, btree->checkpoint_gen);

		/*
		 * Clear the checkpoint flag and push the change; not required,
		 * but publishing the change means stalled eviction gets moving
		 * as soon as possible.
		 */
		btree->checkpointing = 0;
		WT_FULL_BARRIER();

		/*
		 * If this tree was being skipped by the eviction server during
		 * the checkpoint, clear the wait.
		 */
		btree->evict_walk_period = 0;

		/*
		 * Wake the eviction server, in case application threads have
		 * stalled while the eviction server decided it couldn't make
		 * progress.  Without this, application threads will be stalled
		 * until the eviction server next wakes.
		 */
		WT_TRET(__wt_evict_server_wake(session));
	}

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES)
		WT_RET(btree->bm->sync(btree->bm, session, true));

	return (ret);
}
Example 16
/*
 * __open_verbose --
 *	Optionally output a verbose message on handle open.
 */
static inline int
__open_verbose(
    WT_SESSION_IMPL *session, const char *name, int file_type, u_int flags)
{
#ifdef HAVE_VERBOSE
	WT_DECL_RET;
	WT_DECL_ITEM(tmp);
	const char *file_type_tag, *sep;

	if (!WT_VERBOSE_ISSET(session, WT_VERB_FILEOPS))
		return (0);

	/*
	 * It's useful to track file opens when debugging platforms, take some
	 * effort to output good tracking information.
	 */

	switch (file_type) {
	case WT_FS_OPEN_FILE_TYPE_CHECKPOINT:
		file_type_tag = "checkpoint";
		break;
	case WT_FS_OPEN_FILE_TYPE_DATA:
		file_type_tag = "data";
		break;
	case WT_FS_OPEN_FILE_TYPE_DIRECTORY:
		file_type_tag = "directory";
		break;
	case WT_FS_OPEN_FILE_TYPE_LOG:
		file_type_tag = "log";
		break;
	case WT_FS_OPEN_FILE_TYPE_REGULAR:
		file_type_tag = "regular";
		break;
	default:
		file_type_tag = "unknown open type";
		break;
	}

	WT_RET(__wt_scr_alloc(session, 0, &tmp));
	sep = " (";
#define	WT_FS_OPEN_VERBOSE_FLAG(f, name)				\
	if (LF_ISSET(f)) {						\
		WT_ERR(__wt_buf_catfmt(					\
		    session, tmp, "%s%s", sep, name));			\
		sep = ", ";						\
	}

	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_CREATE, "create");
	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_DIRECTIO, "direct-IO");
	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_EXCLUSIVE, "exclusive");
	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_FIXED, "fixed");
	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_READONLY, "readonly");

	if (tmp->size != 0)
		WT_ERR(__wt_buf_catfmt(session, tmp, ")"));

	__wt_verbose(session, WT_VERB_FILEOPS,
	    "%s: file-open: type %s%s",
	    name, file_type_tag, tmp->size == 0 ? "" : (char *)tmp->data);

err:	__wt_scr_free(session, &tmp);
	return (ret);
#else
	WT_UNUSED(session);
	WT_UNUSED(name);
	WT_UNUSED(file_type);
	WT_UNUSED(flags);
	return (0);
#endif
}
Example 17
File: txn.c Project: To4e/mongo
/*
 * __wt_txn_update_oldest --
 *	Sweep the running transactions to update the oldest ID required.
 * !!!
 * If a data-source is calling the WT_EXTENSION_API.transaction_oldest
 * method (for the oldest transaction ID not yet visible to a running
 * transaction), and then comparing that oldest ID against committed
 * transactions to see if updates for a committed transaction are still
 * visible to running transactions, the oldest transaction ID may be
 * the same as the last committed transaction ID, if the transaction
 * state wasn't refreshed after the last transaction committed.  Push
 * past the last committed transaction.
*/
void
__wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
{
	WT_CONNECTION_IMPL *conn;
	WT_SESSION_IMPL *oldest_session;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_STATE *s;
	uint64_t current_id, id, last_running, oldest_id, prev_oldest_id;
	uint32_t i, session_cnt;
	int32_t count;
	int last_running_moved;

	conn = S2C(session);
	txn_global = &conn->txn_global;

	current_id = last_running = txn_global->current;
	oldest_session = NULL;
	prev_oldest_id = txn_global->oldest_id;

	/*
	 * For pure read-only workloads, or if the update isn't forced and the
	 * oldest ID isn't too far behind, avoid scanning.
	 */
	if (prev_oldest_id == current_id ||
	    (!force && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
		return;

	/*
	 * We're going to scan.  Increment the count of scanners to prevent the
	 * oldest ID from moving forwards.  Spin if the count is negative,
	 * which indicates that some thread is moving the oldest ID forwards.
	 */
	do {
		if ((count = txn_global->scan_count) < 0)
			WT_PAUSE();
	} while (count < 0 ||
	    !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));

	/* The oldest ID cannot change until the scan count goes to zero. */
	prev_oldest_id = txn_global->oldest_id;
	current_id = oldest_id = last_running = txn_global->current;

	/* Walk the array of concurrent transactions. */
	WT_ORDERED_READ(session_cnt, conn->session_cnt);
	for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
		/*
		 * Update the oldest ID.
		 *
		 * Ignore: IDs older than the oldest ID we saw. This can happen
		 * if we race with a thread that is allocating an ID -- the ID
		 * will not be used because the thread will keep spinning until
		 * it gets a valid one.
		 */
		if ((id = s->id) != WT_TXN_NONE &&
		    WT_TXNID_LE(prev_oldest_id, id) &&
		    WT_TXNID_LT(id, last_running))
			last_running = id;

		/*
		 * !!!
		 * Note: Don't ignore snap_min values older than the previous
		 * oldest ID.  Read-uncommitted operations publish snap_min
		 * values without incrementing scan_count to protect the global
		 * table.  See the comment in __wt_txn_cursor_op for
		 * more details.
		 */
		if ((id = s->snap_min) != WT_TXN_NONE &&
		    WT_TXNID_LT(id, oldest_id)) {
			oldest_id = id;
			oldest_session = &conn->sessions[i];
		}
	}

	if (WT_TXNID_LT(last_running, oldest_id))
		oldest_id = last_running;

	/* The oldest ID can't move past any named snapshots. */
	if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE &&
	    WT_TXNID_LT(id, oldest_id))
		oldest_id = id;

	/* Update the last running ID. */
	last_running_moved =
	    WT_TXNID_LT(txn_global->last_running, last_running);

	/* Update the oldest ID. */
	if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) &&
	    WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) {
		WT_ORDERED_READ(session_cnt, conn->session_cnt);
		for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
			if ((id = s->id) != WT_TXN_NONE &&
			    WT_TXNID_LT(id, last_running))
				last_running = id;
			if ((id = s->snap_min) != WT_TXN_NONE &&
			    WT_TXNID_LT(id, oldest_id))
				oldest_id = id;
		}

		if (WT_TXNID_LT(last_running, oldest_id))
			oldest_id = last_running;

#ifdef HAVE_DIAGNOSTIC
		/*
		 * Make sure the ID doesn't move past any named snapshots.
		 *
		 * Don't include the read/assignment in the assert statement.
		 * Coverity complains if there are assignments only done in
		 * diagnostic builds, and when the read is from a volatile.
		 */
		id = txn_global->nsnap_oldest_id;
		WT_ASSERT(session,
		    id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
#endif
		if (WT_TXNID_LT(txn_global->last_running, last_running))
			txn_global->last_running = last_running;
		if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
			txn_global->oldest_id = oldest_id;
		WT_ASSERT(session, txn_global->scan_count == -1);
		txn_global->scan_count = 0;
	} else {
		if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
		    current_id - oldest_id > 10000 && last_running_moved &&
		    oldest_session != NULL) {
			(void)__wt_verbose(session, WT_VERB_TRANSACTION,
			    "old snapshot %" PRIu64
			    " pinned in session %d [%s]"
			    " with snap_min %" PRIu64 "\n",
			    oldest_id, oldest_session->id,
			    oldest_session->lastop,
			    oldest_session->txn.snap_min);
		}
		WT_ASSERT(session, txn_global->scan_count > 0);
		(void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
	}
}
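The function above coordinates through txn_global->scan_count: scanners atomically increment it, a negative value means some thread is publishing a new oldest ID, and the publisher gains exclusive access by compare-and-swapping 1 to -1. A minimal sketch of that protocol using C11 atomics follows; the function names are hypothetical and this is not the WiredTiger implementation.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int scan_count;		/* Negative: a publisher holds it. */

/* Enter a scan: spin while a publisher is active, then increment. */
static void
scan_enter(void)
{
	int count;

	do {
		while ((count = atomic_load(&scan_count)) < 0)
			;
	} while (!atomic_compare_exchange_weak(&scan_count, &count, count + 1));
}

/* The last remaining scanner may swap 1 -> -1 to publish exclusively. */
static int
try_publish(void)
{
	int expected = 1;

	return (atomic_compare_exchange_strong(&scan_count, &expected, -1));
}

/* Leave: a publisher resets the count to zero, a plain scanner decrements. */
static void
scan_exit(int published)
{
	if (published)
		atomic_store(&scan_count, 0);
	else
		atomic_fetch_sub(&scan_count, 1);
}

int
main(void)
{
	scan_enter();
	if (try_publish()) {
		/* ... update the oldest ID here ... */
		scan_exit(1);
	} else
		scan_exit(0);
	printf("scan_count=%d\n", atomic_load(&scan_count));
	return (0);
}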
Example 18
/*
 * __wt_rec_track --
 *	Add an object to the page's list of tracked objects.
 */
int
__wt_rec_track(WT_SESSION_IMPL *session, WT_PAGE *page,
    const uint8_t *addr, uint32_t addr_size,
    const void *data, uint32_t data_size, uint32_t flags)
{
	WT_PAGE_MODIFY *mod;
	WT_PAGE_TRACK *empty, *track;
	uint8_t *p;
	uint32_t i;

	mod = page->modify;

	/* Find an empty slot. */
	empty = NULL;
	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i)
		if (!F_ISSET(track, WT_TRK_OBJECT)) {
			empty = track;
			break;
		}

	/* Reallocate space as necessary. */
	if (empty == NULL) {
		WT_RET(__rec_track_extend(session, page));
		empty = &mod->track[mod->track_entries - 1];
	}
	track = empty;

	/*
	 * Minor optimization: allocate a single chunk of space instead of two
	 * separate ones: be careful when it's freed.
	 */
	WT_RET(__wt_calloc_def(session, addr_size + data_size, &p));

	/*
	 * Set the just-added flag so we clean up should reconciliation fail,
	 * except for cached overflow values, which don't get discarded, even
	 * if reconciliation fails.
	 */
	track->flags = (uint8_t)flags | WT_TRK_OBJECT;
	if (!F_ISSET(track, WT_TRK_OVFL_VALUE))
		F_SET(track, WT_TRK_JUST_ADDED);
	track->addr.addr = p;
	track->addr.size = addr_size;
	memcpy(track->addr.addr, addr, addr_size);
	if (data_size) {
		p += addr_size;
		track->data = p;
		track->size = data_size;
		memcpy(track->data, data, data_size);
	}

	/*
	 * Overflow items are potentially large and on-page items remain in the
	 * tracking list until the page is evicted.  If we're tracking a lot of
	 * them, their memory might matter: increment the page and cache memory
	 * totals.   This is unlikely to matter, but it's inexpensive (unless
	 * there are lots of them, in which case I guess the memory matters).
	 *
	 * If this reconciliation were to fail, we would reasonably perform the
	 * inverse operation in __wt_rec_track_wrapup_err.  I'm not bothering
	 * with that because we'd have to crack the structure itself to figure
	 * out how much to decrement and I don't think it's worth the effort.
	 * The potential problem is repeatedly failing reconciliation of a page
	 * with a large number of overflow items, which causes the page's
	 * memory footprint to become incorrectly high, causing us to push the
	 * page out of cache unnecessarily.  Like I said, not worth the effort.
	 */
	if (LF_ISSET(WT_TRK_ONPAGE))
		__wt_cache_page_inmem_incr(
		    session, page, addr_size + data_size);

	if (WT_VERBOSE_ISSET(session, reconcile))
		WT_RET(__track_msg(session, page, "add", track));
	return (0);
}
Example 19
/*
 * __wt_lsm_merge --
 *	Merge a set of chunks of an LSM tree.
 */
int
__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
{
	WT_BLOOM *bloom;
	WT_CURSOR *dest, *src;
	WT_DECL_RET;
	WT_ITEM key, value;
	WT_LSM_CHUNK *chunk;
	uint32_t generation;
	uint64_t insert_count, record_count;
	u_int dest_id, end_chunk, i, nchunks, start_chunk, start_id, verb;
	int tret;
	bool created_chunk, create_bloom, locked, in_sync;
	const char *cfg[3];
	const char *drop_cfg[] =
	    { WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL };

	bloom = NULL;
	chunk = NULL;
	dest = src = NULL;
	start_id = 0;
	created_chunk = create_bloom = locked = in_sync = false;

	/* Fast path if it's obvious no merges could be done. */
	if (lsm_tree->nchunks < lsm_tree->merge_min &&
	    lsm_tree->merge_aggressiveness < WT_LSM_AGGRESSIVE_THRESHOLD)
		return (WT_NOTFOUND);

	/*
	 * Use the lsm_tree lock to read the chunks (so no switches occur), but
	 * avoid holding it while the merge is in progress: that may take a
	 * long time.
	 */
	WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = true;

	WT_ERR(__lsm_merge_span(session,
	    lsm_tree, id, &start_chunk, &end_chunk, &record_count));
	nchunks = (end_chunk + 1) - start_chunk;

	WT_ASSERT(session, nchunks > 0);
	start_id = lsm_tree->chunk[start_chunk]->id;

	/* Find the merge generation. */
	for (generation = 0, i = 0; i < nchunks; i++)
		generation = WT_MAX(generation,
		    lsm_tree->chunk[start_chunk + i]->generation + 1);

	WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
	locked = false;

	/* Allocate an ID for the merge. */
	dest_id = __wt_atomic_add32(&lsm_tree->last, 1);

	/*
	 * We only want to do the chunk loop if we're running with verbose,
	 * so we wrap these statements in the conditional.  Avoid the loop
	 * in the normal path.
	 */
	if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
		    "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
		    ", generation %" PRIu32,
		    lsm_tree->name,
		    start_chunk, end_chunk, dest_id, record_count, generation));
		for (verb = start_chunk; verb <= end_chunk; verb++)
			WT_ERR(__wt_verbose(session, WT_VERB_LSM,
			    "Merging %s: Chunk[%u] id %u, gen: %" PRIu32
			    ", size: %" PRIu64 ", records: %" PRIu64,
			    lsm_tree->name, verb, lsm_tree->chunk[verb]->id,
			    lsm_tree->chunk[verb]->generation,
			    lsm_tree->chunk[verb]->size,
			    lsm_tree->chunk[verb]->count));
	}

	WT_ERR(__wt_calloc_one(session, &chunk));
	created_chunk = true;
	chunk->id = dest_id;

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
	    (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
	    start_chunk > 0) && record_count > 0)
		create_bloom = true;

	/*
	 * Special setup for the merge cursor:
	 * first, reset to open the dependent cursors;
	 * then restrict the cursor to a specific number of chunks;
	 * then set MERGE so the cursor doesn't track updates to the tree.
	 */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));

	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
	WT_ERR(ret);
	if (create_bloom) {
		WT_ERR(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

		WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
		    lsm_tree->bloom_config,
		    record_count, lsm_tree->bloom_bit_count,
		    lsm_tree->bloom_hash_count, &bloom));
	}

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
	cfg[1] = "bulk,raw,skip_sort_check";
	cfg[2] = NULL;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));

#define	LSM_MERGE_CHECK_INTERVAL	WT_THOUSAND
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
			if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
				WT_ERR(EINTR);

			WT_STAT_FAST_CONN_INCRV(session,
			    lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
			++lsm_tree->merge_progressing;
		}

		WT_ERR(src->get_key(src, &key));
		dest->set_key(dest, &key);
		WT_ERR(src->get_value(src, &value));
		dest->set_value(dest, &value);
		WT_ERR(dest->insert(dest));
		if (create_bloom)
			WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);

	WT_STAT_FAST_CONN_INCRV(session,
	    lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
	++lsm_tree->merge_progressing;
	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
	    "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.",
	    record_count, insert_count));

	/*
	 * Closing and syncing the files can take a while.  Set the
	 * merge_syncing field so that compact knows it is still in
	 * progress.
	 */
	(void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1);
	in_sync = true;
	/*
	 * We've successfully created the new chunk.  Now install it.  We need
	 * to ensure that the NO_CACHE flag is cleared and the bloom filter
	 * is closed (even if a step fails), so track errors but don't return
	 * until we've cleaned up.
	 */
	WT_TRET(src->close(src));
	WT_TRET(dest->close(dest));
	src = dest = NULL;

	F_CLR(session, WT_SESSION_NO_CACHE);

	/*
	 * We're doing advisory reads to fault the new trees into cache.
	 * Don't block if the cache is full: our next unit of work may be to
	 * discard some trees to free space.
	 */
	F_SET(session, WT_SESSION_NO_EVICTION);

	if (create_bloom) {
		if (ret == 0)
			WT_TRET(__wt_bloom_finalize(bloom));

		/*
		 * Read in a key to make sure the Bloom filters btree handle is
		 * open before it becomes visible to application threads.
		 * Otherwise application threads will stall while it is opened
		 * and internal pages are read into cache.
		 */
		if (ret == 0) {
			WT_CLEAR(key);
			WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
		}

		WT_TRET(__wt_bloom_close(bloom));
		bloom = NULL;
	}
	WT_ERR(ret);

	/*
	 * Open a handle on the new chunk before application threads attempt
	 * to access it, opening it pre-loads internal pages into the file
	 * system cache.
	 */
	cfg[1] = "checkpoint=" WT_CHECKPOINT;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
	WT_TRET(dest->close(dest));
	dest = NULL;
	++lsm_tree->merge_progressing;
	(void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
	in_sync = false;
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = true;

	/*
	 * Check whether we raced with another merge, and adjust the chunk
	 * array offset as necessary.
	 */
	if (start_chunk >= lsm_tree->nchunks ||
	    lsm_tree->chunk[start_chunk]->id != start_id)
		for (start_chunk = 0;
		    start_chunk < lsm_tree->nchunks;
		    start_chunk++)
			if (lsm_tree->chunk[start_chunk]->id == start_id)
				break;

	/*
	 * It is safe to error out here - since the update can only fail
	 * prior to making updates to the tree.
	 */
	WT_ERR(__wt_lsm_merge_update_tree(
	    session, lsm_tree, start_chunk, nchunks, chunk));

	if (create_bloom)
		F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	chunk->count = insert_count;
	chunk->generation = generation;
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);

	/*
	 * We have no current way of continuing if the metadata update fails,
	 * so we will panic in that case.  Put some effort into cleaning up
	 * after ourselves here - so things have a chance of shutting down.
	 *
	 * Any errors that happened after the tree was locked are
	 * fatal - we can't guarantee the state of the tree.
	 */
	if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
		WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");

	lsm_tree->dsk_gen++;

	/* Update the throttling while holding the tree lock. */
	__wt_lsm_tree_throttle(session, lsm_tree, true);

	/* Schedule a pass to discard old chunks */
	WT_ERR(__wt_lsm_manager_push_entry(
	    session, WT_LSM_WORK_DROP, 0, lsm_tree));

err:	if (locked)
		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	if (in_sync)
		(void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
	if (src != NULL)
		WT_TRET(src->close(src));
	if (dest != NULL)
		WT_TRET(dest->close(dest));
	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	if (ret != 0 && created_chunk) {
		/* Drop the newly-created files on error. */
		if (chunk->uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session, tret =
			    __wt_schema_drop(session, chunk->uri, drop_cfg));
			WT_TRET(tret);
		}
		if (create_bloom && chunk->bloom_uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session,
			    tret = __wt_schema_drop(
			    session, chunk->bloom_uri, drop_cfg));
			WT_TRET(tret);
		}
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, chunk);

		if (ret == EINTR)
			WT_TRET(__wt_verbose(session, WT_VERB_LSM,
			    "Merge aborted due to close"));
		else
			WT_TRET(__wt_verbose(session, WT_VERB_LSM,
			    "Merge failed with %s",
			   __wt_strerror(session, ret, NULL, 0)));
	}
	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	return (ret);
}
Example 20
/*
 * __wt_block_checkpoint_load --
 *	Load a checkpoint.
 */
int
__wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
    const uint8_t *addr, size_t addr_size,
    uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint)
{
	WT_BLOCK_CKPT *ci, _ci;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint8_t *endp;

	ci = NULL;

	/*
	 * Sometimes we don't find a root page (we weren't given a checkpoint,
	 * or the checkpoint was empty).  In that case we return an empty root
	 * address, set that up now.
	 */
	*root_addr_sizep = 0;

#ifdef HAVE_VERBOSE
	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		if (addr != NULL) {
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__ckpt_string(session, block, addr, tmp));
		}
		__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "%s: load-checkpoint: %s", block->name,
		    addr == NULL ? "[Empty]" : (const char *)tmp->data);
	}
#endif

	/*
	 * There's a single checkpoint in the file that can be written, all of
	 * the others are read-only.  We use the same initialization calls for
	 * readonly checkpoints, but the information doesn't persist.
	 */
	if (checkpoint) {
		ci = &_ci;
		WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
	} else {
		/*
		 * We depend on the btree level for locking: things will go bad
		 * fast if we open the live system in two handles, or salvage,
		 * truncate or verify the live/running file.
		 */
#ifdef HAVE_DIAGNOSTIC
		__wt_spin_lock(session, &block->live_lock);
		WT_ASSERT(session, block->live_open == false);
		block->live_open = true;
		__wt_spin_unlock(session, &block->live_lock);
#endif
		ci = &block->live;
		WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
	}

	/*
	 * If the checkpoint has an on-disk root page, load it.  Otherwise, size
	 * the file past the description information.
	 */
	if (addr == NULL || addr_size == 0)
		ci->file_size = block->allocsize;
	else {
		/* Crack the checkpoint cookie. */
		WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));

		/* Verify sets up next. */
		if (block->verify)
			WT_ERR(__wt_verify_ckpt_load(session, block, ci));

		/* Read any root page. */
		if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
			endp = root_addr;
			WT_ERR(__wt_block_addr_to_buffer(block, &endp,
			    ci->root_offset, ci->root_size, ci->root_checksum));
			*root_addr_sizep = WT_PTRDIFF(endp, root_addr);
		}

		/*
		 * Rolling a checkpoint forward requires the avail list, the
		 * blocks from which we can allocate.
		 */
		if (!checkpoint)
			WT_ERR(__wt_block_extlist_read_avail(
			    session, block, &ci->avail, ci->file_size));
	}

	/*
	 * If the checkpoint can be written, that means anything written after
	 * the checkpoint is no longer interesting, truncate the file.  Don't
	 * bother checking the avail list for a block at the end of the file,
	 * that was done when the checkpoint was first written (re-writing the
	 * checkpoint might possibly make it relevant here, but it's unlikely
	 * enough I don't bother).
	 */
	if (!checkpoint)
		WT_ERR(__wt_block_truncate(session, block, ci->file_size));

	if (0) {
err:		/*
		 * Don't call checkpoint-unload: unload does real work including
		 * file truncation.  If we fail early enough that the checkpoint
		 * information isn't correct, bad things would happen.  The only
		 * allocated memory was in the service of verify, clean that up.
		 */
		if (block->verify)
			WT_TRET(__wt_verify_ckpt_unload(session, block));
	}

	/* Checkpoints don't need the original information, discard it. */
	if (checkpoint && ci != NULL)
		__wt_block_ckpt_destroy(session, ci);

	__wt_scr_free(session, &tmp);
	return (ret);
}
Example 21
/*
 * __ckpt_process --
 *	Process the list of checkpoints.
 */
static int
__ckpt_process(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
	WT_BLOCK_CKPT *a, *b, *ci;
	WT_CKPT *ckpt, *next_ckpt;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint64_t ckpt_size;
	int deleting, locked;

	ci = &block->live;
	locked = 0;

	/*
	 * We've allocated our last page, update the checkpoint size.  We need
	 * to calculate the live system's checkpoint size before reading and
	 * merging checkpoint allocation and discard information from the
	 * checkpoints we're deleting, those operations change the underlying
	 * byte counts.
	 */
	ckpt_size = ci->ckpt_size;
	ckpt_size += ci->alloc.bytes;
	ckpt_size -= ci->discard.bytes;

	/*
	 * Extents newly available as a result of deleting previous checkpoints
	 * are added to a list of extents.  The list should be empty, but there
	 * is no explicit "free the checkpoint information" call into the block
	 * manager; if there was an error in an upper level resulting in some
	 * previous checkpoint never being resolved, the list may not be empty.
	 *
	 * XXX
	 * This isn't sufficient, actually: we're going to leak all the blocks
	 * written as part of the last checkpoint because it was never resolved.
	 */
	__wt_block_extlist_free(session, &ci->ckpt_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &ci->ckpt_avail, "live", "ckpt_avail"));

	/*
	 * To delete a checkpoint, we'll need checkpoint information for it and
	 * the subsequent checkpoint into which it gets rolled; read them from
	 * disk before we lock things down.
	 */
	deleting = 0;
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;
		deleting = 1;

		/*
		 * Read the checkpoint and next checkpoint extent lists if we
		 * haven't already read them (we may have already read these
		 * extent blocks if there is more than one deleted checkpoint).
		 */
		if (ckpt->bpriv == NULL)
			WT_ERR(__ckpt_extlist_read(session, block, ckpt));

		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * The "next" checkpoint may be the live tree which has no
		 * extent blocks to read.
		 */
		if (next_ckpt->bpriv == NULL &&
		    !F_ISSET(next_ckpt, WT_CKPT_ADD))
			WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
	}

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if checkpoints take too
	 * much time away from real work: we read the historic checkpoint
	 * information without a lock, but we could also merge and re-write the
	 * delete checkpoint information without a lock, except for ranges
	 * merged into the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = 1;

	/* Skip the additional processing if we aren't deleting checkpoints. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed checkpoints: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;

		if (WT_VERBOSE_ISSET(session, ckpt)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__ckpt_string(
			    session, block, ckpt->raw.data, tmp));
			WT_VERBOSE_ERR(session, ckpt,
			    "%s: delete-checkpoint: %s: %s",
			    block->name, ckpt->name, (char *)tmp->data);
		}

		/*
		 * Find the checkpoint into which we'll roll this checkpoint's
		 * blocks: it's the next real checkpoint in the list, and it
		 * better have been read in (if it's not the add slot).
		 */
		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * Set the from/to checkpoint structures, where the "to" value
		 * may be the live tree.
		 */
		a = ckpt->bpriv;
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			b = &block->live;
		else
			b = next_ckpt->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the checkpoint's discard list, however, not the live system's
		 * list because it appears on the checkpoint's alloc list and so
		 * must be paired in the checkpoint.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" checkpoint's extent
		 * lists, including the avail list.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * checkpoint's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->discard, &b->discard));

		/*
		 * If the "to" checkpoint is also being deleted, we're done with
		 * it, it's merged into some other checkpoint in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * checkpoints, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" checkpoint's
		 * allocate and discard lists overlap, move the range to
		 * the live system's checkpoint available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			continue;

		/*
		 * We have to write the "to" checkpoint's extent lists out in
		 * new blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" checkpoint's extent
		 * lists; don't include the avail list, it's not changing.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));

		F_SET(next_ckpt, WT_CKPT_UPDATE);
	}

	/* Update checkpoints marked for update. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_UPDATE)) {
			WT_ASSERT(session, !F_ISSET(ckpt, WT_CKPT_ADD));
			WT_ERR(__ckpt_update(
			    session, block, ckpt, ckpt->bpriv, 0, 0));
		}

live_update:
	ci = &block->live;

	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));

	/* Update the final, added checkpoint based on the live system. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_ADD)) {
			WT_ERR(__ckpt_update(
			    session, block, ckpt, ci, ckpt_size, 1));

			/*
			 * XXX
			 * Our caller wants the final checkpoint size.  Setting
			 * the size here violates layering, but the alternative
			 * is a call for the btree layer to crack the checkpoint
			 * cookie into its components, and that's a fair amount
			 * of work.
			 */
			ckpt->ckpt_size = ci->ckpt_size;
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.
	 */
	__wt_block_extlist_free(session, &ci->alloc);
	WT_ERR(__wt_block_extlist_init(session, &ci->alloc, "live", "alloc"));
	__wt_block_extlist_free(session, &ci->discard);
	WT_ERR(
	    __wt_block_extlist_init(session, &ci->discard, "live", "discard"));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first checkpoint in the system should always have an empty
	 * discard list.  If we've read that checkpoint and/or created it,
	 * check.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (!F_ISSET(ckpt, WT_CKPT_DELETE))
			break;
	if ((a = ckpt->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0) {
		__wt_errx(session,
		    "first checkpoint incorrectly has blocks on the discard "
		    "list");
		WT_ERR(WT_ERROR);
	}
#endif

err:	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/* Discard any checkpoint information we loaded. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if ((ci = ckpt->bpriv) != NULL)
			__wt_block_ckpt_destroy(session, ci);

	__wt_scr_free(&tmp);
	return (ret);
}
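
The loop above repeatedly skips fake checkpoint slots and rolls each deleted checkpoint's extents into the next real checkpoint. Below is a minimal standalone sketch of that scan-and-merge pattern; the struct, flag names, and byte counters are hypothetical stand-ins, not the WT_CKPT structures.

#include <stdio.h>

#define FLAG_FAKE	0x1
#define FLAG_DELETE	0x2
#define FLAG_ADD	0x4	/* final "add" slot stands in for the live tree */

struct ckpt {			/* hypothetical stand-in for WT_CKPT */
	const char *name;
	unsigned flags;
	unsigned long alloc_bytes;	/* bytes on the alloc list */
};

int
main(void)
{
	struct ckpt list[] = {
		{ "ckpt.1", FLAG_DELETE, 100 },
		{ "ckpt.2", FLAG_FAKE,     0 },
		{ "ckpt.3", 0,           200 },
		{ "add",    FLAG_ADD,      0 },
	};
	struct ckpt *ckpt, *next, *end;

	end = list + sizeof(list) / sizeof(list[0]);
	for (ckpt = list; ckpt < end; ++ckpt) {
		if ((ckpt->flags & FLAG_FAKE) || !(ckpt->flags & FLAG_DELETE))
			continue;

		/* Find the next real checkpoint (never a fake one). */
		for (next = ckpt + 1; next->flags & FLAG_FAKE; ++next)
			;

		/* Roll the deleted checkpoint's blocks into it. */
		next->alloc_bytes += ckpt->alloc_bytes;
		ckpt->alloc_bytes = 0;
		printf("rolled %s into %s\n", ckpt->name, next->name);
	}
	return (0);
}
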
Example no. 22
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *prev, *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
	uint64_t oldest_id, saved_pinned_id, time_start, time_stop;
	uint32_t flags;
	bool timer, tried_eviction;

	conn = S2C(session);
	btree = S2BT(session);
	prev = walk = NULL;
	txn = &session->txn;
	tried_eviction = false;
	time_start = time_stop = 0;

	/* Only visit pages in cache and don't bump page read generations. */
	flags = WT_READ_CACHE | WT_READ_NO_GEN;

	/*
	 * Skip all deleted pages.  For a page to be marked deleted, it must
	 * have been evicted from cache and marked clean.  Checkpoint should
	 * never instantiate deleted pages: if a truncate is not visible to the
	 * checkpoint, the on-disk version is correct.  If the truncate is
	 * visible, we skip over the child page when writing its parent.  We
	 * check whether a truncate is visible in the checkpoint as part of
	 * reconciling internal pages (specifically in __rec_child_modify).
	 */
	LF_SET(WT_READ_DELETED_SKIP);

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
	timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT);
	if (timer)
		time_start = __wt_clock(session);

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		/*
		 * Save the oldest transaction ID we need to keep around.
		 * Otherwise, in a busy system, we could be updating pages so
		 * fast that write leaves never catches up.  We deliberately
		 * have no transaction running at this point that would keep
		 * the oldest ID from moving forwards as we walk the tree.
		 */
		oldest_id = __wt_txn_oldest_id(session);

		LF_SET(WT_READ_NO_WAIT | WT_READ_SKIP_INTL);
		for (;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write hot pages (defined as pages that have
			 * been updated since the write phase leaves started):
			 * checkpoint will have to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session,
				    walk, NULL, WT_REC_CHECKPOINT, NULL));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * If we are flushing a file at read-committed isolation, which
		 * is of particular interest for flushing the metadata to make
		 * a schema-changing operation durable, get a transactional
		 * snapshot now.
		 *
		 * All changes committed up to this point should be included.
		 * We don't update the snapshot in between pages because the
		 * metadata shouldn't have many pages.  Instead, read-committed
		 * isolation ensures that all metadata updates completed before
		 * the checkpoint are included.
		 */
		if (txn->isolation == WT_ISO_READ_COMMITTED)
			__wt_txn_get_snapshot(session);

		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.  We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * In the final checkpoint pass, child pages cannot be evicted
		 * from underneath internal pages nor can underlying blocks be
		 * freed until the checkpoint's block lists are stable. Also,
		 * we cannot split child pages into parents unless we know the
		 * final pass will write a consistent view of that namespace.
		 * Set the checkpointing flag to block such actions and wait for
		 * any problematic eviction or page splits to complete.
		 */
		WT_ASSERT(session, btree->syncing == WT_BTREE_SYNC_OFF &&
		    btree->sync_session == NULL);

		btree->sync_session = session;
		btree->syncing = WT_BTREE_SYNC_WAIT;
		(void)__wt_gen_next_drain(session, WT_GEN_EVICT);
		btree->syncing = WT_BTREE_SYNC_RUNNING;

		/* Write all dirty in-cache pages. */
		LF_SET(WT_READ_NO_EVICT);

		/* Read pages with lookaside entries and evict them asap. */
		LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED);

		for (;;) {
			WT_ERR(__sync_dup_walk(session, walk, flags, &prev));
			WT_ERR(__wt_tree_walk(session, &walk, flags));

			if (walk == NULL)
				break;

			/*
			 * Skip clean pages, but need to make sure maximum
			 * transaction ID is always updated.
			 */
			if (!__wt_page_is_modified(walk->page)) {
				if (((mod = walk->page->modify) != NULL) &&
				    mod->rec_max_txn > btree->rec_max_txn)
					btree->rec_max_txn = mod->rec_max_txn;
				if (mod != NULL &&
				    btree->rec_max_timestamp <
				    mod->rec_max_timestamp)
					btree->rec_max_timestamp =
					    mod->rec_max_timestamp;
				continue;
			}

			/*
			 * Take a local reference to the page modify structure
			 * now that we know the page is dirty. It needs to be
			 * done in this order otherwise the page modify
			 * structure could have been created between taking the
			 * reference and checking modified.
			 */
			page = walk->page;

			/*
			 * Write dirty pages, if we can't skip them. If we skip
			 * a page, mark the tree dirty. The checkpoint marked it
			 * clean and we can't skip future checkpoints until this
			 * page is written.
			 */
			if (__sync_checkpoint_can_skip(session, page)) {
				__wt_tree_modify_set(session);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}

			/*
			 * If the page was pulled into cache by our read, try
			 * to evict it now.
			 *
			 * For eviction to have a chance, we first need to move
			 * the walk point to the next page checkpoint will
			 * visit.  We want to avoid this code being too special
			 * purpose, so try to reuse the ordinary eviction path.
			 *
			 * Regardless of whether eviction succeeds or fails,
			 * the walk continues from the previous location.  We
			 * remember whether we tried eviction, and don't try
			 * again.  Even if eviction fails (the page may stay in
			 * cache clean but with history that cannot be
			 * discarded), that is not wasted effort because
			 * checkpoint doesn't need to write the page again.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    page->read_gen == WT_READGEN_WONT_NEED &&
			    !tried_eviction) {
				WT_ERR_BUSY_OK(
				    __wt_page_release_evict(session, walk));
				walk = prev;
				prev = NULL;
				tried_eviction = true;
				continue;
			}
			tried_eviction = false;

			WT_ERR(__wt_reconcile(
			    session, walk, NULL, WT_REC_CHECKPOINT, NULL));

			/*
			 * Update checkpoint IO tracking data if configured
			 * to log verbose progress messages.
			 */
			if (conn->ckpt_timer_start.tv_sec > 0) {
				conn->ckpt_write_bytes +=
				    page->memory_footprint;
				++conn->ckpt_write_pages;

				/* Periodically log checkpoint progress. */
				if (conn->ckpt_write_pages % 5000 == 0)
					__wt_checkpoint_progress(
					    session, false);
			}
		}
		break;
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
		WT_ERR(__wt_illegal_value(session, syncop));
		break;
	}

	if (timer) {
		time_stop = __wt_clock(session);
		__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote: %" PRIu64
		    " leaf pages (%" PRIu64 "B), %" PRIu64
		    " internal pages (%" PRIu64 "B), and took %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_pages, leaf_bytes, internal_pages, internal_bytes,
		    WT_CLOCKDIFF_MS(time_stop, time_start));
	}

err:	/* On error, clear any left-over tree walk. */
	WT_TRET(__wt_page_release(session, walk, flags));
	WT_TRET(__wt_page_release(session, prev, flags));

	/*
	 * If we got a snapshot in order to write pages, and there was no
	 * snapshot active when we started, release it.
	 */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    saved_pinned_id == WT_TXN_NONE)
		__wt_txn_release_snapshot(session);

	/* Clear the checkpoint flag. */
	btree->syncing = WT_BTREE_SYNC_OFF;
	btree->sync_session = NULL;

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 &&
	    syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_RET(btree->bm->sync(btree->bm, session, false));

	return (ret);
}
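
The WT_SYNC_CHECKPOINT pass above keeps a handle on the previous walk position (prev) so that, if it evicts the page it is sitting on, the walk can resume from the prior position. Below is a toy sketch of that idea on a plain singly-linked list; the node type and eviction flag are hypothetical, not the WiredTiger tree walk.

#include <stdio.h>
#include <stdlib.h>

struct node {			/* hypothetical stand-in for a tree-walk position */
	int id;
	int evict;		/* non-zero: drop this node during the walk */
	struct node *next;
};

int
main(void)
{
	struct node *head = NULL, *prev, *walk, *tmp;
	int i;

	/* Build a small list 1..5 and mark node 3 for eviction. */
	for (i = 5; i >= 1; --i) {
		tmp = malloc(sizeof(*tmp));
		tmp->id = i;
		tmp->evict = (i == 3);
		tmp->next = head;
		head = tmp;
	}

	for (prev = NULL, walk = NULL;;) {
		/* Advance the walk (from the list head the first time). */
		walk = (walk == NULL) ? head : walk->next;
		if (walk == NULL)
			break;

		if (walk->evict) {
			/*
			 * Drop the current node and restart the walk from the
			 * previous position (or the head if there is none).
			 */
			if (prev == NULL)
				head = walk->next;
			else
				prev->next = walk->next;
			free(walk);
			walk = prev;
			prev = NULL;
			continue;
		}

		printf("visit %d\n", walk->id);
		prev = walk;
	}

	/* Release whatever is left. */
	for (walk = head; walk != NULL; walk = tmp) {
		tmp = walk->next;
		free(walk);
	}
	return (0);
}
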
Example no. 23
/*
 * __snapshot_process --
 *	Process the list of snapshots.
 */
static int
__snapshot_process(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase)
{
	WT_BLOCK_SNAPSHOT *a, *b, *si;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_SNAPSHOT *snap;
	uint64_t snapshot_size;
	int deleting, locked;

	si = &block->live;
	locked = 0;

	/*
	 * We've allocated our last page, update the snapshot size.  We need to
	 * calculate the live system's snapshot size before reading and merging
	 * snapshot allocation and discard information from the snapshots we're
	 * deleting, those operations will change the underlying byte counts.
	 */
	snapshot_size = si->snapshot_size;
	snapshot_size += si->alloc.bytes;
	snapshot_size -= si->discard.bytes;

	/*
	 * Extents that become newly available as a result of deleting previous
	 * snapshots are added to a list of extents.  The list should be empty,
	 * but there's no explicit "free the snapshot information" call into the
	 * block manager; if there was an error in an upper level resulting in
	 * the snapshot never being "resolved", the list might not be empty.
	 *
	 * XXX
	 * This isn't sufficient, actually: we're going to leak all the blocks
	 * that were written as part of the last snapshot because it was never
	 * resolved.
	 */
	__wt_block_extlist_free(session, &si->snapshot_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &si->snapshot_avail, "live", "snapshot_avail"));

	/*
	 * To delete a snapshot, we'll need snapshot information for it, and we
	 * have to read that from the disk.
	 */
	deleting = 0;
	WT_SNAPSHOT_FOREACH(snapbase, snap) {
		/*
		 * To delete a snapshot, we'll need snapshot information for it
		 * and the subsequent snapshot.  The test is tricky, we have to
		 * load the current snapshot's information if it's marked for
		 * deletion, or if it follows a snapshot marked for deletion,
		 * where the boundary cases are the first snapshot in the list
		 * and the last snapshot in the list: if we're deleting the last
		 * snapshot in the list, there's no next snapshot, the snapshot
		 * will be merged into the live tree.
		 */
		if (!F_ISSET(snap, WT_SNAP_DELETE) &&
		    (snap == snapbase ||
		    F_ISSET(snap, WT_SNAP_ADD) ||
		    !F_ISSET(snap - 1, WT_SNAP_DELETE)))
			continue;
		deleting = 1;

		/*
		 * Allocate a snapshot structure, crack the cookie and read the
		 * snapshot's extent lists.
		 *
		 * Ignore the avail list: snapshot avail lists are only useful
		 * if we are rolling forward from the particular snapshot and
		 * they represent our best understanding of what blocks can be
		 * allocated.  If we are not operating on the live snapshot,
		 * subsequent snapshots might have allocated those blocks, and
		 * the avail list is useless.  We don't discard it, because it
		 * is useful as part of verification, but we don't re-write it
		 * either.
		 */
		WT_ERR(__wt_calloc(
		    session, 1, sizeof(WT_BLOCK_SNAPSHOT), &snap->bpriv));
		si = snap->bpriv;
		WT_ERR(__wt_block_snap_init(session, block, si, snap->name, 0));
		WT_ERR(__wt_block_buffer_to_snapshot(
		    session, block, snap->raw.data, si));
		WT_ERR(__wt_block_extlist_read(session, block, &si->alloc));
		WT_ERR(__wt_block_extlist_read(session, block, &si->discard));
	}

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if snapshots take too
	 * much time away from real work: we read historic snapshot information
	 * without a lock, but we could also merge and re-write the delete
	 * snapshot information without a lock, except for ranges merged into
	 * the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = 1;

	/* Skip the additional processing if we aren't deleting snapshots. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed snapshots: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap) {
		if (!F_ISSET(snap, WT_SNAP_DELETE))
			continue;

		if (WT_VERBOSE_ISSET(session, snapshot)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__snapshot_string(
			    session, block, snap->raw.data, tmp));
			WT_VERBOSE_ERR(session, snapshot,
			    "%s: delete-snapshot: %s: %s",
			    block->name, snap->name, (char *)tmp->data);
		}

		/*
		 * Set the from/to snapshot structures, where the "to" value
		 * may be the live tree.
		 */
		a = snap->bpriv;
		if (F_ISSET(snap + 1, WT_SNAP_ADD))
			b = &block->live;
		else
			b = (snap + 1)->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the snapshot's discard list, however, not the live system's
		 * list because it appears on the snapshot's alloc list and so
		 * must be paired in the snapshot.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" snapshot's extent
		 * lists directly to the live system's avail list, they were
		 * never on any alloc list.   Include the "from" snapshot's
		 * avail list, it's going away.
		 */
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * snapshot's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->discard, &b->discard));

		/*
		 * If the "to" snapshot is also being deleted, we're done with
		 * it, it's merged into some other snapshot in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * snapshots, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(snap + 1, WT_SNAP_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" snapshot's allocate
		 * and discard lists overlap is fair game, move ranges appearing
		 * on both lists to the live snapshot's newly available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(snap + 1, WT_SNAP_ADD))
			continue;

		/*
		 * We have to write the "to" snapshot's extent lists out in new
		 * blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" snapshot's extent lists
		 * directly to the live system's avail list, they were never on
		 * any alloc list.  Do not include the "to" snapshot's avail
		 * list, it's not changing.
		 */
		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->discard));

		F_SET(snap + 1, WT_SNAP_UPDATE);
	}

	/* Update snapshots marked for update. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (F_ISSET(snap, WT_SNAP_UPDATE)) {
			WT_ASSERT(session, !F_ISSET(snap, WT_SNAP_ADD));
			WT_ERR(__snapshot_update(
			    session, block, snap, snap->bpriv, 0, 0));
		}

live_update:
	si = &block->live;

	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &si->avail));

	/* Update the final, added snapshot based on the live system. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (F_ISSET(snap, WT_SNAP_ADD)) {
			WT_ERR(__snapshot_update(
			    session, block, snap, si, snapshot_size, 1));

			/*
			 * XXX
			 * Our caller wants two pieces of information: the time
			 * the snapshot was taken and the final snapshot size.
			 * This violates layering but the alternative is a call
			 * for the btree layer to crack the snapshot cookie into
			 * its components, and that's a fair amount of work.
			 * (We could just read the system time in the session
			 * layer when updating the metadata file, but that won't
			 * work for the snapshot size, and so we do both here.)
			 */
			snap->snapshot_size = si->snapshot_size;
			WT_ERR(__wt_epoch(session, &snap->sec, NULL));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.
	 */
	__wt_block_extlist_free(session, &si->alloc);
	WT_ERR(__wt_block_extlist_init(session, &si->alloc, "live", "alloc"));
	__wt_block_extlist_free(session, &si->discard);
	WT_ERR(
	    __wt_block_extlist_init(session, &si->discard, "live", "discard"));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first snapshot in the system should always have an empty discard
	 * list.  If we've read that snapshot and/or created it, check.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (!F_ISSET(snap, WT_SNAP_DELETE))
			break;
	if ((a = snap->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0) {
		__wt_errx(session,
		    "snapshot incorrectly has blocks on the discard list");
		WT_ERR(WT_ERROR);
	}
#endif

err:	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/* Discard any snapshot information we loaded, we no longer need it. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if ((si = snap->bpriv) != NULL) {
			__wt_block_extlist_free(session, &si->alloc);
			__wt_block_extlist_free(session, &si->avail);
			__wt_block_extlist_free(session, &si->discard);
		}

	__wt_scr_free(&tmp);
	return (ret);
}
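
__snapshot_process rolls deleted snapshots forward by merging their alloc and discard extent lists into the next snapshot's lists. A compact sketch of that kind of merge follows: two sorted, disjoint (offset, size) arrays combined into one sorted array. The array-based extent type is hypothetical; the real __wt_block_extlist_merge operates on WiredTiger's extent-list structures.

#include <stdio.h>

struct ext {			/* hypothetical extent: file offset and length */
	long off;
	long size;
};

/* Merge two sorted, disjoint extent arrays into dst; return the entry count. */
static int
ext_merge(const struct ext *a, int an,
    const struct ext *b, int bn, struct ext *dst)
{
	int ai, bi, di;

	for (ai = bi = di = 0; ai < an || bi < bn; ++di)
		if (bi >= bn || (ai < an && a[ai].off < b[bi].off))
			dst[di] = a[ai++];
		else
			dst[di] = b[bi++];
	return (di);
}

int
main(void)
{
	struct ext a[] = { { 0, 512 }, { 4096, 1024 } };
	struct ext b[] = { { 1024, 512 }, { 8192, 4096 } };
	struct ext out[4];
	int i, n;

	n = ext_merge(a, 2, b, 2, out);
	for (i = 0; i < n; ++i)
		printf("%ld/%ld\n", out[i].off, out[i].size);
	return (0);
}
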
Example no. 24
/*
 * __snapshot_update --
 *	Update a snapshot.
 */
static int
__snapshot_update(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snap,
    WT_BLOCK_SNAPSHOT *si, uint64_t snapshot_size, int is_live)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint8_t *endp;

#ifdef HAVE_DIAGNOSTIC
	/* Check the extent list combinations for overlaps. */
	WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->avail));
	WT_RET(__wt_block_extlist_check(session, &si->discard, &si->avail));
	WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->discard));
#endif
	/*
	 * Write the snapshot's extent lists; we only write an avail list for
	 * the live system, other snapshot's avail lists are static and never
	 * change.  When we do write the avail list for the live system it's
	 * two lists: the current avail list plus the list of blocks that are
	 * being made available as of the new snapshot.  We can't merge that
	 * second list into the real list yet, it's not truly available until
	 * the new snapshot location has been saved to the metadata.
	 */
	WT_RET(__wt_block_extlist_write(session, block, &si->alloc, NULL));
	if (is_live)
		WT_RET(__wt_block_extlist_write(
		    session, block, &si->avail, &si->snapshot_avail));
	WT_RET(__wt_block_extlist_write(session, block, &si->discard, NULL));

	/*
	 * Set the file size for the live system.
	 *
	 * XXX
	 * We do NOT set the file size when re-writing snapshots because we want
	 * to test the snapshot's blocks against a reasonable maximum file size
	 * during verification.  This is not good: imagine a snapshot appearing
	 * early in the file, re-written, and then the snapshot requires blocks
	 * at the end of the file, blocks after the listed file size.  If the
	 * application opens that snapshot for writing (discarding subsequent
	 * snapshots), we would truncate the file to the early chunk, discarding
	 * the re-written snapshot information.  The alternative, updating the
	 * file size has its own problems, in that case we'd work correctly, but
	 * we'd lose all of the blocks between the original snapshot and the
	 * re-written snapshot.  Currently, there's no API to roll-forward
	 * intermediate snapshots, if there ever is, this will need to be fixed.
	 */
	if (is_live)
		WT_RET(__wt_filesize(session, block->fh, &si->file_size));

	/* Set the snapshot size for the live system. */
	if (is_live)
		si->snapshot_size = snapshot_size;

	/*
	 * Copy the snapshot information into the snapshot array's address
	 * cookie.
	 */
	WT_RET(__wt_buf_init(session, &snap->raw, WT_BTREE_MAX_ADDR_COOKIE));
	endp = snap->raw.mem;
	WT_RET(__wt_block_snapshot_to_buffer(session, block, &endp, si));
	snap->raw.size = WT_PTRDIFF32(endp, snap->raw.mem);

	if (WT_VERBOSE_ISSET(session, snapshot)) {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(__snapshot_string(session, block, snap->raw.data, tmp));
		WT_VERBOSE_ERR(session, snapshot,
		    "%s: create-snapshot: %s: %s",
		    block->name, snap->name, (char *)tmp->data);
	}

err:	__wt_scr_free(&tmp);
	return (ret);
}
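
The cookie-packing step above writes snapshot fields into a raw buffer and records the used length by pointer difference. A minimal sketch of that pattern follows, with hypothetical fixed-width fields; the real address cookie uses WiredTiger's variable-length encoding.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	uint8_t buf[64], *endp;
	uint64_t root_offset = 4096, root_size = 512;
	size_t used;

	/* Pack two fixed-width fields; a real cookie packs variable-length ints. */
	endp = buf;
	memcpy(endp, &root_offset, sizeof(root_offset));
	endp += sizeof(root_offset);
	memcpy(endp, &root_size, sizeof(root_size));
	endp += sizeof(root_size);

	/* The cookie size is the distance the write pointer moved. */
	used = (size_t)(endp - buf);
	printf("cookie is %zu bytes\n", used);
	return (0);
}
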
Example no. 25
/*
 * __wt_block_snapshot_load --
 *	Load a snapshot.
 */
int
__wt_block_snapshot_load(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size,
    int readonly)
{
	WT_BLOCK_SNAPSHOT *si;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;

	WT_UNUSED(addr_size);

	/*
	 * Sometimes we don't find a root page (we weren't given a snapshot,
	 * or the referenced snapshot was empty).  In that case we return a
	 * root page size of 0.  Set that up now.
	 */
	dsk->size = 0;

	si = &block->live;
	WT_RET(__wt_block_snap_init(session, block, si, "live", 1));

	if (WT_VERBOSE_ISSET(session, snapshot)) {
		if (addr != NULL) {
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__snapshot_string(session, block, addr, tmp));
		}
		WT_VERBOSE_ERR(session, snapshot,
		    "%s: load-snapshot: %s", block->name,
		    addr == NULL ? "[Empty]" : (char *)tmp->data);
	}

	/* If not loading a snapshot from disk, we're done. */
	if (addr == NULL || addr_size == 0)
		return (0);

	/* Crack the snapshot cookie. */
	if (addr != NULL)
		WT_ERR(__wt_block_buffer_to_snapshot(session, block, addr, si));

	/* Verify sets up next. */
	if (block->verify)
		WT_ERR(__wt_verify_snap_load(session, block, si));

	/* Read, and optionally verify, any root page. */
	if (si->root_offset != WT_BLOCK_INVALID_OFFSET) {
		WT_ERR(__wt_block_read_off(session, block,
		    dsk, si->root_offset, si->root_size, si->root_cksum));
		if (block->verify) {
			if (tmp == NULL) {
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
				WT_ERR(__snapshot_string(
				    session, block, addr, tmp));
			}
			WT_ERR(
			    __wt_verify_dsk(session, (char *)tmp->data, dsk));
		}
	}

	/*
	 * Rolling a snapshot forward requires the avail list, the blocks from
	 * which we can allocate.
	 */
	if (!readonly)
		WT_ERR(__wt_block_extlist_read(session, block, &si->avail));

	/*
	 * If the snapshot can be written, that means anything written after
	 * the snapshot is no longer interesting.  Truncate the file.
	 */
	if (!readonly) {
		WT_VERBOSE_ERR(session, snapshot,
		    "truncate file to %" PRIuMAX, (uintmax_t)si->file_size);
		WT_ERR(__wt_ftruncate(session, block->fh, si->file_size));
	}

	if (0) {
err:		(void)__wt_block_snapshot_unload(session, block);
	}

	__wt_scr_free(&tmp);
	return (ret);
}
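
__wt_block_snapshot_load uses the "if (0) { err: ... }" idiom so the error path runs an unload/cleanup step the success path skips, while the code after the block runs in both cases. A small self-contained sketch of the idiom, using a hypothetical resource, follows.

#include <stdio.h>
#include <stdlib.h>

static int
open_resource(void **p, int fail)
{
	if (fail)
		return (-1);
	*p = malloc(16);
	return (*p == NULL ? -1 : 0);
}

static int
do_work(void **resp, int fail_step)
{
	void *res = NULL;
	char *scratch = NULL;
	int ret;

	*resp = NULL;

	if ((ret = open_resource(&res, fail_step == 1)) != 0)
		goto err;
	if ((scratch = malloc(32)) == NULL) {
		ret = -1;
		goto err;
	}

	/* Success: hand the resource back to the caller. */
	*resp = res;

	if (0) {
err:		/* Only the error path discards the half-opened resource. */
		free(res);
	}

	/* Both paths release the scratch buffer. */
	free(scratch);
	return (ret);
}

int
main(void)
{
	void *res;

	if (do_work(&res, 0) == 0)
		free(res);
	printf("error path returns %d\n", do_work(&res, 1));
	return (0);
}
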
Example no. 26
/*
 * __ovfl_reuse_wrapup --
 *	Resolve the page's overflow reuse list after a page is written.
 */
static int
__ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BM *bm;
	WT_OVFL_REUSE **e, **head, *reuse;
	size_t decr;
	int i;

	bm = S2BT(session)->bm;
	head = page->modify->ovfl_track->ovfl_reuse;

	/*
	 * Discard any overflow records that aren't in-use, freeing underlying
	 * blocks.
	 *
	 * First, walk the overflow reuse lists (except for the lowest one),
	 * fixing up skiplist links.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i)
		for (e = &head[i]; (reuse = *e) != NULL;) {
			if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
				e = &reuse->next[i];
				continue;
			}
			*e = reuse->next[i];
		}

	/*
	 * Second, discard any overflow record without an in-use flag, clear
	 * the flags for the next run.
	 *
	 * As part of the pass through the lowest level, figure out how much
	 * space we added/subtracted from the page, and update its footprint.
	 * We don't get it exactly correct because we don't know the depth of
	 * the skiplist here, but it's close enough, and figuring out the
	 * memory footprint change in the reconciliation wrapup code means
	 * fewer atomic updates and less code overall.
	 */
	decr = 0;
	for (e = &head[0]; (reuse = *e) != NULL;) {
		if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
			F_CLR(reuse,
			    WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED);
			e = &reuse->next[0];
			continue;
		}
		*e = reuse->next[0];

		WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED));

		if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
			WT_RET(
			    __ovfl_reuse_verbose(session, page, reuse, "free"));

		WT_RET(bm->free(
		    bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size));
		decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE);
		__wt_free(session, reuse);
	}

	if (decr != 0)
		__wt_cache_page_inmem_decr(session, page, decr);
	return (0);
}
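
Each level of the skiplist walk above unlinks entries with a pointer-to-pointer cursor, so removing the first entry needs no special case. A minimal single-level sketch with a hypothetical record type:

#include <stdio.h>
#include <stdlib.h>

#define FLAG_INUSE	0x1

struct rec {
	int id;
	unsigned flags;
	struct rec *next;
};

int
main(void)
{
	struct rec *head = NULL, **e, *r;
	int i;

	/* Build 1..4, mark the even records in-use. */
	for (i = 4; i >= 1; --i) {
		r = malloc(sizeof(*r));
		r->id = i;
		r->flags = (i % 2 == 0) ? FLAG_INUSE : 0;
		r->next = head;
		head = r;
	}

	/* Unlink and free every record without the in-use flag. */
	for (e = &head; (r = *e) != NULL;) {
		if (r->flags & FLAG_INUSE) {
			r->flags &= ~FLAG_INUSE;	/* clear for the next run */
			e = &r->next;
			continue;
		}
		*e = r->next;
		free(r);
	}

	for (r = head; r != NULL; r = r->next)
		printf("kept %d\n", r->id);

	/* Release what's left. */
	while ((r = head) != NULL) {
		head = r->next;
		free(r);
	}
	return (0);
}
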
Example no. 27
/*
 * __ckpt_process --
 *	Process the list of checkpoints.
 */
static int
__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
	WT_BLOCK_CKPT *a, *b, *ci;
	WT_CKPT *ckpt, *next_ckpt;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint64_t ckpt_size;
	bool deleting, fatal, locked;

	ci = &block->live;
	fatal = locked = false;

#ifdef HAVE_DIAGNOSTIC
	WT_RET(__ckpt_verify(session, ckptbase));
#endif

	/*
	 * Checkpoints are a two-step process: first, write a new checkpoint to
	 * disk (including all the new extent lists for modified checkpoints
	 * and the live system).  As part of this, create a list of file blocks
	 * newly available for reallocation, based on checkpoints being deleted.
	 * We then return the locations of the new checkpoint information to our
	 * caller.  Our caller has to write that information into some kind of
	 * stable storage, and once that's done, we can actually allocate from
	 * that list of newly available file blocks.  (We can't allocate from
	 * that list immediately because the allocation might happen before our
	 * caller saves the new checkpoint information, and if we crashed before
	 * the new checkpoint location was saved, we'd have overwritten blocks
	 * still referenced by checkpoints in the system.)  In summary, there is
	 * a second step: after our caller saves the checkpoint information, we
	 * are called to add the newly available blocks into the live system's
	 * available list.
	 *
	 * This function is the first step, the second step is in the resolve
	 * function.
	 *
	 * If we're called to checkpoint the same file twice (without the second
	 * resolution step), or re-entered for any reason, it's an error in our
	 * caller, and our choices are all bad: leak blocks or potentially crash
	 * with our caller not yet having saved previous checkpoint information
	 * to stable storage.
	 */
	__wt_spin_lock(session, &block->live_lock);
	if (block->ckpt_inprogress)
		ret = __wt_block_panic(session, EINVAL,
		    "%s: unexpected checkpoint ordering", block->name);
	else
		block->ckpt_inprogress = true;
	__wt_spin_unlock(session, &block->live_lock);
	WT_RET(ret);

	/*
	 * Extents newly available as a result of deleting previous checkpoints
	 * are added to a list of extents.  The list should be empty, but as
	 * described above, there is no "free the checkpoint information" call
	 * into the block manager; if there was an error in an upper level that
	 * resulted in some previous checkpoint never being resolved, the list
	 * may not be empty.  We should have caught that with the "checkpoint
	 * in progress" test, but it doesn't cost us anything to be cautious.
	 *
	 * We free the checkpoint's allocation and discard extent lists as part
	 * of the resolution step, not because they're needed at that time, but
	 * because it's potentially a lot of work, and waiting allows the btree
	 * layer to continue eviction sooner.  As for the checkpoint-available
	 * list, make sure it gets cleaned out.
	 */
	__wt_block_extlist_free(session, &ci->ckpt_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &ci->ckpt_avail, "live", "ckpt_avail", true));
	__wt_block_extlist_free(session, &ci->ckpt_alloc);
	__wt_block_extlist_free(session, &ci->ckpt_discard);

	/*
	 * To delete a checkpoint, we'll need checkpoint information for it and
	 * the subsequent checkpoint into which it gets rolled; read them from
	 * disk before we lock things down.
	 */
	deleting = false;
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;
		deleting = true;

		/*
		 * Read the checkpoint and next checkpoint extent lists if we
		 * haven't already read them (we may have already read these
		 * extent blocks if there is more than one deleted checkpoint).
		 */
		if (ckpt->bpriv == NULL)
			WT_ERR(__ckpt_extlist_read(session, block, ckpt));

		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * The "next" checkpoint may be the live tree which has no
		 * extent blocks to read.
		 */
		if (next_ckpt->bpriv == NULL &&
		    !F_ISSET(next_ckpt, WT_CKPT_ADD))
			WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
	}

	/*
	 * Failures are now fatal: we can't currently back out the merge of any
	 * deleted checkpoint extent lists into the live system's extent lists,
	 * so continuing after error would leave the live system's extent lists
	 * corrupted for any subsequent checkpoint (and potentially, should a
	 * subsequent checkpoint succeed, for recovery).
	 */
	fatal = true;

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if checkpoints take too
	 * much time away from real work: we read the historic checkpoint
	 * information without a lock, but we could also merge and re-write the
	 * deleted and merged checkpoint information without a lock, except for
	 * the final merge of ranges into the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = true;

	/*
	 * We've allocated our last page, update the checkpoint size.  We need
	 * to calculate the live system's checkpoint size before merging
	 * checkpoint allocation and discard information from the checkpoints
	 * we're deleting, those operations change the underlying byte counts.
	 */
	ckpt_size = ci->ckpt_size;
	ckpt_size += ci->alloc.bytes;
	ckpt_size -= ci->discard.bytes;

	/* Skip the additional processing if we aren't deleting checkpoints. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed checkpoints: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;

#ifdef HAVE_VERBOSE
		if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__ckpt_string(
			    session, block, ckpt->raw.data, tmp));
			__wt_verbose(session, WT_VERB_CHECKPOINT,
			    "%s: delete-checkpoint: %s: %s",
			    block->name, ckpt->name, (const char *)tmp->data);
		}
#endif
		/*
		 * Find the checkpoint into which we'll roll this checkpoint's
		 * blocks: it's the next real checkpoint in the list, and it
		 * better have been read in (if it's not the add slot).
		 */
		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * Set the from/to checkpoint structures, where the "to" value
		 * may be the live tree.
		 */
		a = ckpt->bpriv;
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			b = &block->live;
		else
			b = next_ckpt->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the checkpoint's discard list, however, not the live system's
		 * list because it appears on the checkpoint's alloc list and so
		 * must be paired in the checkpoint.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session, block,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" checkpoint's extent
		 * lists, including the avail list.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * checkpoint's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, block, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, block, &a->discard, &b->discard));

		/*
		 * If the "to" checkpoint is also being deleted, we're done with
		 * it, it's merged into some other checkpoint in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * checkpoints, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" checkpoint's
		 * allocate and discard lists overlap, move the range to
		 * the live system's checkpoint available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			continue;

		/*
		 * We have to write the "to" checkpoint's extent lists out in
		 * new blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" checkpoint's extent
		 * lists; don't include the avail list, it's not changing.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));

		F_SET(next_ckpt, WT_CKPT_UPDATE);
	}

	/* Update checkpoints marked for update. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_UPDATE))
			WT_ERR(__ckpt_update(
			    session, block, ckpt, ckpt->bpriv, false));

live_update:
	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));

	/* Update the final, added checkpoint based on the live system. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_ADD)) {
			/*
			 * !!!
			 * Our caller wants the final checkpoint size.  Setting
			 * the size here violates layering, but the alternative
			 * is a call for the btree layer to crack the checkpoint
			 * cookie into its components, and that's a fair amount
			 * of work.
			 */
			ckpt->ckpt_size = ckpt_size;

			/*
			 * Set the rolling checkpoint size for the live system.
			 * The current size includes the current checkpoint's
			 * root page size (root pages are on the checkpoint's
			 * block allocation list as root pages are allocated
			 * with the usual block allocation functions). That's
			 * correct, but we don't want to include it in the size
			 * for the next checkpoint.
			 */
			ckpt_size -= ci->root_size;

			/*
			 * Additionally, we had a bug for a while where the live
			 * checkpoint size grew without bound. We can't sanity
			 * check the value, that would require walking the tree
			 * as part of the checkpoint. Bound any bug at the size
			 * of the file.
			 * It isn't practical to assert that the value is within
			 * bounds since databases created with older versions
			 * of WiredTiger (2.8.0) would likely see an error.
			 */
			ci->ckpt_size =
			    WT_MIN(ckpt_size, (uint64_t)block->size);

			WT_ERR(__ckpt_update(session, block, ckpt, ci, true));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.  This includes freeing a lot of extents, so do it
	 * outside of the system's lock by copying and resetting the original,
	 * then doing the work later.
	 */
	ci->ckpt_alloc = ci->alloc;
	WT_ERR(__wt_block_extlist_init(
	    session, &ci->alloc, "live", "alloc", false));
	ci->ckpt_discard = ci->discard;
	WT_ERR(__wt_block_extlist_init(
	    session, &ci->discard, "live", "discard", false));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first checkpoint in the system should always have an empty
	 * discard list.  If we've read that checkpoint and/or created it,
	 * check.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (!F_ISSET(ckpt, WT_CKPT_DELETE))
			break;
	if ((a = ckpt->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0)
		WT_ERR_MSG(session, WT_ERROR,
		    "first checkpoint incorrectly has blocks on the discard "
		    "list");
#endif

err:	if (ret != 0 && fatal)
		ret = __wt_block_panic(session, ret,
		    "%s: fatal checkpoint failure", block->name);

	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/* Discard any checkpoint information we loaded. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if ((ci = ckpt->bpriv) != NULL)
			__wt_block_ckpt_destroy(session, ci);

	__wt_scr_free(session, &tmp);
	return (ret);
}
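
This version of __ckpt_process defers the expensive frees: under the lock it copies the live alloc and discard lists into ckpt_alloc/ckpt_discard and reinitializes the originals, leaving the actual free for the later resolve step. A tiny sketch of that swap-under-lock, free-later pattern follows, using a hypothetical list type and a pthread mutex.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct list {			/* hypothetical stand-in for an extent list */
	int *items;
	int count;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct list live = { NULL, 0 };

/* Step 1: under the lock, move the live list aside and reset it. */
static void
detach(struct list *saved)
{
	pthread_mutex_lock(&lock);
	*saved = live;
	live.items = NULL;
	live.count = 0;
	pthread_mutex_unlock(&lock);
}

/* Step 2: later, free the detached copy without holding the lock. */
static void
resolve(struct list *saved)
{
	free(saved->items);
	saved->items = NULL;
	saved->count = 0;
}

int
main(void)
{
	struct list saved;

	live.items = malloc(3 * sizeof(int));
	live.count = 3;

	detach(&saved);
	printf("live now has %d items, saved has %d\n",
	    live.count, saved.count);
	resolve(&saved);
	return (0);
}
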
Example no. 28
/*
 * __wt_txn_update_oldest --
 *	Sweep the running transactions to update the oldest ID required.
 */
int
__wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_SESSION_IMPL *oldest_session;
	WT_TXN_GLOBAL *txn_global;
	uint64_t current_id, last_running, oldest_id;
	uint64_t prev_last_running, prev_oldest_id;
	bool strict, wait;

	conn = S2C(session);
	txn_global = &conn->txn_global;
	strict = LF_ISSET(WT_TXN_OLDEST_STRICT);
	wait = LF_ISSET(WT_TXN_OLDEST_WAIT);

	current_id = last_running = txn_global->current;
	prev_last_running = txn_global->last_running;
	prev_oldest_id = txn_global->oldest_id;

	/*
	 * For pure read-only workloads, or if the update isn't forced and the
	 * oldest ID isn't too far behind, avoid scanning.
	 */
	if (prev_oldest_id == current_id ||
	    (!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
		return (0);

	/* First do a read-only scan. */
	if (wait)
		__wt_readlock(session, txn_global->scan_rwlock);
	else if ((ret =
	    __wt_try_readlock(session, txn_global->scan_rwlock)) != 0)
		return (ret == EBUSY ? 0 : ret);
	__txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session);
	__wt_readunlock(session, txn_global->scan_rwlock);

	/*
	 * If the state hasn't changed (or hasn't moved far enough for
	 * non-forced updates), give up.
	 */
	if ((oldest_id == prev_oldest_id ||
	    (!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) &&
	    ((last_running == prev_last_running) ||
	    (!strict && WT_TXNID_LT(last_running, prev_last_running + 100))))
		return (0);

	/* It looks like an update is necessary, wait for exclusive access. */
	if (wait)
		__wt_writelock(session, txn_global->scan_rwlock);
	else if ((ret =
	    __wt_try_writelock(session, txn_global->scan_rwlock)) != 0)
		return (ret == EBUSY ? 0 : ret);

	/*
	 * If the oldest ID has been updated while we waited, don't bother
	 * scanning.
	 */
	if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) &&
	    WT_TXNID_LE(last_running, txn_global->last_running))
		goto done;

	/*
	 * Re-scan now that we have exclusive access.  This is necessary because
	 * threads get transaction snapshots with read locks, and we have to be
	 * sure that there isn't a thread that has got a snapshot locally but
	 * not yet published its snap_min.
	 */
	__txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session);

#ifdef HAVE_DIAGNOSTIC
	{
	/*
	 * Make sure the ID doesn't move past any named snapshots.
	 *
	 * Don't include the read/assignment in the assert statement.  Coverity
	 * complains if there are assignments only done in diagnostic builds,
	 * and when the read is from a volatile.
	 */
	uint64_t id = txn_global->nsnap_oldest_id;
	WT_ASSERT(session,
	    id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
	}
#endif
	/* Update the oldest ID. */
	if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
		txn_global->oldest_id = oldest_id;
	if (WT_TXNID_LT(txn_global->last_running, last_running)) {
		txn_global->last_running = last_running;

#ifdef HAVE_VERBOSE
		/*
		 * Output a verbose message about long-running transactions,
		 * but only when some progress is being made.
		 */
		if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
		    current_id - oldest_id > 10000 && oldest_session != NULL) {
			__wt_verbose(session, WT_VERB_TRANSACTION,
			    "old snapshot %" PRIu64
			    " pinned in session %" PRIu32 " [%s]"
			    " with snap_min %" PRIu64 "\n",
			    oldest_id, oldest_session->id,
			    oldest_session->lastop,
			    oldest_session->txn.snap_min);
		}
#endif
	}

done:	__wt_writeunlock(session, txn_global->scan_rwlock);
	return (ret);
}
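
The locking shape above is a double-checked scan: a cheap read-only pass under the shared lock and, only if an update still looks necessary, a second pass under the exclusive lock before publishing. A condensed sketch with hypothetical global state and a standard pthread rwlock:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t scan_lock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long oldest_id = 10;		/* hypothetical published value */
static unsigned long ids[] = { 42, 17, 99 };	/* hypothetical per-session IDs */

static unsigned long
scan(void)
{
	unsigned long min = (unsigned long)-1;
	size_t i;

	for (i = 0; i < sizeof(ids) / sizeof(ids[0]); ++i)
		if (ids[i] < min)
			min = ids[i];
	return (min);
}

static void
update_oldest(void)
{
	unsigned long candidate;

	/* Cheap read-only pass under the shared lock. */
	pthread_rwlock_rdlock(&scan_lock);
	candidate = scan();
	pthread_rwlock_unlock(&scan_lock);

	if (candidate <= oldest_id)
		return;

	/* Re-scan with exclusive access before publishing. */
	pthread_rwlock_wrlock(&scan_lock);
	candidate = scan();
	if (candidate > oldest_id)
		oldest_id = candidate;
	pthread_rwlock_unlock(&scan_lock);
}

int
main(void)
{
	update_oldest();
	printf("oldest id is now %lu\n", oldest_id);
	return (0);
}
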
Example no. 29
/*
 * __ckpt_update --
 *	Update a checkpoint.
 */
static int
__ckpt_update(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci, bool is_live)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint8_t *endp;

#ifdef HAVE_DIAGNOSTIC
	/* Check the extent list combinations for overlaps. */
	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
	WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
#endif
	/*
	 * Write the checkpoint's alloc and discard extent lists.  After each
	 * write, remove any allocated blocks from the system's allocation
	 * list, checkpoint extent blocks don't appear on any extent lists.
	 */
	WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
	WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));

	/*
	 * We only write an avail list for the live system, other checkpoint's
	 * avail lists are static and never change.
	 *
	 * Write the avail list last so it reflects changes due to allocating
	 * blocks for the alloc and discard lists.  Second, when we write the
	 * live system's avail list, it's two lists: the current avail list
	 * plus the list of blocks to be made available when the new checkpoint
	 * completes.  We can't merge that second list into the real list yet,
	 * it's not truly available until the new checkpoint locations have been
	 * saved to the metadata.
	 */
	if (is_live)
		WT_RET(__wt_block_extlist_write(
		    session, block, &ci->avail, &ci->ckpt_avail));

	/*
	 * Set the file size for the live system.
	 *
	 * !!!
	 * We do NOT set the file size when re-writing checkpoints because we
	 * want to test the checkpoint's blocks against a reasonable maximum
	 * file size during verification.  This is bad: imagine a checkpoint
	 * appearing early in the file, re-written, and then the checkpoint
	 * requires blocks at the end of the file, blocks after the listed file
	 * size.  If the application opens that checkpoint for writing
	 * (discarding subsequent checkpoints), we would truncate the file to
	 * the early chunk, discarding the re-written checkpoint information.
	 * The alternative, updating the file size has its own problems, in
	 * that case we'd work correctly, but we'd lose all of the blocks
	 * between the original checkpoint and the re-written checkpoint.
	 * Currently, there's no API to roll-forward intermediate checkpoints,
	 * if there ever is, this will need to be fixed.
	 */
	if (is_live)
		ci->file_size = block->size;

	/*
	 * Copy the checkpoint information into the checkpoint array's address
	 * cookie.
	 */
	WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE));
	endp = ckpt->raw.mem;
	WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci));
	ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp));
		__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "%s: create-checkpoint: %s: %s",
		    block->name, ckpt->name, (const char *)tmp->data);
	}

err:	__wt_scr_free(session, &tmp);
	return (ret);
}
Example no. 30
/*
 * __wt_block_checkpoint_load --
 *	Load a checkpoint.
 */
int
__wt_block_checkpoint_load(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size,
    int readonly)
{
	WT_BLOCK_CKPT *ci;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;

	WT_UNUSED(addr_size);

	/*
	 * Sometimes we don't find a root page (we weren't given a checkpoint,
	 * or the referenced checkpoint was empty).  In that case we return a
	 * root page size of 0.  Set that up now.
	 */
	dsk->size = 0;

	ci = &block->live;
	WT_RET(__wt_block_ckpt_init(session, block, ci, "live", 1));

	if (WT_VERBOSE_ISSET(session, ckpt)) {
		if (addr != NULL) {
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__ckpt_string(session, block, addr, tmp));
		}
		WT_VERBOSE_ERR(session, ckpt,
		    "%s: load-checkpoint: %s", block->name,
		    addr == NULL ? "[Empty]" : (char *)tmp->data);
	}

	/* If not loading a checkpoint from disk, we're done. */
	if (addr == NULL || addr_size == 0)
		return (0);

	/* Crack the checkpoint cookie. */
	if (addr != NULL)
		WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));

	/* Verify sets up next. */
	if (block->verify)
		WT_ERR(__wt_verify_ckpt_load(session, block, ci));

	/* Read, and optionally verify, any root page. */
	if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
		WT_ERR(__wt_block_read_off(session, block,
		    dsk, ci->root_offset, ci->root_size, ci->root_cksum));
		if (block->verify) {
			if (tmp == NULL) {
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
				WT_ERR(__ckpt_string(
				    session, block, addr, tmp));
			}
			WT_ERR(
			    __wt_verify_dsk(session, (char *)tmp->data, dsk));
		}
	}

	/*
	 * Rolling a checkpoint forward requires the avail list, the blocks from
	 * which we can allocate.
	 */
	if (!readonly)
		WT_ERR(
		    __wt_block_extlist_read_avail(session, block, &ci->avail));

	/*
	 * If the checkpoint can be written, that means anything written after
	 * the checkpoint is no longer interesting, truncate the file.  Don't
	 * bother checking the avail list for a block at the end of the file,
	 * that was done when the checkpoint was first written (re-writing the
	 * checkpoint might possibly make it relevant here, but it's unlikely
	 * enough that I'm not bothering).
	 */
	if (!readonly) {
		WT_VERBOSE_ERR(session, ckpt,
		    "truncate file to %" PRIuMAX, (uintmax_t)ci->file_size);
		WT_ERR(__wt_ftruncate(session, block->fh, ci->file_size));
	}

	if (0) {
err:		(void)__wt_block_checkpoint_unload(session, block);
	}

	__wt_scr_free(&tmp);
	return (ret);
}
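
The root-page read above fetches a block at a known offset/size and verifies it before use. A standalone sketch of that shape follows, using a flat in-memory buffer and a trivial additive checksum; WiredTiger's block format and checksum are different.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Trivial additive checksum; WiredTiger uses a real CRC. */
static uint32_t
cksum(const uint8_t *p, size_t len)
{
	uint32_t sum = 0;

	while (len-- > 0)
		sum += *p++;
	return (sum);
}

/* Copy size bytes at offset out of a flat "file" and verify the checksum. */
static int
read_off(const uint8_t *file, size_t file_len,
    size_t offset, size_t size, uint32_t expect, uint8_t *dst)
{
	if (offset + size > file_len)
		return (-1);
	memcpy(dst, file + offset, size);
	return (cksum(dst, size) == expect ? 0 : -1);
}

int
main(void)
{
	uint8_t file[256], page[16];
	size_t root_offset = 64, root_size = 16;
	uint32_t root_cksum;

	memset(file, 0, sizeof(file));
	memcpy(file + root_offset, "root page bytes", 16);
	root_cksum = cksum(file + root_offset, root_size);

	if (read_off(file, sizeof(file),
	    root_offset, root_size, root_cksum, page) == 0)
		printf("root page verified: %s\n", (char *)page);
	return (0);
}
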