Esempio n. 1
0
/*
 * __ckpt_extlist_read --
 *	Allocate the block manager's private information for a checkpoint and
 * read the checkpoint's alloc and discard extent lists into it.
 *
 * The newly allocated structure is stored in ckpt->bpriv.  Returns zero on
 * success; otherwise the error from the first failing call is returned and
 * no cleanup of ckpt->bpriv is done here.
 */
static int
__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
{
	WT_BLOCK_CKPT *info;

	/* The checkpoint structure hangs off the caller's WT_CKPT. */
	WT_RET(__wt_calloc(session, 1, sizeof(*info), &ckpt->bpriv));
	info = ckpt->bpriv;

	/* Initialize the structure, then crack the cookie into it. */
	WT_RET(__wt_block_ckpt_init(session, info, ckpt->name));
	WT_RET(
	    __wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, info));

	/*
	 * Only the alloc and discard lists are read.  The avail list is
	 * deliberately skipped: a checkpoint's avail list is only meaningful
	 * when rolling forward from that particular checkpoint, where it is
	 * our best understanding of which blocks can be allocated.  For any
	 * checkpoint other than the live one, later checkpoints may already
	 * have allocated those blocks, making the list useless.  It isn't
	 * discarded because verification can use it, but it isn't re-written
	 * either.
	 */
	WT_RET(__wt_block_extlist_read(
	    session, block, &info->alloc, info->file_size));
	WT_RET(__wt_block_extlist_read(
	    session, block, &info->discard, info->file_size));

	return (0);
}
Esempio n. 2
0
/*
 * __wt_block_snapshot_load --
 *	Load a snapshot.
 *
 * session	the session handle
 * block	the block manager handle; its "live" snapshot is initialized
 *		and, if a cookie is supplied, filled in from that cookie
 * dsk		returned root page; dsk->size is 0 if there is no root page
 * addr		the snapshot cookie, or NULL if there is no snapshot to load
 * addr_size	the cookie's length; 0 is treated like a missing cookie
 * readonly	if zero, the avail list is read and the file is truncated
 *		to the snapshot's file size so the snapshot can be rolled
 *		forward
 *
 * Returns zero on success.  On any failure after the live snapshot has been
 * initialized, the snapshot is unloaded before the error is returned.
 */
int
__wt_block_snapshot_load(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size,
    int readonly)
{
	WT_BLOCK_SNAPSHOT *si;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;

	/*
	 * Sometimes we don't find a root page (we weren't given a snapshot,
	 * or the referenced snapshot was empty).  In that case we return a
	 * root page size of 0.  Set that up now.
	 */
	dsk->size = 0;

	si = &block->live;
	WT_RET(__wt_block_snap_init(session, block, si, "live", 1));

	if (WT_VERBOSE_ISSET(session, snapshot)) {
		if (addr != NULL) {
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__snapshot_string(session, block, addr, tmp));
		}
		WT_VERBOSE_ERR(session, snapshot,
		    "%s: load-snapshot: %s", block->name,
		    addr == NULL ? "[Empty]" : (char *)tmp->data);
	}

	/*
	 * If not loading a snapshot from disk, we're done.  The verbose code
	 * above may have allocated a scratch buffer (a non-NULL cookie with a
	 * zero length), release it before returning so it isn't leaked.
	 */
	if (addr == NULL || addr_size == 0) {
		__wt_scr_free(&tmp);
		return (0);
	}

	/* Crack the snapshot cookie. */
	WT_ERR(__wt_block_buffer_to_snapshot(session, block, addr, si));

	/* Verify sets up next. */
	if (block->verify)
		WT_ERR(__wt_verify_snap_load(session, block, si));

	/* Read, and optionally verify, any root page. */
	if (si->root_offset != WT_BLOCK_INVALID_OFFSET) {
		WT_ERR(__wt_block_read_off(session, block,
		    dsk, si->root_offset, si->root_size, si->root_cksum));
		if (block->verify) {
			/*
			 * The snapshot's string form identifies the page in
			 * verify messages; build it if the verbose code above
			 * didn't already.
			 */
			if (tmp == NULL) {
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
				WT_ERR(__snapshot_string(
				    session, block, addr, tmp));
			}
			WT_ERR(
			    __wt_verify_dsk(session, (char *)tmp->data, dsk));
		}
	}

	if (!readonly) {
		/*
		 * Rolling a snapshot forward requires the avail list, the
		 * blocks from which we can allocate.
		 */
		WT_ERR(__wt_block_extlist_read(session, block, &si->avail));

		/*
		 * If the snapshot can be written, that means anything written
		 * after the snapshot is no longer interesting.  Truncate the
		 * file.
		 */
		WT_VERBOSE_ERR(session, snapshot,
		    "truncate file to %" PRIuMAX, (uintmax_t)si->file_size);
		WT_ERR(__wt_ftruncate(session, block->fh, si->file_size));
	}

	/* The error label is only reachable by a jump, success skips it. */
	if (0) {
err:		(void)__wt_block_snapshot_unload(session, block);
	}

	__wt_scr_free(&tmp);
	return (ret);
}
Esempio n. 3
0
/*
 * __snapshot_process --
 *	Process the list of snapshots.
 *
 * Walks the snapshot array starting at snapbase (iterated with
 * WT_SNAPSHOT_FOREACH) and, in order:
 *	- loads the alloc and discard extent lists of every snapshot flagged
 *	  WT_SNAP_DELETE and of the snapshot following a deleted one;
 *	- under block->live_lock, merges each deleted snapshot's lists into
 *	  the following snapshot (or into the live system), and flags the
 *	  following snapshot WT_SNAP_UPDATE when it must be re-written;
 *	- re-writes snapshots flagged WT_SNAP_UPDATE;
 *	- writes the snapshot flagged WT_SNAP_ADD from the live system and
 *	  resets the live system's alloc and discard lists.
 *
 * Returns zero on success, otherwise the error from the failing step.  The
 * cleanup at the "err" label runs on both the success and failure paths.
 */
static int
__snapshot_process(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase)
{
	WT_BLOCK_SNAPSHOT *a, *b, *si;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_SNAPSHOT *snap;
	uint64_t snapshot_size;
	int deleting, locked;

	si = &block->live;
	/* Tracks whether live_lock is held, so cleanup only unlocks if so. */
	locked = 0;

	/*
	 * We've allocated our last page, update the snapshot size.  We need to
	 * calculate the live system's snapshot size before reading and merging
	 * snapshot allocation and discard information from the snapshots we're
	 * deleting, those operations will change the underlying byte counts.
	 */
	snapshot_size = si->snapshot_size;
	snapshot_size += si->alloc.bytes;
	snapshot_size -= si->discard.bytes;

	/*
	 * Extents that become newly available as a result of deleting previous
	 * snapshots are added to a list of extents.  The list should be empty,
	 * but there's no explicit "free the snapshot information" call into the
	 * block manager; if there was an error in an upper level resulting in
	 * the snapshot never being "resolved", the list might not be empty.
	 *
	 * XXX
	 * This isn't sufficient, actually: we're going to leak all the blocks
	 * that were written as part of the last snapshot because it was never
	 * resolved.
	 */
	__wt_block_extlist_free(session, &si->snapshot_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &si->snapshot_avail, "live", "snapshot_avail"));

	/*
	 * To delete a snapshot, we'll need snapshot information for it, and we
	 * have to read that from the disk.
	 */
	deleting = 0;
	WT_SNAPSHOT_FOREACH(snapbase, snap) {
		/*
		 * To delete a snapshot, we'll need snapshot information for it
		 * and the subsequent snapshot.  The test is tricky, we have to
		 * load the current snapshot's information if it's marked for
		 * deletion, or if it follows a snapshot marked for deletion,
		 * where the boundary cases are the first snapshot in the list
		 * and the last snapshot in the list: if we're deleting the last
		 * snapshot in the list, there's no next snapshot, the snapshot
		 * will be merged into the live tree.
		 *
		 * Put another way, skip this snapshot if it isn't being deleted
		 * and it's either the first entry, the entry being added, or
		 * its predecessor isn't being deleted.  The "snap == snapbase"
		 * test comes first so "snap - 1" is never evaluated for the
		 * first entry.
		 */
		if (!F_ISSET(snap, WT_SNAP_DELETE) &&
		    (snap == snapbase ||
		    F_ISSET(snap, WT_SNAP_ADD) ||
		    !F_ISSET(snap - 1, WT_SNAP_DELETE)))
			continue;
		deleting = 1;

		/*
		 * Allocate a snapshot structure, crack the cookie and read the
		 * snapshot's extent lists.
		 *
		 * Ignore the avail list: snapshot avail lists are only useful
		 * if we are rolling forward from the particular snapshot and
		 * they represent our best understanding of what blocks can be
		 * allocated.  If we are not operating on the live snapshot,
		 * subsequent snapshots might have allocated those blocks, and
		 * the avail list is useless.  We don't discard it, because it
		 * is useful as part of verification, but we don't re-write it
		 * either.
		 *
		 * Note "si" is re-pointed at the loaded snapshot here; it's
		 * reset to the live system at the live_update label.
		 */
		WT_ERR(__wt_calloc(
		    session, 1, sizeof(WT_BLOCK_SNAPSHOT), &snap->bpriv));
		si = snap->bpriv;
		WT_ERR(__wt_block_snap_init(session, block, si, snap->name, 0));
		WT_ERR(__wt_block_buffer_to_snapshot(
		    session, block, snap->raw.data, si));
		WT_ERR(__wt_block_extlist_read(session, block, &si->alloc));
		WT_ERR(__wt_block_extlist_read(session, block, &si->discard));
	}

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if snapshots take too
	 * much time away from real work: we read historic snapshot information
	 * without a lock, but we could also merge and re-write the delete
	 * snapshot information without a lock, except for ranges merged into
	 * the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = 1;

	/* Skip the additional processing if we aren't deleting snapshots. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed snapshots: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap) {
		if (!F_ISSET(snap, WT_SNAP_DELETE))
			continue;

		if (WT_VERBOSE_ISSET(session, snapshot)) {
			/* The scratch buffer is allocated once and re-used. */
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__snapshot_string(
			    session, block, snap->raw.data, tmp));
			WT_VERBOSE_ERR(session, snapshot,
			    "%s: delete-snapshot: %s: %s",
			    block->name, snap->name, (char *)tmp->data);
		}

		/*
		 * Set the from/to snapshot structures, where the "to" value
		 * may be the live tree.
		 *
		 * NOTE(review): this reads "snap + 1" for every deleted
		 * snapshot, and dereferences its bpriv if it isn't the added
		 * snapshot; it assumes a deleted snapshot is always followed
		 * by an entry that was loaded above or is flagged WT_SNAP_ADD.
		 * Confirm against the caller.
		 */
		a = snap->bpriv;
		if (F_ISSET(snap + 1, WT_SNAP_ADD))
			b = &block->live;
		else
			b = (snap + 1)->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the snapshot's discard list, however, not the live system's
		 * list because it appears on the snapshot's alloc list and so
		 * must be paired in the snapshot.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" snapshot's extent
		 * lists directly to the live system's avail list, they were
		 * never on any alloc list.   Include the "from" snapshot's
		 * avail list, it's going away.
		 */
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * snapshot's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->discard, &b->discard));

		/*
		 * If the "to" snapshot is also being deleted, we're done with
		 * it, it's merged into some other snapshot in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * snapshots, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(snap + 1, WT_SNAP_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" snapshot's allocate
		 * and discard lists overlap is fair game, move ranges appearing
		 * on both lists to the live snapshot's newly available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(snap + 1, WT_SNAP_ADD))
			continue;

		/*
		 * We have to write the "to" snapshot's extent lists out in new
		 * blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" snapshot's extent lists
		 * directly to the live system's avail list, they were never on
		 * any alloc list.  Do not include the "to" snapshot's avail
		 * list, it's not changing.
		 */
		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->discard));

		/* The re-write itself happens in the update loop below. */
		F_SET(snap + 1, WT_SNAP_UPDATE);
	}

	/*
	 * Update snapshots marked for update.  The added snapshot is never
	 * flagged for update here, it's written from the live system below.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (F_ISSET(snap, WT_SNAP_UPDATE)) {
			WT_ASSERT(session, !F_ISSET(snap, WT_SNAP_ADD));
			WT_ERR(__snapshot_update(
			    session, block, snap, snap->bpriv, 0, 0));
		}

live_update:
	/* From here on we're working with the live system's information. */
	si = &block->live;

	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &si->avail));

	/*
	 * Update the final, added snapshot based on the live system, passing
	 * the snapshot size calculated on entry.
	 *
	 * NOTE(review): the trailing argument is 1 here and 0 for historic
	 * snapshots above, presumably flagging the live snapshot; confirm
	 * against __snapshot_update.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (F_ISSET(snap, WT_SNAP_ADD)) {
			WT_ERR(__snapshot_update(
			    session, block, snap, si, snapshot_size, 1));

			/*
			 * XXX
			 * Our caller wants two pieces of information: the time
			 * the snapshot was taken and the final snapshot size.
			 * This violates layering but the alternative is a call
			 * for the btree layer to crack the snapshot cookie into
			 * its components, and that's a fair amount of work.
			 * (We could just read the system time in the session
			 * layer when updating the metadata file, but that won't
			 * work for the snapshot size, and so we do both here.)
			 */
			snap->snapshot_size = si->snapshot_size;
			WT_ERR(__wt_epoch(session, &snap->sec, NULL));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.
	 */
	__wt_block_extlist_free(session, &si->alloc);
	WT_ERR(__wt_block_extlist_init(session, &si->alloc, "live", "alloc"));
	__wt_block_extlist_free(session, &si->discard);
	WT_ERR(
	    __wt_block_extlist_init(session, &si->discard, "live", "discard"));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first snapshot in the system should always have an empty discard
	 * list.  If we've read that snapshot and/or created it, check.
	 *
	 * The loop stops at the first snapshot not being deleted; if no
	 * information was loaded for it (bpriv is NULL), the live system's
	 * discard list is checked instead.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (!F_ISSET(snap, WT_SNAP_DELETE))
			break;
	if ((a = snap->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0) {
		__wt_errx(session,
		    "snapshot incorrectly has blocks on the discard list");
		WT_ERR(WT_ERROR);
	}
#endif

	/*
	 * Common exit: reached by falling through on success as well as by a
	 * jump on error.  Errors before the lock was acquired arrive here
	 * with locked still 0, so only unlock if we hold the lock.
	 */
err:	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/*
	 * Discard any snapshot information we loaded, we no longer need it.
	 *
	 * NOTE(review): only the extent lists are freed; the bpriv structure
	 * itself isn't released here, presumably the caller frees it along
	 * with the snapshot list.  Confirm.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if ((si = snap->bpriv) != NULL) {
			__wt_block_extlist_free(session, &si->alloc);
			__wt_block_extlist_free(session, &si->avail);
			__wt_block_extlist_free(session, &si->discard);
		}

	__wt_scr_free(&tmp);
	return (ret);
}