/*
 * __ckpt_extlist_read --
 *	Attach a freshly allocated block-manager checkpoint structure to the
 * checkpoint and fill it in from the checkpoint's cookie and on-disk alloc
 * and discard extent lists.
 *
 *	Returns 0 on success.  On failure the error from the failing call is
 * returned via WT_RET; whatever was allocated so far stays referenced by
 * ckpt->bpriv (presumably released by the owner of the checkpoint list --
 * TODO confirm against the caller).
 */
static int
__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
{
	WT_BLOCK_CKPT *info;

	/*
	 * The structure is allocated directly into the checkpoint's private
	 * slot, then a typed local alias is used for the rest of the work.
	 */
	WT_RET(__wt_calloc(session, 1, sizeof(*info), &ckpt->bpriv));
	info = ckpt->bpriv;

	/* Initialize the structure, then crack the checkpoint cookie. */
	WT_RET(__wt_block_ckpt_init(session, info, ckpt->name));
	WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, info));

	/*
	 * Only the alloc and discard lists are read.  The avail list is
	 * deliberately skipped: it is only meaningful when rolling forward
	 * from this particular checkpoint, because it records which blocks
	 * could be allocated at that time.  For any checkpoint other than the
	 * live one, later checkpoints may already have allocated those blocks.
	 * The list is not discarded (verification uses it), but it is not
	 * re-written either.
	 */
	WT_RET(__wt_block_extlist_read(
	    session, block, &info->alloc, info->file_size));
	WT_RET(__wt_block_extlist_read(
	    session, block, &info->discard, info->file_size));

	return (0);
}
/*
 * __wt_block_snapshot_load --
 *	Load a snapshot.
 *
 *	session:   the session handle passed through to every helper.
 *	block:     the block handle; its "live" snapshot structure is
 *		   (re)initialized and filled in from the cookie.
 *	dsk:       receives the root page, if any; dsk->size is set to 0 up
 *		   front so callers see an empty root when none is read.
 *	addr:      the snapshot cookie, or NULL if there is no snapshot.
 *	addr_size: the cookie's size; a size of 0 is treated like a NULL addr.
 *	readonly:  if zero, the avail list is read and the file is truncated
 *		   to the snapshot's file size.
 *
 *	Returns 0 on success.  On any failure after the live snapshot has
 * been initialized, the snapshot is unloaded and the error is returned.
 */
int
__wt_block_snapshot_load(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size,
    int readonly)
{
	WT_BLOCK_SNAPSHOT *si;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;

	/*
	 * Sometimes we don't find a root page (we weren't given a snapshot,
	 * or the referenced snapshot was empty).  In that case we return a
	 * root page size of 0.  Set that up now.
	 */
	dsk->size = 0;

	si = &block->live;
	WT_RET(__wt_block_snap_init(session, block, si, "live", 1));

	if (WT_VERBOSE_ISSET(session, snapshot)) {
		if (addr != NULL) {
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__snapshot_string(session, block, addr, tmp));
		}
		WT_VERBOSE_ERR(session, snapshot,
		    "%s: load-snapshot: %s", block->name,
		    addr == NULL ? "[Empty]" : (char *)tmp->data);
	}

	/*
	 * If not loading a snapshot from disk, we're done.
	 *
	 * Leave through the common exit rather than returning directly: the
	 * verbose path above may have allocated a scratch buffer (a non-NULL
	 * address with a zero size), and it has to be released.
	 */
	if (addr == NULL || addr_size == 0)
		goto done;

	/* Crack the snapshot cookie. */
	WT_ERR(__wt_block_buffer_to_snapshot(session, block, addr, si));

	/* Verify sets up next. */
	if (block->verify)
		WT_ERR(__wt_verify_snap_load(session, block, si));

	/* Read, and optionally verify, any root page. */
	if (si->root_offset != WT_BLOCK_INVALID_OFFSET) {
		WT_ERR(__wt_block_read_off(session, block,
		    dsk, si->root_offset, si->root_size, si->root_cksum));
		if (block->verify) {
			/*
			 * The printable cookie is passed to the page
			 * verification; build it if the verbose path didn't.
			 */
			if (tmp == NULL) {
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
				WT_ERR(__snapshot_string(
				    session, block, addr, tmp));
			}
			WT_ERR(
			    __wt_verify_dsk(session, (char *)tmp->data, dsk));
		}
	}

	if (!readonly) {
		/*
		 * Rolling a snapshot forward requires the avail list, the
		 * blocks from which we can allocate.
		 */
		WT_ERR(__wt_block_extlist_read(session, block, &si->avail));

		/*
		 * If the snapshot can be written, that means anything written
		 * after the snapshot is no longer interesting.  Truncate the
		 * file.
		 */
		WT_VERBOSE_ERR(session, snapshot,
		    "truncate file to %" PRIuMAX, (uintmax_t)si->file_size);
		WT_ERR(__wt_ftruncate(session, block->fh, si->file_size));
	}

	if (0) {
		/* Error path only: undo the partially completed load. */
err:		(void)__wt_block_snapshot_unload(session, block);
	}

	/* Common exit: release the scratch buffer (a NULL tmp is passed too). */
done:	__wt_scr_free(&tmp);
	return (ret);
}
/*
 * __snapshot_process --
 *	Process the list of snapshots.
 *
 *	The work is done in phases, in this order:
 *	1. compute the live system's snapshot size;
 *	2. without the lock, read the extent lists of every snapshot that is
 *	   being deleted or that follows a deleted snapshot;
 *	3. with the live lock held, merge each deleted snapshot into its
 *	   successor (which may be the live system), then re-write successors
 *	   marked for update;
 *	4. truncate the file if possible and write the added snapshot from
 *	   the live system's information;
 *	5. reset the live alloc/discard lists and release everything loaded.
 *
 *	snapbase is the first entry of the snapshot array walked by
 * WT_SNAPSHOT_FOREACH.  The function returns ret; presumably that is 0 on
 * success and the error set by the WT_ERR/WT_RET macros otherwise (the
 * macros are not defined in this file -- TODO confirm).
 */
static int
__snapshot_process(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase)
{
	WT_BLOCK_SNAPSHOT *a, *b, *si;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_SNAPSHOT *snap;
	uint64_t snapshot_size;
	int deleting, locked;

	si = &block->live;
	locked = 0;

	/*
	 * We've allocated our last page, update the snapshot size.  We need to
	 * calculate the live system's snapshot size before reading and merging
	 * snapshot allocation and discard information from the snapshots we're
	 * deleting, those operations will change the underlying byte counts.
	 */
	snapshot_size = si->snapshot_size;
	snapshot_size += si->alloc.bytes;
	snapshot_size -= si->discard.bytes;

	/*
	 * Extents that become newly available as a result of deleting previous
	 * snapshots are added to a list of extents.  The list should be empty,
	 * but there's no explicit "free the snapshot information" call into the
	 * block manager; if there was an error in an upper level resulting in
	 * the snapshot never being "resolved", the list might not be empty.
	 *
	 * XXX
	 * This isn't sufficient, actually: we're going to leak all the blocks
	 * that were written as part of the last snapshot because it was never
	 * resolved.
	 */
	__wt_block_extlist_free(session, &si->snapshot_avail);
	/*
	 * A plain return is used here rather than the error label; at this
	 * point the lock isn't held and nothing has been loaded.
	 */
	WT_RET(__wt_block_extlist_init(
	    session, &si->snapshot_avail, "live", "snapshot_avail"));

	/*
	 * To delete a snapshot, we'll need snapshot information for it, and we
	 * have to read that from the disk.
	 */
	deleting = 0;
	WT_SNAPSHOT_FOREACH(snapbase, snap) {
		/*
		 * To delete a snapshot, we'll need snapshot information for it
		 * and the subsequent snapshot.  The test is tricky, we have to
		 * load the current snapshot's information if it's marked for
		 * deletion, or if it follows a snapshot marked for deletion,
		 * where the boundary cases are the first snapshot in the list
		 * and the last snapshot in the list: if we're deleting the last
		 * snapshot in the list, there's no next snapshot, the snapshot
		 * will be merged into the live tree.
		 *
		 * Put the other way around, the entry is skipped when it is
		 * not being deleted and it is either the first entry, the
		 * snapshot being added, or preceded by an entry that is not
		 * being deleted.
		 */
		if (!F_ISSET(snap, WT_SNAP_DELETE) &&
		    (snap == snapbase ||
		    F_ISSET(snap, WT_SNAP_ADD) ||
		    !F_ISSET(snap - 1, WT_SNAP_DELETE)))
			continue;
		deleting = 1;

		/*
		 * Allocate a snapshot structure, crack the cookie and read the
		 * snapshot's extent lists.
		 *
		 * Ignore the avail list: snapshot avail lists are only useful
		 * if we are rolling forward from the particular snapshot and
		 * they represent our best understanding of what blocks can be
		 * allocated.  If we are not operating on the live snapshot,
		 * subsequent snapshots might have allocated those blocks, and
		 * the avail list is useless.  We don't discard it, because it
		 * is useful as part of verification, but we don't re-write it
		 * either.
		 *
		 * Note "si" is re-pointed at the per-snapshot structure inside
		 * this loop; it is reset to the live structure at the
		 * live_update label before it is used for the live system
		 * again.
		 */
		WT_ERR(__wt_calloc(
		    session, 1, sizeof(WT_BLOCK_SNAPSHOT), &snap->bpriv));
		si = snap->bpriv;
		WT_ERR(__wt_block_snap_init(session, block, si, snap->name, 0));
		WT_ERR(__wt_block_buffer_to_snapshot(
		    session, block, snap->raw.data, si));
		WT_ERR(__wt_block_extlist_read(session, block, &si->alloc));
		WT_ERR(__wt_block_extlist_read(session, block, &si->discard));
	}

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if snapshots take too
	 * much time away from real work: we read historic snapshot information
	 * without a lock, but we could also merge and re-write the delete
	 * snapshot information without a lock, except for ranges merged into
	 * the live tree.
	 *
	 * The "locked" flag tells the exit path whether to unlock: errors in
	 * the loop above reach the exit path before the lock is taken.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = 1;

	/* Skip the additional processing if we aren't deleting snapshots. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed snapshots: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap) {
		if (!F_ISSET(snap, WT_SNAP_DELETE))
			continue;

		if (WT_VERBOSE_ISSET(session, snapshot)) {
			/* One scratch buffer is reused for every message. */
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__snapshot_string(
			    session, block, snap->raw.data, tmp));
			WT_VERBOSE_ERR(session, snapshot,
			    "%s: delete-snapshot: %s: %s",
			    block->name, snap->name, (char *)tmp->data);
		}

		/*
		 * Set the from/to snapshot structures, where the "to" value
		 * may be the live tree.
		 *
		 * NOTE(review): "snap + 1" assumes a deleted snapshot is always
		 * followed by another array entry (at the latest, the snapshot
		 * being added) -- confirm against the code building the list.
		 */
		a = snap->bpriv;
		if (F_ISSET(snap + 1, WT_SNAP_ADD))
			b = &block->live;
		else
			b = (snap + 1)->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the snapshot's discard list, however, not the live system's
		 * list because it appears on the snapshot's alloc list and so
		 * must be paired in the snapshot.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" snapshot's extent
		 * lists directly to the live system's avail list, they were
		 * never on any alloc list.  Include the "from" snapshot's
		 * avail list, it's going away.
		 */
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * snapshot's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->discard, &b->discard));

		/*
		 * If the "to" snapshot is also being deleted, we're done with
		 * it, it's merged into some other snapshot in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * snapshots, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(snap + 1, WT_SNAP_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" snapshot's allocate
		 * and discard lists overlap is fair game, move ranges appearing
		 * on both lists to the live snapshot's newly available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(snap + 1, WT_SNAP_ADD))
			continue;

		/*
		 * We have to write the "to" snapshot's extent lists out in new
		 * blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" snapshot's extent lists
		 * directly to the live system's avail list, they were never on
		 * any alloc list.  Do not include the "to" snapshot's avail
		 * list, it's not changing.
		 */
		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->discard));

		/* The re-write itself happens in the loop that follows. */
		F_SET(snap + 1, WT_SNAP_UPDATE);
	}

	/* Update snapshots marked for update. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (F_ISSET(snap, WT_SNAP_UPDATE)) {
			/* The added snapshot is written separately, below. */
			WT_ASSERT(session, !F_ISSET(snap, WT_SNAP_ADD));
			WT_ERR(__snapshot_update(
			    session, block, snap, snap->bpriv, 0, 0));
		}

live_update:
	/* From here on "si" is the live system's structure again. */
	si = &block->live;

	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &si->avail));

	/* Update the final, added snapshot based on the live system. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (F_ISSET(snap, WT_SNAP_ADD)) {
			/*
			 * Pass the size computed at the top of the function,
			 * before any merging changed the byte counts.
			 */
			WT_ERR(__snapshot_update(
			    session, block, snap, si, snapshot_size, 1));

			/*
			 * XXX
			 * Our caller wants two pieces of information: the time
			 * the snapshot was taken and the final snapshot size.
			 * This violates layering but the alternative is a call
			 * for the btree layer to crack the snapshot cookie into
			 * its components, and that's a fair amount of work.
			 * (We could just read the system time in the session
			 * layer when updating the metadata file, but that won't
			 * work for the snapshot size, and so we do both here.)
			 */
			snap->snapshot_size = si->snapshot_size;
			WT_ERR(__wt_epoch(session, &snap->sec, NULL));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.
	 */
	__wt_block_extlist_free(session, &si->alloc);
	WT_ERR(__wt_block_extlist_init(session, &si->alloc, "live", "alloc"));
	__wt_block_extlist_free(session, &si->discard);
	WT_ERR(
	    __wt_block_extlist_init(session, &si->discard, "live", "discard"));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first snapshot in the system should always have an empty discard
	 * list.  If we've read that snapshot and/or created it, check.
	 *
	 * The loop stops at the first entry not marked for deletion; if no
	 * block-manager structure was loaded for that entry, the live
	 * system's structure is checked instead.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (!F_ISSET(snap, WT_SNAP_DELETE))
			break;
	if ((a = snap->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0) {
		__wt_errx(session,
		    "snapshot incorrectly has blocks on the discard list");
		WT_ERR(WT_ERROR);
	}
#endif

	/*
	 * Shared exit path: reached by falling through on success as well as
	 * from the error macros.
	 */
err:	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/*
	 * Discard any snapshot information we loaded, we no longer need it.
	 *
	 * Only the extent lists are freed here; the structure referenced by
	 * snap->bpriv is left allocated (presumably released together with the
	 * snapshot list by its owner -- TODO confirm).
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if ((si = snap->bpriv) != NULL) {
			__wt_block_extlist_free(session, &si->alloc);
			__wt_block_extlist_free(session, &si->avail);
			__wt_block_extlist_free(session, &si->discard);
		}

	__wt_scr_free(&tmp);
	return (ret);
}