/* * __wt_lsm_tree_switch -- * Switch to a new in-memory tree. */ int __wt_lsm_tree_switch( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_DECL_RET; WT_LSM_CHUNK *chunk, **cp; uint32_t in_memory, new_id; new_id = WT_ATOMIC_ADD(lsm_tree->last, 1); if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) > lsm_tree->chunk_alloc) WT_ERR(__wt_realloc(session, &lsm_tree->chunk_alloc, WT_MAX(10 * sizeof(*lsm_tree->chunk), 2 * lsm_tree->chunk_alloc), &lsm_tree->chunk)); /* * In the steady state, we expect that the checkpoint worker thread * will keep up with inserts. If not, we throttle the insert rate to * avoid filling the cache with in-memory chunks. Threads sleep every * 100 operations, so take that into account in the calculation. */ for (in_memory = 1, cp = lsm_tree->chunk + lsm_tree->nchunks - 1; in_memory < lsm_tree->nchunks && !F_ISSET(*cp, WT_LSM_CHUNK_ONDISK); ++in_memory, --cp) ; if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 2) lsm_tree->throttle_sleep = 0; else if (in_memory == lsm_tree->nchunks || F_ISSET(*cp, WT_LSM_CHUNK_STABLE)) { /* * No checkpoint has completed this run. Keep slowing down * inserts until one does. */ lsm_tree->throttle_sleep = WT_MAX(20, 2 * lsm_tree->throttle_sleep); } else { chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]; lsm_tree->throttle_sleep = (long)((in_memory - 2) * WT_TIMEDIFF(chunk->create_ts, (*cp)->create_ts) / (20 * in_memory * chunk->count)); } WT_VERBOSE_ERR(session, lsm, "Tree switch to: %d, throttle %d", new_id, (int)lsm_tree->throttle_sleep); WT_ERR(__wt_calloc_def(session, 1, &chunk)); chunk->id = new_id; lsm_tree->chunk[lsm_tree->nchunks++] = chunk; WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk)); ++lsm_tree->dsk_gen; F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH); WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); err: /* TODO: mark lsm_tree bad on error(?) */ return (ret); }
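/*
 * A standalone sketch of the insert-throttling idea above -- not WiredTiger
 * code, all names are invented for illustration.  When in-memory chunks pile
 * up faster than the checkpoint thread can flush them, spread the observed
 * chunk-fill time across the chunk's operation count so inserting threads
 * sleep just enough for flushing to keep pace; with no timing information,
 * fall back to exponential backoff.
 */
#include <stdint.h>

/* Hypothetical throttle calculation: returns a per-operation sleep in usecs. */
static uint64_t
example_throttle_usecs(uint32_t in_memory_chunks,
    uint64_t fill_time_ns, uint64_t chunk_record_count, uint64_t prev_sleep)
{
	/* One or two in-memory chunks is the steady state: no throttling. */
	if (in_memory_chunks <= 2)
		return (0);

	/*
	 * If no chunk has been flushed yet there is nothing to measure: keep
	 * doubling the sleep until a checkpoint completes.
	 */
	if (fill_time_ns == 0 || chunk_record_count == 0)
		return (prev_sleep < 20 ? 20 : 2 * prev_sleep);

	/*
	 * Otherwise, scale the fill time by how far behind we are and divide
	 * by the number of operations the chunk absorbed; the constant
	 * reflects that application threads only sleep every N operations.
	 */
	return ((in_memory_chunks - 2) * (fill_time_ns / 1000) /
	    (20 * in_memory_chunks * chunk_record_count));
}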
/* * __wt_block_compact_skip -- * Return if compaction will shrink the file. */ int __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp) { WT_DECL_RET; WT_EXT *ext; WT_EXTLIST *el; WT_FH *fh; off_t avail, ninety; *skipp = 1; /* Return a default skip. */ fh = block->fh; /* * We do compaction by copying blocks from the end of the file to the * beginning of the file, and we need some metrics to decide if it's * worth doing. Ignore small files, and files where we are unlikely * to recover 10% of the file. */ if (fh->size <= 10 * 1024) return (0); __wt_spin_lock(session, &block->live_lock); if (WT_VERBOSE_ISSET(session, compact)) WT_ERR(__block_dump_avail(session, block)); /* Sum the number of available bytes in the first 90% of the file. */ avail = 0; ninety = fh->size - fh->size / 10; el = &block->live.avail; WT_EXT_FOREACH(ext, el->off) if (ext->off < ninety) avail += ext->size; /* * If at least 10% of the total file is available and in the first 90% * of the file, we'll try compaction. */ if (avail >= fh->size / 10) *skipp = 0; WT_VERBOSE_ERR(session, compact, "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " "90%% of the file, require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") to perform compaction, compaction %s", block->name, (uintmax_t)avail / WT_MEGABYTE, (uintmax_t)avail, (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10, *skipp ? "skipped" : "proceeding"); err: __wt_spin_unlock(session, &block->live_lock); return (ret); }
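/*
 * A simplified, hypothetical version of the 10%/90% compaction heuristic
 * above, using a plain array of free extents instead of the block manager's
 * lists (types and names invented).  Compaction copies blocks from the end of
 * the file toward the front, so it only pays off if at least 10% of the file
 * is free space that lives in the first 90% of the file.
 */
#include <stddef.h>
#include <stdint.h>

struct example_extent {			/* One free range in the file. */
	int64_t off;			/* Starting byte offset */
	int64_t size;			/* Length in bytes */
};

/* Return nonzero if compaction should be skipped for this file. */
static int
example_compact_skip(int64_t file_size,
    const struct example_extent *avail, size_t navail)
{
	int64_t free_in_front, ninety;
	size_t i;

	/* Ignore small files: there is nothing worth recovering. */
	if (file_size <= 10 * 1024)
		return (1);

	/* Sum the free bytes that fall within the first 90% of the file. */
	ninety = file_size - file_size / 10;
	free_in_front = 0;
	for (i = 0; i < navail; ++i)
		if (avail[i].off < ninety)
			free_in_front += avail[i].size;

	/* Proceed only if we can plausibly shrink the file by 10% or more. */
	return (free_in_front < file_size / 10);
}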
/* * __desc_read -- * Read and verify the file's metadata. */ static int __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block) { WT_BLOCK_DESC *desc; WT_DECL_ITEM(buf); WT_DECL_RET; uint32_t cksum; /* Use a scratch buffer to get correct alignment for direct I/O. */ WT_RET(__wt_scr_alloc(session, block->allocsize, &buf)); /* Read the first allocation-sized block and verify the file format. */ WT_ERR(__wt_read( session, block->fh, (off_t)0, block->allocsize, buf->mem)); desc = buf->mem; WT_VERBOSE_ERR(session, block, "%s: magic %" PRIu32 ", major/minor: %" PRIu32 "/%" PRIu32 ", checksum %#" PRIx32, block->name, desc->magic, desc->majorv, desc->minorv, desc->cksum); /* * We fail the open if the checksum fails, or the magic number is wrong * or the major/minor numbers are unsupported for this version. This * test is done even if the caller is verifying or salvaging the file: * it makes sense for verify, and for salvage we don't overwrite files * without some reason to believe they are WiredTiger files. The user * may have entered the wrong file name, and is now frantically pounding * their interrupt key. */ cksum = desc->cksum; desc->cksum = 0; if (desc->magic != WT_BLOCK_MAGIC || cksum != __wt_cksum(desc, block->allocsize)) WT_ERR_MSG(session, WT_ERROR, "%s does not appear to be a WiredTiger file", block->name); if (desc->majorv > WT_BLOCK_MAJOR_VERSION || (desc->majorv == WT_BLOCK_MAJOR_VERSION && desc->minorv > WT_BLOCK_MINOR_VERSION)) WT_ERR_MSG(session, WT_ERROR, "unsupported WiredTiger file version: this build only " "supports major/minor versions up to %d/%d, and the file " "is version %d/%d", WT_BLOCK_MAJOR_VERSION, WT_BLOCK_MINOR_VERSION, desc->majorv, desc->minorv); err: __wt_scr_free(&buf); return (ret); }
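/*
 * The descriptor check above relies on a common trick: the checksum stored in
 * a header is computed with the checksum field itself zeroed, so verification
 * saves the stored value, zeroes the field, recomputes and compares.  A
 * hypothetical, standalone sketch of that pattern -- the struct layout, magic
 * number and FNV-style hash are stand-ins, not the real on-disk format or CRC.
 */
#include <stdint.h>
#include <string.h>

#define	EXAMPLE_MAGIC	0xC0DE55AAU

struct example_desc {
	uint32_t magic;			/* File identification magic */
	uint32_t majorv;		/* Format major version */
	uint32_t minorv;		/* Format minor version */
	uint32_t cksum;			/* Checksum, taken with field zeroed */
};

/* Stand-in checksum: FNV-1a over a byte range (the real code uses a CRC). */
static uint32_t
example_cksum(const void *p, size_t len)
{
	const uint8_t *b = p;
	uint32_t h = 2166136261U;
	size_t i;

	for (i = 0; i < len; ++i) {
		h ^= b[i];
		h *= 16777619U;
	}
	return (h);
}

/* Return 0 if the header block looks like one of our files, -1 otherwise. */
static int
example_desc_verify(void *header_block, size_t block_size)
{
	struct example_desc *desc = header_block;
	uint32_t saved;

	if (block_size < sizeof(*desc))
		return (-1);

	saved = desc->cksum;
	desc->cksum = 0;		/* Checksum was taken with this field zeroed. */
	if (desc->magic != EXAMPLE_MAGIC ||
	    saved != example_cksum(header_block, block_size))
		return (-1);
	desc->cksum = saved;		/* Restore the header for the caller. */
	return (0);
}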
/* * __wt_lsm_tree_switch -- * Switch to a new in-memory tree. */ int __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_DECL_RET; WT_LSM_CHUNK *chunk; uint32_t nchunks, new_id; WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 1)); /* * Check if a switch is still needed: we may have raced while waiting * for a lock. */ if ((nchunks = lsm_tree->nchunks) != 0 && (chunk = lsm_tree->chunk[nchunks - 1]) != NULL && !F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) goto err; /* Update the throttle time. */ __wt_lsm_tree_throttle(session, lsm_tree); new_id = WT_ATOMIC_ADD(lsm_tree->last, 1); WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc, nchunks + 1, &lsm_tree->chunk)); WT_VERBOSE_ERR(session, lsm, "Tree switch to: %" PRIu32 ", throttle %ld", new_id, lsm_tree->throttle_sleep); WT_ERR(__wt_calloc_def(session, 1, &chunk)); chunk->id = new_id; chunk->txnid_max = WT_TXN_NONE; lsm_tree->chunk[lsm_tree->nchunks++] = chunk; WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk)); WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH); ++lsm_tree->dsk_gen; lsm_tree->modified = 1; err: /* TODO: mark lsm_tree bad on error(?) */ WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree)); return (ret); }
/*
 * __track_msg --
 *	Output a verbose message and associated page and address pair.
 */
static int
__track_msg(WT_SESSION_IMPL *session,
    WT_PAGE *page, const char *msg, WT_PAGE_TRACK *track)
{
	WT_DECL_RET;
	WT_DECL_ITEM(buf);
	char f[64];

	WT_RET(__wt_scr_alloc(session, 64, &buf));

	WT_VERBOSE_ERR(session, reconcile, "page %p %s (%s) %" PRIu32 "B @%s",
	    page, msg,
	    __wt_track_string(track, f, sizeof(f)), track->size,
	    __wt_addr_string(
		session, buf, track->addr.addr, track->addr.size));

err:	__wt_scr_free(&buf);
	return (ret);
}
/*
 * __wt_rwlock_alloc --
 *	Allocate and initialize a read/write lock.
 */
int
__wt_rwlock_alloc(
    WT_SESSION_IMPL *session, const char *name, WT_RWLOCK **rwlockp)
{
	WT_DECL_RET;
	WT_RWLOCK *rwlock;

	WT_RET(__wt_calloc(session, 1, sizeof(WT_RWLOCK), &rwlock));
	WT_ERR_TEST(pthread_rwlock_init(&rwlock->rwlock, NULL), WT_ERROR);

	rwlock->name = name;
	*rwlockp = rwlock;

	WT_VERBOSE_ERR(session, mutex,
	    "rwlock: alloc %s (%p)", rwlock->name, rwlock);

	if (0) {
err:		__wt_free(session, rwlock);
	}
	return (ret);
}
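/*
 * The "if (0) { err: ... }" construct above is a cleanup idiom: the error
 * label and its cleanup code sit inside a dead branch, so the success path
 * falls straight through to the shared return while any failing step jumps
 * into the cleanup.  A minimal, hypothetical standalone sketch of the same
 * pattern (invented names, plain calloc/free in place of the WiredTiger
 * allocators):
 */
#include <errno.h>
#include <stdlib.h>

struct example_lock {
	const char *name;
	/* ... a platform lock object would live here ... */
};

static int
example_lock_alloc(const char *name, struct example_lock **lockp)
{
	struct example_lock *lock;
	int ret = 0;

	if ((lock = calloc(1, sizeof(*lock))) == NULL)
		return (ENOMEM);

	/* Stand-in for further initialization steps that can fail. */
	if (name == NULL) {
		ret = EINVAL;
		goto err;
	}

	lock->name = name;
	*lockp = lock;

	if (0) {
err:		/* Only reached via the goto above: undo the allocation. */
		free(lock);
	}
	return (ret);
}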
/* * __wt_block_checkpoint_load -- * Load a checkpoint. */ int __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size, int readonly) { WT_BLOCK_CKPT *ci; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_UNUSED(addr_size); /* * Sometimes we don't find a root page (we weren't given a checkpoint, * or the referenced checkpoint was empty). In that case we return a * root page size of 0. Set that up now. */ dsk->size = 0; ci = &block->live; WT_RET(__wt_block_ckpt_init(session, block, ci, "live", 1)); if (WT_VERBOSE_ISSET(session, ckpt)) { if (addr != NULL) { WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string(session, block, addr, tmp)); } WT_VERBOSE_ERR(session, ckpt, "%s: load-checkpoint: %s", block->name, addr == NULL ? "[Empty]" : (char *)tmp->data); } /* If not loading a checkpoint from disk, we're done. */ if (addr == NULL || addr_size == 0) return (0); /* Crack the checkpoint cookie. */ if (addr != NULL) WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci)); /* Verify sets up next. */ if (block->verify) WT_ERR(__wt_verify_ckpt_load(session, block, ci)); /* Read, and optionally verify, any root page. */ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) { WT_ERR(__wt_block_read_off(session, block, dsk, ci->root_offset, ci->root_size, ci->root_cksum)); if (block->verify) { if (tmp == NULL) { WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string( session, block, addr, tmp)); } WT_ERR( __wt_verify_dsk(session, (char *)tmp->data, dsk)); } } /* * Rolling a checkpoint forward requires the avail list, the blocks from * which we can allocate. */ if (!readonly) WT_ERR( __wt_block_extlist_read_avail(session, block, &ci->avail)); /* * If the checkpoint can be written, that means anything written after * the checkpoint is no longer interesting, truncate the file. Don't * bother checking the avail list for a block at the end of the file, * that was done when the checkpoint was first written (re-writing the * checkpoint might possibly make it relevant here, but it's unlikely * enough that I'm not bothering). */ if (!readonly) { WT_VERBOSE_ERR(session, ckpt, "truncate file to %" PRIuMAX, (uintmax_t)ci->file_size); WT_ERR(__wt_ftruncate(session, block->fh, ci->file_size)); } if (0) { err: (void)__wt_block_checkpoint_unload(session, block); } __wt_scr_free(&tmp); return (ret); }
/* * __ckpt_update -- * Update a checkpoint. */ static int __ckpt_update( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci, uint64_t ckpt_size, int is_live) { WT_EXTLIST *alloc; WT_DECL_ITEM(tmp); WT_DECL_RET; uint8_t *endp; #ifdef HAVE_DIAGNOSTIC /* Check the extent list combinations for overlaps. */ WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail)); WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail)); WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard)); #endif /* * Write the checkpoint's alloc and discard extent lists. After each * write, remove any allocated blocks from the system's allocation * list, checkpoint extent blocks don't appear on any extent lists. */ alloc = &block->live.alloc; WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL)); if (ci->alloc.offset != WT_BLOCK_INVALID_OFFSET) WT_RET(__wt_block_off_remove_overlap( session, alloc, ci->alloc.offset, ci->alloc.size)); WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL)); if (ci->discard.offset != WT_BLOCK_INVALID_OFFSET) WT_RET(__wt_block_off_remove_overlap( session, alloc, ci->discard.offset, ci->discard.size)); /* * We only write an avail list for the live system, other checkpoint's * avail lists are static and never change. * * Write the avail list last so it reflects changes due to allocating * blocks for the alloc and discard lists. Second, when we write the * live system's avail list, it's two lists: the current avail list * plus the list of blocks to be made available when the new checkpoint * completes. We can't merge that second list into the real list yet, * it's not truly available until the new checkpoint locations have been * saved to the metadata. */ if (is_live) { WT_RET(__wt_block_extlist_write( session, block, &ci->avail, &ci->ckpt_avail)); if (ci->avail.offset != WT_BLOCK_INVALID_OFFSET) WT_RET(__wt_block_off_remove_overlap( session, alloc, ci->avail.offset, ci->avail.size)); } /* * Set the file size for the live system. * * XXX * We do NOT set the file size when re-writing checkpoints because we * want to test the checkpoint's blocks against a reasonable maximum * file size during verification. This is bad: imagine a checkpoint * appearing early in the file, re-written, and then the checkpoint * requires blocks at the end of the file, blocks after the listed file * size. If the application opens that checkpoint for writing * (discarding subsequent checkpoints), we would truncate the file to * the early chunk, discarding the re-written checkpoint information. * The alternative, updating the file size has its own problems, in * that case we'd work correctly, but we'd lose all of the blocks * between the original checkpoint and the re-written checkpoint. * Currently, there's no API to roll-forward intermediate checkpoints, * if there ever is, this will need to be fixed. */ if (is_live) WT_RET(__wt_filesize(session, block->fh, &ci->file_size)); /* Set the checkpoint size for the live system. */ if (is_live) ci->ckpt_size = ckpt_size; /* * Copy the checkpoint information into the checkpoint array's address * cookie. 
*/ WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE)); endp = ckpt->raw.mem; WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci)); ckpt->raw.size = WT_PTRDIFF32(endp, ckpt->raw.mem); if (WT_VERBOSE_ISSET(session, ckpt)) { WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp)); WT_VERBOSE_ERR(session, ckpt, "%s: create-checkpoint: %s: %s", block->name, ckpt->name, (char *)tmp->data); } err: __wt_scr_free(&tmp); return (ret); }
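/*
 * The block above serializes the checkpoint into an opaque "address cookie"
 * by packing fields into a byte buffer and taking the written length from the
 * advanced end pointer.  A simplified, hypothetical version of that pattern:
 * fixed-width little-endian fields rather than the real variable-length
 * encoding, and invented names throughout.
 */
#include <stddef.h>
#include <stdint.h>

/* Append a 64-bit value to the buffer, little-endian, advancing *pp. */
static void
example_pack_u64(uint8_t **pp, uint64_t v)
{
	int i;

	for (i = 0; i < 8; ++i) {
		*(*pp)++ = (uint8_t)(v & 0xff);
		v >>= 8;
	}
}

/*
 * Pack an (offset, size, checksum) triple into a cookie buffer; the caller
 * learns the cookie length from the returned byte count.
 */
static size_t
example_addr_to_cookie(
    uint8_t *buf, uint64_t offset, uint64_t size, uint64_t cksum)
{
	uint8_t *endp = buf;

	example_pack_u64(&endp, offset);
	example_pack_u64(&endp, size);
	example_pack_u64(&endp, cksum);

	return ((size_t)(endp - buf));	/* Same trick as WT_PTRDIFF32 above. */
}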
/* * __ckpt_process -- * Process the list of checkpoints. */ static int __ckpt_process( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) { WT_BLOCK_CKPT *a, *b, *ci; WT_CKPT *ckpt, *next_ckpt; WT_DECL_ITEM(tmp); WT_DECL_RET; uint64_t ckpt_size; int deleting, locked; ci = &block->live; locked = 0; /* * We've allocated our last page, update the checkpoint size. We need * to calculate the live system's checkpoint size before reading and * merging checkpoint allocation and discard information from the * checkpoints we're deleting, those operations change the underlying * byte counts. */ ckpt_size = ci->ckpt_size; ckpt_size += ci->alloc.bytes; ckpt_size -= ci->discard.bytes; /* * Extents newly available as a result of deleting previous checkpoints * are added to a list of extents. The list should be empty, but there * is no explicit "free the checkpoint information" call into the block * manager; if there was an error in an upper level resulting in some * previous checkpoint never being resolved, the list may not be empty. * * XXX * This isn't sufficient, actually: we're going to leak all the blocks * written as part of the last checkpoint because it was never resolved. */ __wt_block_extlist_free(session, &ci->ckpt_avail); WT_RET(__wt_block_extlist_init( session, &ci->ckpt_avail, "live", "ckpt_avail")); /* * To delete a checkpoint, we'll need checkpoint information for it and * the subsequent checkpoint into which it gets rolled; read them from * disk before we lock things down. */ deleting = 0; WT_CKPT_FOREACH(ckptbase, ckpt) { if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; deleting = 1; /* * Read the checkpoint and next checkpoint extent lists if we * haven't already read them (we may have already read these * extent blocks if there is more than one deleted checkpoint). */ if (ckpt->bpriv == NULL) WT_ERR(__ckpt_extlist_read(session, block, ckpt)); for (next_ckpt = ckpt + 1;; ++next_ckpt) if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) break; /* * The "next" checkpoint may be the live tree which has no * extent blocks to read. */ if (next_ckpt->bpriv == NULL && !F_ISSET(next_ckpt, WT_CKPT_ADD)) WT_ERR(__ckpt_extlist_read(session, block, next_ckpt)); } /* * Hold a lock so the live extent lists and the file size can't change * underneath us. I suspect we'll tighten this if checkpoints take too * much time away from real work: we read the historic checkpoint * information without a lock, but we could also merge and re-write the * delete checkpoint information without a lock, except for ranges * merged into the live tree. */ __wt_spin_lock(session, &block->live_lock); locked = 1; /* Skip the additional processing if we aren't deleting checkpoints. */ if (!deleting) goto live_update; /* * Delete any no-longer-needed checkpoints: we do this first as it frees * blocks to the live lists, and the freed blocks will then be included * when writing the live extent lists. */ WT_CKPT_FOREACH(ckptbase, ckpt) { if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; if (WT_VERBOSE_ISSET(session, ckpt)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string( session, block, ckpt->raw.data, tmp)); WT_VERBOSE_ERR(session, ckpt, "%s: delete-checkpoint: %s: %s", block->name, ckpt->name, (char *)tmp->data); } /* * Find the checkpoint into which we'll roll this checkpoint's * blocks: it's the next real checkpoint in the list, and it * better have been read in (if it's not the add slot). 
*/ for (next_ckpt = ckpt + 1;; ++next_ckpt) if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) break; /* * Set the from/to checkpoint structures, where the "to" value * may be the live tree. */ a = ckpt->bpriv; if (F_ISSET(next_ckpt, WT_CKPT_ADD)) b = &block->live; else b = next_ckpt->bpriv; /* * Free the root page: there's nothing special about this free, * the root page is allocated using normal rules, that is, it * may have been taken from the avail list, and was entered on * the live system's alloc list at that time. We free it into * the checkpoint's discard list, however, not the live system's * list because it appears on the checkpoint's alloc list and so * must be paired in the checkpoint. */ if (a->root_offset != WT_BLOCK_INVALID_OFFSET) WT_ERR(__wt_block_insert_ext(session, &a->discard, a->root_offset, a->root_size)); /* * Free the blocks used to hold the "from" checkpoint's extent * lists, including the avail list. */ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc)); WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail)); WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard)); /* * Roll the "from" alloc and discard extent lists into the "to" * checkpoint's lists. */ if (a->alloc.entries != 0) WT_ERR(__wt_block_extlist_merge( session, &a->alloc, &b->alloc)); if (a->discard.entries != 0) WT_ERR(__wt_block_extlist_merge( session, &a->discard, &b->discard)); /* * If the "to" checkpoint is also being deleted, we're done with * it, it's merged into some other checkpoint in the next loop. * This means the extent lists may aggregate over a number of * checkpoints, but that's OK, they're disjoint sets of ranges. */ if (F_ISSET(next_ckpt, WT_CKPT_DELETE)) continue; /* * Find blocks for re-use: wherever the "to" checkpoint's * allocate and discard lists overlap, move the range to * the live system's checkpoint available list. */ WT_ERR(__wt_block_extlist_overlap(session, block, b)); /* * If we're updating the live system's information, we're done. */ if (F_ISSET(next_ckpt, WT_CKPT_ADD)) continue; /* * We have to write the "to" checkpoint's extent lists out in * new blocks, and update its cookie. * * Free the blocks used to hold the "to" checkpoint's extent * lists; don't include the avail list, it's not changing. */ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc)); WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard)); F_SET(next_ckpt, WT_CKPT_UPDATE); } /* Update checkpoints marked for update. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_UPDATE)) { WT_ASSERT(session, !F_ISSET(ckpt, WT_CKPT_ADD)); WT_ERR(__ckpt_update( session, block, ckpt, ckpt->bpriv, 0, 0)); } live_update: ci = &block->live; /* Truncate the file if that's possible. */ WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail)); /* Update the final, added checkpoint based on the live system. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_ADD)) { WT_ERR(__ckpt_update( session, block, ckpt, ci, ckpt_size, 1)); /* * XXX * Our caller wants the final checkpoint size. Setting * the size here violates layering, but the alternative * is a call for the btree layer to crack the checkpoint * cookie into its components, and that's a fair amount * of work. */ ckpt->ckpt_size = ci->ckpt_size; } /* * Reset the live system's alloc and discard extent lists, leave the * avail list alone. 
*/ __wt_block_extlist_free(session, &ci->alloc); WT_ERR(__wt_block_extlist_init(session, &ci->alloc, "live", "alloc")); __wt_block_extlist_free(session, &ci->discard); WT_ERR( __wt_block_extlist_init(session, &ci->discard, "live", "discard")); #ifdef HAVE_DIAGNOSTIC /* * The first checkpoint in the system should always have an empty * discard list. If we've read that checkpoint and/or created it, * check. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (!F_ISSET(ckpt, WT_CKPT_DELETE)) break; if ((a = ckpt->bpriv) == NULL) a = &block->live; if (a->discard.entries != 0) { __wt_errx(session, "first checkpoint incorrectly has blocks on the discard " "list"); WT_ERR(WT_ERROR); } #endif err: if (locked) __wt_spin_unlock(session, &block->live_lock); /* Discard any checkpoint information we loaded. */ WT_CKPT_FOREACH(ckptbase, ckpt) if ((ci = ckpt->bpriv) != NULL) __wt_block_ckpt_destroy(session, ci); __wt_scr_free(&tmp); return (ret); }
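/*
 * Deleting a checkpoint above rolls its alloc and discard extent lists into
 * the following checkpoint's lists.  The core operation is a merge of two
 * offset-sorted lists of disjoint ranges; a hypothetical array-based sketch
 * follows (the real code maintains skiplists and coalesces adjacent ranges,
 * which this sketch does not attempt).
 */
#include <stddef.h>
#include <stdint.h>

struct example_ext {
	int64_t off;
	int64_t size;
};

/*
 * Merge two offset-sorted extent arrays into 'out' (which must have room for
 * na + nb entries); returns the number of entries written.
 */
static size_t
example_extlist_merge(const struct example_ext *a, size_t na,
    const struct example_ext *b, size_t nb, struct example_ext *out)
{
	size_t i, j, n;

	for (i = j = n = 0; i < na || j < nb;)
		if (j >= nb || (i < na && a[i].off < b[j].off))
			out[n++] = a[i++];
		else
			out[n++] = b[j++];
	return (n);
}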
/* * __wt_block_snapshot_load -- * Load a snapshot. */ int __wt_block_snapshot_load(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size, int readonly) { WT_BLOCK_SNAPSHOT *si; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_UNUSED(addr_size); /* * Sometimes we don't find a root page (we weren't given a snapshot, * or the referenced snapshot was empty). In that case we return a * root page size of 0. Set that up now. */ dsk->size = 0; si = &block->live; WT_RET(__wt_block_snap_init(session, block, si, "live", 1)); if (WT_VERBOSE_ISSET(session, snapshot)) { if (addr != NULL) { WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__snapshot_string(session, block, addr, tmp)); } WT_VERBOSE_ERR(session, snapshot, "%s: load-snapshot: %s", block->name, addr == NULL ? "[Empty]" : (char *)tmp->data); } /* If not loading a snapshot from disk, we're done. */ if (addr == NULL || addr_size == 0) return (0); /* Crack the snapshot cookie. */ if (addr != NULL) WT_ERR(__wt_block_buffer_to_snapshot(session, block, addr, si)); /* Verify sets up next. */ if (block->verify) WT_ERR(__wt_verify_snap_load(session, block, si)); /* Read, and optionally verify, any root page. */ if (si->root_offset != WT_BLOCK_INVALID_OFFSET) { WT_ERR(__wt_block_read_off(session, block, dsk, si->root_offset, si->root_size, si->root_cksum)); if (block->verify) { if (tmp == NULL) { WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__snapshot_string( session, block, addr, tmp)); } WT_ERR( __wt_verify_dsk(session, (char *)tmp->data, dsk)); } } /* * Rolling a snapshot forward requires the avail list, the blocks from * which we can allocate. */ if (!readonly) WT_ERR(__wt_block_extlist_read(session, block, &si->avail)); /* * If the snapshot can be written, that means anything written after * the snapshot is no longer interesting. Truncate the file. */ if (!readonly) { WT_VERBOSE_ERR(session, snapshot, "truncate file to %" PRIuMAX, (uintmax_t)si->file_size); WT_ERR(__wt_ftruncate(session, block->fh, si->file_size)); } if (0) { err: (void)__wt_block_snapshot_unload(session, block); } __wt_scr_free(&tmp); return (ret); }
/* * __wt_lsm_merge_worker -- * The merge worker thread for an LSM tree, responsible for merging * on-disk trees. */ void * __wt_lsm_merge_worker(void *vargs) { WT_DECL_RET; WT_LSM_WORKER_ARGS *args; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; u_int aggressive, chunk_wait, id, old_aggressive, stallms; int progress; args = vargs; lsm_tree = args->lsm_tree; id = args->id; session = lsm_tree->worker_sessions[id]; __wt_free(session, args); aggressive = stallms = 0; while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) { /* * Help out with switching chunks in case the checkpoint worker * is busy. */ if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) { WT_WITH_SCHEMA_LOCK(session, ret = __wt_lsm_tree_switch(session, lsm_tree)); WT_ERR(ret); } progress = 0; /* Clear any state from previous worker thread iterations. */ session->dhandle = NULL; /* Try to create a Bloom filter. */ if (__lsm_bloom_work(session, lsm_tree) == 0) progress = 1; /* If we didn't create a Bloom filter, try to merge. */ if (progress == 0 && __wt_lsm_merge(session, lsm_tree, id, aggressive) == 0) progress = 1; /* Clear any state from previous worker thread iterations. */ WT_CLEAR_BTREE_IN_SESSION(session); /* * Only have one thread freeing old chunks, and only if there * are chunks to free. */ if (id == 0 && lsm_tree->nold_chunks > 0 && __lsm_free_chunks(session, lsm_tree) == 0) progress = 1; if (progress) stallms = 0; else if (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) { /* Poll 10 times per second. */ WT_ERR_TIMEDOUT_OK(__wt_cond_wait( session, lsm_tree->work_cond, 100000)); stallms += 100; /* * Get aggressive if more than enough chunks for a * merge should have been created while we waited. * Use 10 seconds as a default if we don't have an * estimate. */ chunk_wait = stallms / (lsm_tree->chunk_fill_ms == 0 ? 10000 : lsm_tree->chunk_fill_ms); old_aggressive = aggressive; aggressive = chunk_wait / lsm_tree->merge_min; if (aggressive > old_aggressive) WT_VERBOSE_ERR(session, lsm, "LSM merge got aggressive (%u), " "%u / %" PRIu64, aggressive, stallms, lsm_tree->chunk_fill_ms); } } if (0) { err: __wt_err(session, ret, "LSM merge worker failed"); } return (NULL); }
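/*
 * The merge worker above ramps up its "aggressive" level based on how long it
 * has stalled relative to how quickly chunks are normally filled.  The same
 * arithmetic as a hypothetical standalone helper (names invented; 10 seconds
 * is the fallback fill estimate used when no measurement is available).
 */
#include <stdint.h>

static unsigned int
example_merge_aggressiveness(
    uint64_t stall_ms, uint64_t chunk_fill_ms, unsigned int merge_min)
{
	uint64_t chunks_expected;

	/* Without an estimate, assume a chunk fills every 10 seconds. */
	if (chunk_fill_ms == 0)
		chunk_fill_ms = 10000;
	/* Guard for this sketch; the real configuration is never zero. */
	if (merge_min == 0)
		merge_min = 1;

	/* How many chunks should have been created while we stalled? */
	chunks_expected = stall_ms / chunk_fill_ms;

	/*
	 * Get more aggressive for every full merge's worth of chunks that
	 * should have appeared without producing any work for this thread.
	 */
	return ((unsigned int)(chunks_expected / merge_min));
}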
/* * __lsm_free_chunks -- * Try to drop chunks from the tree that are no longer required. */ static int __lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_WORKER_COOKIE cookie; u_int i, skipped; int progress; /* * Take a copy of the current state of the LSM tree and look for chunks * to drop. We do it this way to avoid holding the LSM tree lock while * doing I/O or waiting on the schema lock. * * This is safe because only one thread will be in this function at a * time (the first merge thread). Merges may complete concurrently, * and the old_chunks array may be extended, but we shuffle down the * pointers each time we free one to keep the non-NULL slots at the * beginning of the array. */ WT_CLEAR(cookie); WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 1)); for (i = skipped = 0, progress = 0; i < cookie.nchunks; i++) { chunk = cookie.chunk_array[i]; WT_ASSERT(session, chunk != NULL); /* Skip the chunk if another worker is using it. */ if (chunk->refcnt > 1) { ++skipped; continue; } if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM)) { /* * An EBUSY return is acceptable - a cursor may still * be positioned on this old chunk. */ if ((ret = __lsm_drop_file( session, chunk->bloom_uri)) == EBUSY) { WT_VERBOSE_ERR(session, lsm, "LSM worker bloom drop busy: %s.", chunk->bloom_uri); ++skipped; continue; } else WT_ERR(ret); F_CLR_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM); } if (chunk->uri != NULL) { /* * An EBUSY return is acceptable - a cursor may still * be positioned on this old chunk. */ if ((ret = __lsm_drop_file( session, chunk->uri)) == EBUSY) { WT_VERBOSE_ERR(session, lsm, "LSM worker drop busy: %s.", chunk->uri); ++skipped; continue; } else WT_ERR(ret); } progress = 1; /* Lock the tree to clear out the old chunk information. */ WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1)); /* * The chunk we are looking at should be the first one in the * tree that we haven't already skipped over. */ WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk); __wt_free(session, chunk->bloom_uri); __wt_free(session, chunk->uri); __wt_free(session, lsm_tree->old_chunks[skipped]); /* Shuffle down to keep all occupied slots at the beginning. */ if (--lsm_tree->nold_chunks > skipped) { memmove(lsm_tree->old_chunks + skipped, lsm_tree->old_chunks + skipped + 1, (lsm_tree->nold_chunks - skipped) * sizeof(WT_LSM_CHUNK *)); lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL; } /* * Clear the chunk in the cookie so we don't attempt to * decrement the reference count. */ cookie.chunk_array[i] = NULL; /* * Update the metadata. We used to try to optimize by only * updating the metadata once at the end, but the error * handling is not straightforward. */ WT_TRET(__wt_lsm_meta_write(session, lsm_tree)); WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree)); } err: __lsm_unpin_chunks(session, &cookie); __wt_free(session, cookie.chunk_array); /* Returning non-zero means there is no work to do. */ if (!progress) WT_TRET(WT_NOTFOUND); return (ret); }
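/*
 * When an old chunk is freed above, the remaining pointers are shuffled down
 * with memmove so the non-NULL slots stay packed at the front of the array.
 * The same pattern in isolation, as a hypothetical helper:
 */
#include <stddef.h>
#include <string.h>

/* Remove slot 'i' from an array of 'n' pointers, keeping the rest packed. */
static size_t
example_remove_slot(void **array, size_t n, size_t i)
{
	if (i >= n)
		return (n);

	if (i + 1 < n)
		memmove(&array[i], &array[i + 1],
		    (n - i - 1) * sizeof(array[0]));
	array[--n] = NULL;		/* Clear the now-unused tail slot. */
	return (n);
}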
/* * __lsm_bloom_create -- * Create a bloom filter for a chunk of the LSM tree that has been * checkpointed but not yet been merged. */ static int __lsm_bloom_create(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off) { WT_BLOOM *bloom; WT_CURSOR *src; WT_DECL_RET; WT_ITEM buf, key; WT_SESSION *wt_session; uint64_t insert_count; int exist; /* * Normally, the Bloom URI is populated when the chunk struct is * allocated. After an open, however, it may not have been. * Deal with that here. */ if (chunk->bloom_uri == NULL) { WT_CLEAR(buf); WT_RET(__wt_lsm_tree_bloom_name( session, lsm_tree, chunk->id, &buf)); chunk->bloom_uri = __wt_buf_steal(session, &buf, NULL); } /* * Drop the bloom filter first - there may be some content hanging over * from an aborted merge or checkpoint. */ wt_session = &session->iface; WT_RET(__wt_exist(session, chunk->bloom_uri + strlen("file:"), &exist)); if (exist) WT_RET(wt_session->drop(wt_session, chunk->bloom_uri, "force")); bloom = NULL; /* * This is merge-like activity, and we don't want compacts to give up * because we are creating a bunch of bloom filters before merging. */ ++lsm_tree->merge_progressing; WT_RET(__wt_bloom_create(session, chunk->bloom_uri, lsm_tree->bloom_config, chunk->count, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom)); /* Open a special merge cursor just on this chunk. */ WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src)); F_SET(src, WT_CURSTD_RAW); WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1)); F_SET(session, WT_SESSION_NO_CACHE); for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { WT_ERR(src->get_key(src, &key)); WT_ERR(__wt_bloom_insert(bloom, &key)); } WT_ERR_NOTFOUND_OK(ret); WT_TRET(src->close(src)); WT_TRET(__wt_bloom_finalize(bloom)); WT_ERR(ret); F_CLR(session, WT_SESSION_NO_CACHE); /* Load the new Bloom filter into cache. */ WT_CLEAR(key); WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key)); WT_VERBOSE_ERR(session, lsm, "LSM worker created bloom filter %s. " "Expected %" PRIu64 " items, got %" PRIu64, chunk->bloom_uri, chunk->count, insert_count); /* Ensure the bloom filter is in the metadata. */ WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1)); F_SET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM bloom worker metadata write"); err: if (bloom != NULL) WT_TRET(__wt_bloom_close(bloom)); F_CLR(session, WT_SESSION_NO_CACHE); return (ret); }
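/*
 * The worker above builds a Bloom filter for a flushed chunk so later lookups
 * can skip chunks that certainly do not contain a key.  A minimal,
 * hypothetical filter with the usual structure -- a bit array of
 * (items * bits_per_item) bits probed at k derived positions.  The FNV-1a /
 * double-hashing scheme here is a stand-in, not the hashing WiredTiger uses.
 */
#include <stdint.h>
#include <stdlib.h>

struct example_bloom {
	uint8_t *bits;
	uint64_t nbits;
	uint32_t nhash;
};

static uint64_t
example_hash(const void *key, size_t len, uint64_t seed)
{
	const uint8_t *p = key;
	uint64_t h = 14695981039346656037ULL ^ seed;
	size_t i;

	for (i = 0; i < len; ++i) {
		h ^= p[i];
		h *= 1099511628211ULL;
	}
	return (h);
}

static int
example_bloom_create(struct example_bloom *b,
    uint64_t items, uint32_t bits_per_item, uint32_t nhash)
{
	if (items == 0 || bits_per_item == 0 || nhash == 0)
		return (-1);
	b->nbits = items * bits_per_item;
	b->nhash = nhash;
	if ((b->bits = calloc((size_t)((b->nbits + 7) / 8), 1)) == NULL)
		return (-1);
	return (0);
}

static void
example_bloom_insert(struct example_bloom *b, const void *key, size_t len)
{
	uint64_t bit, h1, h2;
	uint32_t i;

	h1 = example_hash(key, len, 0);
	h2 = example_hash(key, len, h1);
	for (i = 0; i < b->nhash; ++i) {
		bit = (h1 + (uint64_t)i * h2) % b->nbits;
		b->bits[bit / 8] |= (uint8_t)(1 << (bit % 8));
	}
}

/* Returns 0 if the key is definitely absent, nonzero if it may be present. */
static int
example_bloom_contains(
    const struct example_bloom *b, const void *key, size_t len)
{
	uint64_t bit, h1, h2;
	uint32_t i;

	h1 = example_hash(key, len, 0);
	h2 = example_hash(key, len, h1);
	for (i = 0; i < b->nhash; ++i) {
		bit = (h1 + (uint64_t)i * h2) % b->nbits;
		if (!(b->bits[bit / 8] & (uint8_t)(1 << (bit % 8))))
			return (0);
	}
	return (1);
}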
/* * __wt_lsm_checkpoint_worker -- * A worker thread for an LSM tree, responsible for flushing new chunks to * disk. */ void * __wt_lsm_checkpoint_worker(void *arg) { WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; WT_LSM_WORKER_COOKIE cookie; WT_SESSION_IMPL *session; WT_TXN_ISOLATION saved_isolation; u_int i, j; int locked; lsm_tree = arg; session = lsm_tree->ckpt_session; WT_CLEAR(cookie); while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) { if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) { WT_WITH_SCHEMA_LOCK(session, ret = __wt_lsm_tree_switch(session, lsm_tree)); WT_ERR(ret); } WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0)); /* Write checkpoints in all completed files. */ for (i = 0, j = 0; i < cookie.nchunks - 1; i++) { if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) goto err; if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) break; chunk = cookie.chunk_array[i]; /* Stop if a running transaction needs the chunk. */ __wt_txn_update_oldest(session); if (!__wt_txn_visible_all(session, chunk->txnid_max)) break; /* * If the chunk is already checkpointed, make sure it * is also evicted. Either way, there is no point * trying to checkpoint it again. */ if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK)) { if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_EVICTED)) continue; if ((ret = __lsm_discard_handle( session, chunk->uri, NULL)) == 0) F_SET_ATOMIC( chunk, WT_LSM_CHUNK_EVICTED); else if (ret == EBUSY) ret = 0; else WT_ERR_MSG(session, ret, "discard handle"); continue; } WT_VERBOSE_ERR(session, lsm, "LSM worker flushing %u", i); /* * Flush the file before checkpointing: this is the * expensive part in terms of I/O: do it without * holding the schema lock. * * Use the special eviction isolation level to avoid * interfering with an application checkpoint: we have * already checked that all of the updates in this * chunk are globally visible. * * !!! We can wait here for checkpoints and fsyncs to * complete, which can be a long time. * * Don't keep waiting for the lock if application * threads are waiting for a switch. Don't skip * flushing the leaves either: that just means we'll * hold the schema lock for (much) longer, which blocks * the world. */ WT_ERR(__wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)); for (locked = 0; !locked && ret == 0 && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);) { if ((ret = __wt_spin_trylock(session, &S2C(session)->checkpoint_lock)) == 0) locked = 1; else if (ret == EBUSY) { __wt_yield(); ret = 0; } } if (locked) { saved_isolation = session->txn.isolation; session->txn.isolation = TXN_ISO_EVICTION; ret = __wt_bt_cache_op( session, NULL, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; __wt_spin_unlock( session, &S2C(session)->checkpoint_lock); } WT_TRET(__wt_session_release_btree(session)); WT_ERR(ret); if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) break; WT_VERBOSE_ERR(session, lsm, "LSM worker checkpointing %u", i); WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)); if (ret != 0) { __wt_err(session, ret, "LSM checkpoint"); break; } WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); /* * Clear the "cache resident" flag so the primary can * be evicted and eventually closed. Only do this once * the checkpoint has succeeded: otherwise, accessing * the leaf page during the checkpoint can trigger * forced eviction. 
*/ WT_ERR(__wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)); __wt_btree_evictable(session, 1); WT_ERR(__wt_session_release_btree(session)); ++j; WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1)); F_SET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; /* Update the throttle time. */ __wt_lsm_tree_throttle(session, lsm_tree); WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree)); /* Make sure we aren't pinning a transaction ID. */ __wt_txn_release_snapshot(session); if (ret != 0) { __wt_err(session, ret, "LSM checkpoint metadata write"); break; } WT_VERBOSE_ERR(session, lsm, "LSM worker checkpointed %u", i); } __lsm_unpin_chunks(session, &cookie); if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) WT_ERR_TIMEDOUT_OK(__wt_cond_wait( session, lsm_tree->work_cond, 100000)); } err: __lsm_unpin_chunks(session, &cookie); __wt_free(session, cookie.chunk_array); /* * The thread will only exit with failure if we run out of memory or * there is some other system driven failure. We can't keep going * after such a failure - ensure WiredTiger shuts down. */ if (ret != 0 && ret != WT_NOTFOUND) WT_PANIC_ERR(session, ret, "Shutting down LSM checkpoint utility thread"); return (NULL); }
/* * __wt_verify -- * Verify a file. */ int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_BTREE *btree; WT_CKPT *ckptbase, *ckpt; WT_DECL_RET; WT_VSTUFF *vs, _vstuff; uint32_t root_addr_size; uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE]; btree = S2BT(session); bm = btree->bm; ckptbase = NULL; WT_CLEAR(_vstuff); vs = &_vstuff; WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key)); WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2)); /* Check configuration strings. */ WT_ERR(__verify_config(session, cfg, vs)); /* Get a list of the checkpoints for this file. */ WT_ERR( __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase)); /* Inform the underlying block manager we're verifying. */ WT_ERR(bm->verify_start(bm, session, ckptbase)); /* Loop through the file's checkpoints, verifying each one. */ WT_CKPT_FOREACH(ckptbase, ckpt) { WT_VERBOSE_ERR(session, verify, "%s: checkpoint %s", btree->dhandle->name, ckpt->name); #ifdef HAVE_DIAGNOSTIC if (vs->dump_address || vs->dump_blocks || vs->dump_pages) WT_ERR(__wt_msg(session, "%s: checkpoint %s", btree->dhandle->name, ckpt->name)); #endif /* Fake checkpoints require no work. */ if (F_ISSET(ckpt, WT_CKPT_FAKE)) continue; /* House-keeping between checkpoints. */ __verify_checkpoint_reset(vs); /* Load the checkpoint, ignore trees with no root page. */ WT_ERR(bm->checkpoint_load(bm, session, ckpt->raw.data, ckpt->raw.size, root_addr, &root_addr_size, 1)); if (root_addr_size != 0) { /* Verify then discard the checkpoint from the cache. */ if ((ret = __wt_btree_tree_open( session, root_addr, root_addr_size)) == 0) { ret = __verify_tree( session, btree->root_page, vs); WT_TRET(__wt_bt_cache_op( session, NULL, WT_SYNC_DISCARD_NOWRITE)); } } /* Unload the checkpoint. */ WT_TRET(bm->checkpoint_unload(bm, session)); WT_ERR(ret); }
/* * __wt_block_salvage_next -- * Return the address for the next potential block from the file. */ int __wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, uint32_t *addr_sizep, int *eofp) { WT_BLOCK_HEADER *blk; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_FH *fh; off_t max, offset; uint32_t allocsize, cksum, size; uint8_t *endp; *eofp = 0; fh = block->fh; allocsize = block->allocsize; WT_ERR(__wt_scr_alloc(session, allocsize, &tmp)); /* Read through the file, looking for pages. */ for (max = fh->size;;) { offset = block->slvg_off; if (offset >= max) { /* Check eof. */ *eofp = 1; goto done; } /* * Read the start of a possible page (an allocation-size block), * and get a page length from it. Move to the next allocation * sized boundary, we'll never consider this one again. */ WT_ERR(__wt_read(session, fh, offset, allocsize, tmp->mem)); blk = WT_BLOCK_HEADER_REF(tmp->mem); block->slvg_off += allocsize; /* * The page can't be more than the min/max page size, or past * the end of the file. */ size = blk->disk_size; cksum = blk->cksum; if (size == 0 || size % allocsize != 0 || size > WT_BTREE_PAGE_SIZE_MAX || offset + (off_t)size > max) goto skip; /* * The block size isn't insane, read the entire block. Reading * the block validates the checksum; if reading the block fails, * ignore it. If reading the block succeeds, return its address * as a possible page. */ if (__wt_block_read_off( session, block, tmp, offset, size, cksum) == 0) break; skip: WT_VERBOSE_ERR(session, salvage, "skipping %" PRIu32 "B at file offset %" PRIuMAX, allocsize, (uintmax_t)offset); /* Free the allocation-size block. */ WT_ERR(__wt_block_off_free( session, block, offset, (off_t)allocsize)); } /* Re-create the address cookie that should reference this block. */ endp = addr; WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum)); *addr_sizep = WT_PTRDIFF32(endp, addr); done: err: __wt_scr_free(&tmp); return (ret); }
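/*
 * Salvage above walks the file one allocation-size unit at a time, reads the
 * candidate header at each boundary, and rejects anything whose advertised
 * size is impossible before paying for a full read and checksum.  That sanity
 * test in isolation, as a hypothetical helper (the maximum page size constant
 * is invented for illustration):
 */
#include <stdint.h>

#define	EXAMPLE_PAGE_SIZE_MAX	(512U * 1024U * 1024U)

/* Return nonzero if a block of 'size' bytes at 'offset' could be valid. */
static int
example_salvage_size_plausible(
    int64_t offset, uint32_t size, uint32_t allocsize, int64_t file_size)
{
	if (allocsize == 0)
		return (0);		/* Guard for this sketch only. */
	if (size == 0 || size % allocsize != 0)
		return (0);		/* Not allocation-size aligned. */
	if (size > EXAMPLE_PAGE_SIZE_MAX)
		return (0);		/* Bigger than any legal page. */
	if (offset + (int64_t)size > file_size)
		return (0);		/* Runs off the end of the file. */
	return (1);
}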
/* * __snapshot_process -- * Process the list of snapshots. */ static int __snapshot_process( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase) { WT_BLOCK_SNAPSHOT *a, *b, *si; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_SNAPSHOT *snap; uint64_t snapshot_size; int deleting, locked; si = &block->live; locked = 0; /* * We've allocated our last page, update the snapshot size. We need to * calculate the live system's snapshot size before reading and merging * snapshot allocation and discard information from the snapshots we're * deleting, those operations will change the underlying byte counts. */ snapshot_size = si->snapshot_size; snapshot_size += si->alloc.bytes; snapshot_size -= si->discard.bytes; /* * Extents that become newly available as a result of deleting previous * snapshots are added to a list of extents. The list should be empty, * but there's no explicit "free the snapshot information" call into the * block manager; if there was an error in an upper level resulting in * the snapshot never being "resolved", the list might not be empty. * * XXX * This isn't sufficient, actually: we're going to leak all the blocks * that were written as part of the last snapshot because it was never * resolved. */ __wt_block_extlist_free(session, &si->snapshot_avail); WT_RET(__wt_block_extlist_init( session, &si->snapshot_avail, "live", "snapshot_avail")); /* * To delete a snapshot, we'll need snapshot information for it, and we * have to read that from the disk. */ deleting = 0; WT_SNAPSHOT_FOREACH(snapbase, snap) { /* * To delete a snapshot, we'll need snapshot information for it * and the subsequent snapshot. The test is tricky, we have to * load the current snapshot's information if it's marked for * deletion, or if it follows a snapshot marked for deletion, * where the boundary cases are the first snapshot in the list * and the last snapshot in the list: if we're deleting the last * snapshot in the list, there's no next snapshot, the snapshot * will be merged into the live tree. */ if (!F_ISSET(snap, WT_SNAP_DELETE) && (snap == snapbase || F_ISSET(snap, WT_SNAP_ADD) || !F_ISSET(snap - 1, WT_SNAP_DELETE))) continue; deleting = 1; /* * Allocate a snapshot structure, crack the cookie and read the * snapshot's extent lists. * * Ignore the avail list: snapshot avail lists are only useful * if we are rolling forward from the particular snapshot and * they represent our best understanding of what blocks can be * allocated. If we are not operating on the live snapshot, * subsequent snapshots might have allocated those blocks, and * the avail list is useless. We don't discard it, because it * is useful as part of verification, but we don't re-write it * either. */ WT_ERR(__wt_calloc( session, 1, sizeof(WT_BLOCK_SNAPSHOT), &snap->bpriv)); si = snap->bpriv; WT_ERR(__wt_block_snap_init(session, block, si, snap->name, 0)); WT_ERR(__wt_block_buffer_to_snapshot( session, block, snap->raw.data, si)); WT_ERR(__wt_block_extlist_read(session, block, &si->alloc)); WT_ERR(__wt_block_extlist_read(session, block, &si->discard)); } /* * Hold a lock so the live extent lists and the file size can't change * underneath us. I suspect we'll tighten this if snapshots take too * much time away from real work: we read historic snapshot information * without a lock, but we could also merge and re-write the delete * snapshot information without a lock, except for ranges merged into * the live tree. */ __wt_spin_lock(session, &block->live_lock); locked = 1; /* Skip the additional processing if we aren't deleting snapshots. 
*/ if (!deleting) goto live_update; /* * Delete any no-longer-needed snapshots: we do this first as it frees * blocks to the live lists, and the freed blocks will then be included * when writing the live extent lists. */ WT_SNAPSHOT_FOREACH(snapbase, snap) { if (!F_ISSET(snap, WT_SNAP_DELETE)) continue; if (WT_VERBOSE_ISSET(session, snapshot)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__snapshot_string( session, block, snap->raw.data, tmp)); WT_VERBOSE_ERR(session, snapshot, "%s: delete-snapshot: %s: %s", block->name, snap->name, (char *)tmp->data); } /* * Set the from/to snapshot structures, where the "to" value * may be the live tree. */ a = snap->bpriv; if (F_ISSET(snap + 1, WT_SNAP_ADD)) b = &block->live; else b = (snap + 1)->bpriv; /* * Free the root page: there's nothing special about this free, * the root page is allocated using normal rules, that is, it * may have been taken from the avail list, and was entered on * the live system's alloc list at that time. We free it into * the snapshot's discard list, however, not the live system's * list because it appears on the snapshot's alloc list and so * must be paired in the snapshot. */ if (a->root_offset != WT_BLOCK_INVALID_OFFSET) WT_ERR(__wt_block_insert_ext(session, &a->discard, a->root_offset, a->root_size)); /* * Free the blocks used to hold the "from" snapshot's extent * lists directly to the live system's avail list, they were * never on any alloc list. Include the "from" snapshot's * avail list, it's going away. */ WT_ERR(__snapshot_extlist_fblocks(session, block, &a->alloc)); WT_ERR(__snapshot_extlist_fblocks(session, block, &a->avail)); WT_ERR(__snapshot_extlist_fblocks(session, block, &a->discard)); /* * Roll the "from" alloc and discard extent lists into the "to" * snapshot's lists. */ if (a->alloc.entries != 0) WT_ERR(__wt_block_extlist_merge( session, &a->alloc, &b->alloc)); if (a->discard.entries != 0) WT_ERR(__wt_block_extlist_merge( session, &a->discard, &b->discard)); /* * If the "to" snapshot is also being deleted, we're done with * it, it's merged into some other snapshot in the next loop. * This means the extent lists may aggregate over a number of * snapshots, but that's OK, they're disjoint sets of ranges. */ if (F_ISSET(snap + 1, WT_SNAP_DELETE)) continue; /* * Find blocks for re-use: wherever the "to" snapshot's allocate * and discard lists overlap is fair game, move ranges appearing * on both lists to the live snapshot's newly available list. */ WT_ERR(__wt_block_extlist_overlap(session, block, b)); /* * If we're updating the live system's information, we're done. */ if (F_ISSET(snap + 1, WT_SNAP_ADD)) continue; /* * We have to write the "to" snapshot's extent lists out in new * blocks, and update its cookie. * * Free the blocks used to hold the "to" snapshot's extent lists * directly to the live system's avail list, they were never on * any alloc list. Do not include the "to" snapshot's avail * list, it's not changing. */ WT_ERR(__snapshot_extlist_fblocks(session, block, &b->alloc)); WT_ERR(__snapshot_extlist_fblocks(session, block, &b->discard)); F_SET(snap + 1, WT_SNAP_UPDATE); } /* Update snapshots marked for update. */ WT_SNAPSHOT_FOREACH(snapbase, snap) if (F_ISSET(snap, WT_SNAP_UPDATE)) { WT_ASSERT(session, !F_ISSET(snap, WT_SNAP_ADD)); WT_ERR(__snapshot_update( session, block, snap, snap->bpriv, 0, 0)); } live_update: si = &block->live; /* Truncate the file if that's possible. 
*/ WT_ERR(__wt_block_extlist_truncate(session, block, &si->avail)); /* Update the final, added snapshot based on the live system. */ WT_SNAPSHOT_FOREACH(snapbase, snap) if (F_ISSET(snap, WT_SNAP_ADD)) { WT_ERR(__snapshot_update( session, block, snap, si, snapshot_size, 1)); /* * XXX * Our caller wants two pieces of information: the time * the snapshot was taken and the final snapshot size. * This violates layering but the alternative is a call * for the btree layer to crack the snapshot cookie into * its components, and that's a fair amount of work. * (We could just read the system time in the session * layer when updating the metadata file, but that won't * work for the snapshot size, and so we do both here.) */ snap->snapshot_size = si->snapshot_size; WT_ERR(__wt_epoch(session, &snap->sec, NULL)); } /* * Reset the live system's alloc and discard extent lists, leave the * avail list alone. */ __wt_block_extlist_free(session, &si->alloc); WT_ERR(__wt_block_extlist_init(session, &si->alloc, "live", "alloc")); __wt_block_extlist_free(session, &si->discard); WT_ERR( __wt_block_extlist_init(session, &si->discard, "live", "discard")); #ifdef HAVE_DIAGNOSTIC /* * The first snapshot in the system should always have an empty discard * list. If we've read that snapshot and/or created it, check. */ WT_SNAPSHOT_FOREACH(snapbase, snap) if (!F_ISSET(snap, WT_SNAP_DELETE)) break; if ((a = snap->bpriv) == NULL) a = &block->live; if (a->discard.entries != 0) { __wt_errx(session, "snapshot incorrectly has blocks on the discard list"); WT_ERR(WT_ERROR); } #endif err: if (locked) __wt_spin_unlock(session, &block->live_lock); /* Discard any snapshot information we loaded, we no longer need it. */ WT_SNAPSHOT_FOREACH(snapbase, snap) if ((si = snap->bpriv) != NULL) { __wt_block_extlist_free(session, &si->alloc); __wt_block_extlist_free(session, &si->avail); __wt_block_extlist_free(session, &si->discard); } __wt_scr_free(&tmp); return (ret); }
/* * __wt_merge_tree -- * Attempt to collapse a stack of split-merge pages in memory into a * shallow tree. If enough keys are found, create a real internal node * that can be evicted (and, if necessary, split further). * * This code is designed to deal with workloads that otherwise create * arbitrarily deep (and slow) trees in memory. */ int __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top) { WT_DECL_RET; WT_PAGE *lchild, *newtop, *rchild; WT_REF *newref; WT_VISIT_STATE visit_state; uint32_t refcnt, split; int promote; u_int levels; uint8_t page_type; WT_CLEAR(visit_state); visit_state.session = session; lchild = newtop = rchild = NULL; page_type = top->type; WT_ASSERT(session, __wt_btree_mergeable(top)); WT_ASSERT(session, top->ref->state == WT_REF_LOCKED); /* * Walk the subtree, count the references at the bottom level and * calculate the maximum depth. */ WT_RET(__merge_walk(session, top, 1, __merge_count, &visit_state)); /* If there aren't enough useful levels, give up. */ if (visit_state.maxdepth < WT_MERGE_STACK_MIN) return (EBUSY); /* * Don't allow split merges to generate arbitrarily large pages. * Ideally we would choose a size based on the internal_page_max * setting for the btree, but we don't have the correct btree handle * available. */ if (visit_state.refcnt > WT_MERGE_MAX_REFS) return (EBUSY); /* * Now we either collapse the internal pages into one split-merge page, * or if there are "enough" keys, we split into two equal internal * pages, each of which can be evicted independently. * * We set a flag (WT_PM_REC_SPLIT_MERGE) on the created page if it * isn't big enough to justify the cost of evicting it. If splits * continue, it will be merged again until it gets over this limit. */ promote = 0; refcnt = (uint32_t)visit_state.refcnt; if (refcnt >= WT_MERGE_FULL_PAGE && visit_state.seen_live) { /* * In the normal case where there are live children spread * through the subtree, create two child pages. * * Handle the case where the only live child is first / last * specially: put the live child into the top-level page. * * Set SPLIT_MERGE on the internal pages if there are any live * children: they can't be evicted, so there is no point * permanently deepening the tree. */ if (visit_state.first_live == visit_state.last_live && (visit_state.first_live == 0 || visit_state.first_live == refcnt - 1)) split = (visit_state.first_live == 0) ? 1 : refcnt - 1; else split = (refcnt + 1) / 2; /* Only promote if we can create a real page. */ if (split == 1 || split == refcnt - 1) promote = 1; else if (split >= WT_MERGE_FULL_PAGE && visit_state.first_live >= split) promote = 1; else if (refcnt - split >= WT_MERGE_FULL_PAGE && visit_state.last_live < split) promote = 1; } if (promote) { /* Create a new top-level split-merge page with two entries. */ WT_ERR(__merge_new_page(session, page_type, 2, 1, &newtop)); visit_state.split = split; /* Left split. */ if (split == 1) visit_state.first = newtop; else { WT_ERR(__merge_new_page(session, page_type, split, visit_state.first_live < split, &lchild)); visit_state.first = lchild; } /* Right split. */ if (split == refcnt - 1) { visit_state.second = newtop; visit_state.second_ref = &newtop->u.intl.t[1]; } else { WT_ERR(__merge_new_page(session, page_type, refcnt - split, visit_state.last_live >= split, &rchild)); visit_state.second = rchild; visit_state.second_ref = &visit_state.second->u.intl.t[0]; } } else { /* * Create a new split-merge page for small merges, or if the * page above is a split merge page. 
When we do a big enough * merge, we create a real page at the top and don't consider * it as a merge candidate again. Over time with an insert * workload the tree will grow deeper, but that's inevitable, * and this keeps individual merges small. */ WT_ERR(__merge_new_page(session, page_type, refcnt, refcnt < WT_MERGE_FULL_PAGE || __wt_btree_mergeable(top->parent), &newtop)); visit_state.first = newtop; } /* * Copy the references into the new tree, but don't update anything in * the locked tree in case there is an error and we need to back out. * We do this in a separate pass so that we can figure out the key for * the split point: that allocates memory and so it could still fail. */ visit_state.page = visit_state.first; visit_state.ref = visit_state.page->u.intl.t; visit_state.refcnt = 0; WT_ERR(__merge_walk(session, top, 0, __merge_copy_ref, &visit_state)); if (promote) { /* Promote keys into the top-level page. */ if (lchild != NULL) { newref = &newtop->u.intl.t[0]; WT_LINK_PAGE(newtop, newref, lchild); newref->state = WT_REF_MEM; WT_ERR(__merge_promote_key(session, newref)); } if (rchild != NULL) { newref = &newtop->u.intl.t[1]; WT_LINK_PAGE(newtop, newref, rchild); newref->state = WT_REF_MEM; WT_ERR(__merge_promote_key(session, newref)); } } /* * We have copied everything into place and allocated all of the memory * we need. Now link all pages into the new tree and unlock them. * * The only way this could fail is if a reference state has been * changed by another thread since they were locked. Panic in that * case: that should never happen. */ visit_state.page = visit_state.first; visit_state.ref = visit_state.page->u.intl.t; visit_state.refcnt = 0; ret = __merge_walk(session, top, 0, __merge_switch_page, &visit_state); if (ret != 0) WT_ERR(__wt_illegal_value(session, "__wt_merge_tree")); newtop->u.intl.recno = top->u.intl.recno; newtop->parent = top->parent; newtop->ref = top->ref; #ifdef HAVE_DIAGNOSTIC /* * Before swapping in the new tree, walk the pages we are discarding, * check that everything looks right. */ __merge_check_discard(session, top); #endif /* * Set up the new top-level page as a split so that it will be swapped * into place by our caller. */ top->modify->flags = WT_PM_REC_SPLIT; top->modify->u.split = newtop; WT_VERBOSE_ERR(session, evict, "Successfully %s %" PRIu32 " split-merge pages containing %" PRIu32 " keys\n", promote ? "promoted" : "merged", visit_state.maxdepth, refcnt); /* Evict new child pages as soon as possible. */ if (lchild != NULL && !F_ISSET(lchild->modify, WT_PM_REC_SPLIT_MERGE)) lchild->read_gen = WT_READ_GEN_OLDEST; if (rchild != NULL && !F_ISSET(rchild->modify, WT_PM_REC_SPLIT_MERGE)) rchild->read_gen = WT_READ_GEN_OLDEST; /* Update statistics. */ WT_CSTAT_INCR(session, cache_eviction_merge); WT_DSTAT_INCR(session, cache_eviction_merge); /* How many levels did we remove? */ levels = visit_state.maxdepth - (promote ? 2 : 1); WT_CSTAT_INCRV(session, cache_eviction_merge_levels, levels); WT_DSTAT_INCRV(session, cache_eviction_merge_levels, levels); return (0); err: WT_VERBOSE_TRET(session, evict, "Failed to merge %" PRIu32 " split-merge pages containing %" PRIu32 " keys\n", visit_state.maxdepth, refcnt); WT_CSTAT_INCR(session, cache_eviction_merge_fail); WT_DSTAT_INCR(session, cache_eviction_merge_fail); if (newtop != NULL) __wt_page_out(session, &newtop); if (lchild != NULL) __wt_page_out(session, &lchild); if (rchild != NULL) __wt_page_out(session, &rchild); return (ret); }
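/*
 * The split-point choice above can be read in isolation: normally the child
 * references are split roughly in half, but if the only live (unevictable)
 * child is the very first or very last one, it is carved off by itself so the
 * large remainder becomes a real, evictable page.  A hypothetical sketch with
 * invented names; refcnt is assumed to be at least two, and first_live /
 * last_live stand in for the visit-state tracking.
 */
#include <stdint.h>

static uint32_t
example_choose_split(uint32_t refcnt, uint32_t first_live, uint32_t last_live)
{
	/*
	 * A single live child at either end: split after the first entry or
	 * before the last one, so only that child stays unevictable.
	 */
	if (first_live == last_live &&
	    (first_live == 0 || first_live == refcnt - 1))
		return (first_live == 0 ? 1 : refcnt - 1);

	/* Otherwise split the children roughly down the middle. */
	return ((refcnt + 1) / 2);
}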
/* * __snapshot_update -- * Update a snapshot. */ static int __snapshot_update( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snap, WT_BLOCK_SNAPSHOT *si, uint64_t snapshot_size, int is_live) { WT_DECL_ITEM(tmp); WT_DECL_RET; uint8_t *endp; #ifdef HAVE_DIAGNOSTIC /* Check the extent list combinations for overlaps. */ WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->avail)); WT_RET(__wt_block_extlist_check(session, &si->discard, &si->avail)); WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->discard)); #endif /* * Write the snapshot's extent lists; we only write an avail list for * the live system, other snapshot's avail lists are static and never * change. When we do write the avail list for the live system it's * two lists: the current avail list plus the list of blocks that are * being made available as of the new snapshot. We can't merge that * second list into the real list yet, it's not truly available until * the new snapshot location has been saved to the metadata. */ WT_RET(__wt_block_extlist_write(session, block, &si->alloc, NULL)); if (is_live) WT_RET(__wt_block_extlist_write( session, block, &si->avail, &si->snapshot_avail)); WT_RET(__wt_block_extlist_write(session, block, &si->discard, NULL)); /* * Set the file size for the live system. * * XXX * We do NOT set the file size when re-writing snapshots because we want * to test the snapshot's blocks against a reasonable maximum file size * during verification. This is not good: imagine a snapshot appearing * early in the file, re-written, and then the snapshot requires blocks * at the end of the file, blocks after the listed file size. If the * application opens that snapshot for writing (discarding subsequent * snapshots), we would truncate the file to the early chunk, discarding * the re-written snapshot information. The alternative, updating the * file size has its own problems, in that case we'd work correctly, but * we'd lose all of the blocks between the original snapshot and the * re-written snapshot. Currently, there's no API to roll-forward * intermediate snapshots, if there ever is, this will need to be fixed. */ if (is_live) WT_RET(__wt_filesize(session, block->fh, &si->file_size)); /* Set the snapshot size for the live system. */ if (is_live) si->snapshot_size = snapshot_size; /* * Copy the snapshot information into the snapshot array's address * cookie. */ WT_RET(__wt_buf_init(session, &snap->raw, WT_BTREE_MAX_ADDR_COOKIE)); endp = snap->raw.mem; WT_RET(__wt_block_snapshot_to_buffer(session, block, &endp, si)); snap->raw.size = WT_PTRDIFF32(endp, snap->raw.mem); if (WT_VERBOSE_ISSET(session, snapshot)) { WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__snapshot_string(session, block, snap->raw.data, tmp)); WT_VERBOSE_ERR(session, snapshot, "%s: create-snapshot: %s: %s", block->name, snap->name, (char *)tmp->data); } err: __wt_scr_free(&tmp); return (ret); }