/*
 * __wt_cond_signal --
 *	Signal a waiting thread.
 */
int
__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
{
	WT_DECL_RET;
	int locked;

	locked = 0;

	/*
	 * !!!
	 * This function MUST handle a NULL session handle.
	 */
	if (session != NULL && WT_VERBOSE_ISSET(session, mutex))
		WT_RET(__wt_verbose(
		    session, "signal %s cond (%p)", cond->name, cond));

	/* Fast path if already signalled. */
	if (cond->waiters == -1)
		return (0);

	if (cond->waiters > 0 || !WT_ATOMIC_CAS(cond->waiters, 0, -1)) {
		WT_ERR(pthread_mutex_lock(&cond->mtx));
		locked = 1;
		WT_ERR(pthread_cond_broadcast(&cond->cond));
	}

err:	if (locked)
		WT_TRET(pthread_mutex_unlock(&cond->mtx));
	if (ret == 0)
		return (0);
	WT_RET_MSG(session, ret, "pthread_cond_broadcast");
}
/*
 * __wt_rec_track_init --
 *	Initialize the page's list of tracked objects when reconciliation
 * starts.
 */
int
__wt_rec_track_init(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	if (WT_VERBOSE_ISSET(session, reconcile))
		WT_RET(__track_dump(session, page, "reconcile init"));
	return (0);
}
/*
 * __wt_ovfl_reuse_search --
 *	Search the page's list of overflow records for a match.
 */
int
__wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page,
    uint8_t **addrp, size_t *addr_sizep,
    const void *value, size_t value_size)
{
	WT_OVFL_REUSE **head, *reuse;

	*addrp = NULL;
	*addr_sizep = 0;

	if (page->modify->ovfl_track == NULL)
		return (0);

	head = page->modify->ovfl_track->ovfl_reuse;

	/*
	 * The search function returns the first matching record in the list
	 * which does not have the in-use flag set, or NULL.
	 */
	if ((reuse = __ovfl_reuse_skip_search(head, value, value_size)) == NULL)
		return (0);

	*addrp = WT_OVFL_REUSE_ADDR(reuse);
	*addr_sizep = reuse->addr_size;
	F_SET(reuse, WT_OVFL_REUSE_INUSE);

	if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
		WT_RET(__ovfl_reuse_verbose(session, page, reuse, "reclaim"));
	return (1);
}
/*
 * __wt_cond_signal --
 *	Signal a waiting thread.
 */
int
__wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
{
	WT_DECL_RET;
	int locked;

	locked = 0;

	/*
	 * !!!
	 * This function MUST handle a NULL session handle.
	 */
	if (session != NULL && WT_VERBOSE_ISSET(session, mutex))
		WT_RET(__wt_verbose(
		    session, "signal %s cond (%p)", cond->name, cond));

	WT_ERR(pthread_mutex_lock(&cond->mtx));
	locked = 1;
	if (!cond->signalled) {
		cond->signalled = 1;
		WT_ERR(pthread_cond_signal(&cond->cond));
	}

err:	if (locked)
		WT_TRET(pthread_mutex_unlock(&cond->mtx));
	if (ret == 0)
		return (0);
	WT_RET_MSG(session, ret, "pthread_cond_signal");
}
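/*
 * A minimal, self-contained sketch (not WiredTiger code) of the signal/wait
 * pattern used by __wt_cond_signal above, written against plain POSIX
 * threads.  The "signalled" flag is the important detail: a wakeup that
 * arrives while no thread is waiting is remembered and consumed by the next
 * waiter instead of being lost, and the waiter's while-loop guards against
 * spurious wakeups.  All names here (example_cond_t, example_signal,
 * example_wait) are illustrative.
 */
#include <pthread.h>

typedef struct {
	pthread_mutex_t mtx;
	pthread_cond_t cond;
	int signalled;			/* Pending wakeup not yet consumed. */
} example_cond_t;

static int
example_signal(example_cond_t *c)
{
	int ret, t;

	if ((ret = pthread_mutex_lock(&c->mtx)) != 0)
		return (ret);
	if (!c->signalled) {
		c->signalled = 1;		/* Remember the wakeup. */
		ret = pthread_cond_signal(&c->cond);
	}
	if ((t = pthread_mutex_unlock(&c->mtx)) != 0 && ret == 0)
		ret = t;
	return (ret);
}

static int
example_wait(example_cond_t *c)
{
	int ret, t;

	if ((ret = pthread_mutex_lock(&c->mtx)) != 0)
		return (ret);
	while (ret == 0 && !c->signalled)	/* Spurious-wakeup guard. */
		ret = pthread_cond_wait(&c->cond, &c->mtx);
	if (ret == 0)
		c->signalled = 0;		/* Consume the wakeup. */
	if ((t = pthread_mutex_unlock(&c->mtx)) != 0 && ret == 0)
		ret = t;
	return (ret);
}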
/* * __wt_ovfl_txnc_add -- * Add a new entry to the page's list of transaction-cached overflow * records. */ int __wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uint8_t *addr, size_t addr_size, const void *value, size_t value_size) { WT_OVFL_TXNC **head, **stack[WT_SKIP_MAXDEPTH], *txnc; size_t size; u_int i, skipdepth; uint8_t *p; if (page->modify->ovfl_track == NULL) WT_RET(__ovfl_track_init(session, page)); head = page->modify->ovfl_track->ovfl_txnc; /* Choose a skiplist depth for this insert. */ skipdepth = __wt_skip_choose_depth(session); /* * Allocate the WT_OVFL_TXNC structure, next pointers for the skip * list, room for the address and value, then copy everything into * place. * * To minimize the WT_OVFL_TXNC structure size, the address offset * and size are single bytes: that's safe because the address follows * the structure (which can't be more than about 100B), and address * cookies are limited to 255B. */ size = sizeof(WT_OVFL_TXNC) + skipdepth * sizeof(WT_OVFL_TXNC *) + addr_size + value_size; WT_RET(__wt_calloc(session, 1, size, &txnc)); p = (uint8_t *)txnc + sizeof(WT_OVFL_TXNC) + skipdepth * sizeof(WT_OVFL_TXNC *); txnc->addr_offset = (uint8_t)WT_PTRDIFF(p, txnc); txnc->addr_size = (uint8_t)addr_size; memcpy(p, addr, addr_size); p += addr_size; txnc->value_offset = WT_PTRDIFF32(p, txnc); txnc->value_size = WT_STORE_SIZE(value_size); memcpy(p, value, value_size); txnc->current = __wt_txn_new_id(session); __wt_cache_page_inmem_incr( session, page, WT_OVFL_SIZE(txnc, WT_OVFL_TXNC)); /* Insert the new entry into the skiplist. */ __ovfl_txnc_skip_search_stack(head, stack, addr, addr_size); for (i = 0; i < skipdepth; ++i) { txnc->next[i] = *stack[i]; *stack[i] = txnc; } if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) WT_RET(__ovfl_txnc_verbose(session, page, txnc, "add")); return (0); }
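/*
 * A self-contained sketch of the skip-list insert pattern used above: pick a
 * random depth, build a per-level "stack" of insert positions while searching
 * from the top level down, then splice the new node in at each level.  Keys
 * are plain integers to keep the example short; every name here
 * (EX_SKIP_MAXDEPTH, EX_SKIP_NODE, ex_skip_insert) is illustrative, not
 * WiredTiger's.
 */
#include <stdlib.h>

#define	EX_SKIP_MAXDEPTH	10

typedef struct ex_skip_node {
	int key;
	struct ex_skip_node *next[EX_SKIP_MAXDEPTH];
} EX_SKIP_NODE;

/* Each additional level has probability 1/4, as in WiredTiger's skip lists. */
static unsigned
ex_skip_choose_depth(void)
{
	unsigned d;

	for (d = 1; d < EX_SKIP_MAXDEPTH && (rand() % 4) == 0; ++d)
		;
	return (d);
}

/* Insert a key; head is an array of EX_SKIP_MAXDEPTH level pointers. */
static EX_SKIP_NODE *
ex_skip_insert(EX_SKIP_NODE **head, int key)
{
	EX_SKIP_NODE **stack[EX_SKIP_MAXDEPTH], **e, *node;
	unsigned depth, i;
	int lvl;

	if ((node = calloc(1, sizeof(*node))) == NULL)
		return (NULL);
	node->key = key;

	/*
	 * Search from the highest level down, remembering the insert position
	 * at each level.  Dropping a level steps back one slot in the same
	 * pointer array, which is the same node (or head) one level lower.
	 */
	for (lvl = EX_SKIP_MAXDEPTH - 1, e = &head[lvl];;) {
		if (*e != NULL && (*e)->key < key) {
			e = &(*e)->next[lvl];
			continue;
		}
		stack[lvl] = e;
		if (lvl-- == 0)
			break;
		--e;
	}

	/* Splice the new node in, lowest level first. */
	depth = ex_skip_choose_depth();
	for (i = 0; i < depth; ++i) {
		node->next[i] = *stack[i];
		*stack[i] = node;
	}
	return (node);
}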
/* * __recovery_set_checkpoint_timestamp -- * Set the checkpoint timestamp as retrieved from the metadata file. */ static int __recovery_set_checkpoint_timestamp(WT_RECOVERY *r) { WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; wt_timestamp_t ckpt_timestamp; char ts_string[WT_TS_INT_STRING_SIZE], *sys_config; sys_config = NULL; session = r->session; conn = S2C(session); /* * Read the system checkpoint information from the metadata file and * save the stable timestamp of the last checkpoint for later query. * This gets saved in the connection. */ ckpt_timestamp = 0; /* Search in the metadata for the system information. */ WT_ERR_NOTFOUND_OK( __wt_metadata_search(session, WT_SYSTEM_CKPT_URI, &sys_config)); if (sys_config != NULL) { WT_CLEAR(cval); WT_ERR_NOTFOUND_OK(__wt_config_getones( session, sys_config, "checkpoint_timestamp", &cval)); if (cval.len != 0) { __wt_verbose(session, WT_VERB_RECOVERY, "Recovery timestamp %.*s", (int)cval.len, cval.str); WT_ERR(__wt_txn_parse_timestamp_raw(session, "recovery", &ckpt_timestamp, &cval)); } } /* * Set the recovery checkpoint timestamp and the metadata checkpoint * timestamp so that the checkpoint after recovery writes the correct * value into the metadata. */ conn->txn_global.meta_ckpt_timestamp = conn->txn_global.recovery_timestamp = ckpt_timestamp; if (WT_VERBOSE_ISSET(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS)) { __wt_timestamp_to_string( conn->txn_global.recovery_timestamp, ts_string); __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS, "Set global recovery timestamp: %s", ts_string); } err: __wt_free(session, sys_config); return (ret); }
/* * __wt_block_compact_skip -- * Return if compaction will shrink the file. */ int __wt_block_compact_skip(WT_SESSION_IMPL *session, WT_BLOCK *block, int *skipp) { WT_DECL_RET; WT_EXT *ext; WT_EXTLIST *el; WT_FH *fh; off_t avail, ninety; *skipp = 1; /* Return a default skip. */ fh = block->fh; /* * We do compaction by copying blocks from the end of the file to the * beginning of the file, and we need some metrics to decide if it's * worth doing. Ignore small files, and files where we are unlikely * to recover 10% of the file. */ if (fh->size <= 10 * 1024) return (0); __wt_spin_lock(session, &block->live_lock); if (WT_VERBOSE_ISSET(session, compact)) WT_ERR(__block_dump_avail(session, block)); /* Sum the number of available bytes in the first 90% of the file. */ avail = 0; ninety = fh->size - fh->size / 10; el = &block->live.avail; WT_EXT_FOREACH(ext, el->off) if (ext->off < ninety) avail += ext->size; /* * If at least 10% of the total file is available and in the first 90% * of the file, we'll try compaction. */ if (avail >= fh->size / 10) *skipp = 0; WT_VERBOSE_ERR(session, compact, "%s: %" PRIuMAX "MB (%" PRIuMAX ") available space in the first " "90%% of the file, require 10%% or %" PRIuMAX "MB (%" PRIuMAX ") to perform compaction, compaction %s", block->name, (uintmax_t)avail / WT_MEGABYTE, (uintmax_t)avail, (uintmax_t)(fh->size / 10) / WT_MEGABYTE, (uintmax_t)fh->size / 10, *skipp ? "skipped" : "proceeding"); err: __wt_spin_unlock(session, &block->live_lock); return (ret); }
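/*
 * A tiny standalone sketch of the decision rule above: compaction is worth
 * attempting only when at least 10% of the file's bytes are free and those
 * free bytes sit in the first 90% of the file (blocks are copied from the end
 * toward the beginning, so free space at the tail doesn't help).  The extent
 * array and names are illustrative, not WiredTiger's.
 */
#include <stdbool.h>
#include <stdint.h>

struct ex_extent {
	int64_t off;		/* File offset of a free range. */
	int64_t size;		/* Length of the free range. */
};

static bool
ex_compact_worthwhile(
    int64_t file_size, const struct ex_extent *avail, int navail)
{
	int64_t ninety, usable;
	int i;

	if (file_size <= 10 * 1024)		/* Ignore small files. */
		return (false);

	ninety = file_size - file_size / 10;	/* First 90% of the file. */
	for (usable = 0, i = 0; i < navail; ++i)
		if (avail[i].off < ninety)
			usable += avail[i].size;

	/* Require at least 10% of the file to be recoverable. */
	return (usable >= file_size / 10);
}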
/* * __wt_rec_track_ovfl_reuse -- * Search for a matching overflow record and reactivate it. */ int __wt_rec_track_ovfl_reuse( WT_SESSION_IMPL *session, WT_PAGE *page, const void *data, uint32_t data_size, uint8_t **addrp, uint32_t *addr_sizep, int *foundp) { WT_PAGE_MODIFY *mod; WT_PAGE_TRACK *track; uint32_t i; *foundp = 0; mod = page->modify; for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i) { /* Ignore empty slots */ if (!F_ISSET(track, WT_TRK_OBJECT)) continue; /* * Ignore discarded objects, objects already in-use, or cached * overflow values. We don't care about whether or not the * object came from a page, we can re-use objects from the page * or objects created in a previous reconciliation. */ if (F_ISSET(track, WT_TRK_DISCARD | WT_TRK_INUSE | WT_TRK_OVFL_VALUE)) continue; /* * Ignore objects without data (must be block objects). This is * not really necessary (presumably, our caller is matching on a * non-zero-length data item), but paranoia is healthy. */ if (track->data == NULL) continue; /* Check to see if the data matches. */ if (track->size != data_size || memcmp(data, track->data, data_size) != 0) continue; /* * Reactivate the record. * Return the block addr/size pair to our caller. */ F_SET(track, WT_TRK_INUSE); *addrp = track->addr.addr; *addr_sizep = track->addr.size; *foundp = 1; if (WT_VERBOSE_ISSET(session, reconcile)) WT_RET(__track_msg( session, page, "reactivate overflow", track)); return (0); } return (0); }
/* * __ovfl_txnc_wrapup -- * Resolve the page's transaction-cache list. */ static int __ovfl_txnc_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_OVFL_TXNC **e, **head, *txnc; uint64_t oldest_txn; size_t decr; int i; head = page->modify->ovfl_track->ovfl_txnc; /* * Take a snapshot of the oldest transaction ID we need to keep alive. * Since we do two passes through entries in the structure, the normal * visibility check could give different results as the global ID moves * forward. */ oldest_txn = __wt_txn_oldest_id(session); /* * Discard any transaction-cache records with transaction IDs earlier * than any in the system. * * First, walk the overflow transaction-cache skip lists (except for * the lowest level), fixing up links. */ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) for (e = &head[i]; (txnc = *e) != NULL;) { if (WT_TXNID_LE(oldest_txn, txnc->current)) { e = &txnc->next[i]; continue; } *e = txnc->next[i]; } /* Second, discard any no longer needed transaction-cache records. */ decr = 0; for (e = &head[0]; (txnc = *e) != NULL;) { if (WT_TXNID_LE(oldest_txn, txnc->current)) { e = &txnc->next[0]; continue; } *e = txnc->next[0]; if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) WT_RET( __ovfl_txnc_verbose(session, page, txnc, "free")); decr += WT_OVFL_SIZE(txnc, WT_OVFL_TXNC); __wt_free(session, txnc); } if (decr != 0) __wt_cache_page_inmem_decr(session, page, decr); return (0); }
/* * __ovfl_reuse_wrapup_err -- * Resolve the page's overflow reuse list after an error occurs. */ static int __ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BM *bm; WT_DECL_RET; WT_OVFL_REUSE **e, **head, *reuse; size_t decr; int i; bm = S2BT(session)->bm; head = page->modify->ovfl_track->ovfl_reuse; /* * Discard any overflow records that were just added, freeing underlying * blocks. * * First, walk the overflow reuse lists (except for the lowest one), * fixing up skiplist links. */ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) for (e = &head[i]; (reuse = *e) != NULL;) { if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) { e = &reuse->next[i]; continue; } *e = reuse->next[i]; } /* * Second, discard any overflow record with a just-added flag, clear the * flags for the next run. */ decr = 0; for (e = &head[0]; (reuse = *e) != NULL;) { if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) { F_CLR(reuse, WT_OVFL_REUSE_INUSE); e = &reuse->next[0]; continue; } *e = reuse->next[0]; if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) WT_RET( __ovfl_reuse_verbose(session, page, reuse, "free")); WT_TRET(bm->free( bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size)); decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE); __wt_free(session, reuse); } if (decr != 0) __wt_cache_page_inmem_decr(session, page, decr); return (0); }
/* * __ovfl_txnc_wrapup -- * Resolve the page's transaction-cache list. */ static int __ovfl_txnc_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_OVFL_TXNC **e, **head, *txnc; size_t decr; int i; head = page->modify->ovfl_track->ovfl_txnc; /* * Discard any transaction-cache records with transaction IDs earlier * than any in the system. * * First, walk the overflow transaction-cache skip lists (except for * the lowest level), fixing up links. */ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) for (e = &head[i]; *e != NULL;) { if (!__wt_txn_visible_all(session, (*e)->current)) { e = &(*e)->next[i]; continue; } *e = (*e)->next[i]; } /* Second, discard any no longer needed transaction-cache records. */ decr = 0; for (e = &head[0]; (txnc = *e) != NULL;) { if (!__wt_txn_visible_all(session, txnc->current)) { e = &(*e)->next[0]; continue; } *e = (*e)->next[0]; decr += WT_OVFL_SIZE(WT_OVFL_TXNC) + txnc->addr_size + txnc->value_size; if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) WT_RET( __ovfl_txnc_verbose(session, page, txnc, "free")); __wt_free(session, txnc); } if (decr != 0) __wt_cache_page_inmem_decr(session, page, decr); return (0); }
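/*
 * The wrapup functions above all prune a singly-linked skip-list level in
 * place by walking a pointer-to-pointer: "e" addresses the link that points
 * at the current node, so removing a node is a single store and the list head
 * needs no special case.  A standalone sketch of that idiom with integer
 * payloads; the names are illustrative, not WiredTiger's.
 */
#include <stdbool.h>
#include <stdlib.h>

struct ex_node {
	int value;
	struct ex_node *next;
};

/* Remove (and free) every node for which "drop" returns true. */
static void
ex_list_prune(struct ex_node **head, bool (*drop)(int))
{
	struct ex_node **e, *node;

	for (e = head; (node = *e) != NULL;) {
		if (!drop(node->value)) {
			e = &node->next;	/* Keep: step to its link. */
			continue;
		}
		*e = node->next;		/* Unlink; "e" stays put. */
		free(node);
	}
}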
/*
 * __wt_ovfl_discard_add --
 *	Add a new entry to the page's list of overflow records that have been
 * discarded.
 */
int
__wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell)
{
	WT_OVFL_TRACK *track;

	if (page->modify->ovfl_track == NULL)
		WT_RET(__ovfl_track_init(session, page));

	track = page->modify->ovfl_track;
	WT_RET(__wt_realloc_def(session, &track->discard_allocated,
	    track->discard_entries + 1, &track->discard));
	track->discard[track->discard_entries++] = cell;

	if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
		WT_RET(__ovfl_discard_verbose(session, page, cell, "add"));

	return (0);
}
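/*
 * A minimal sketch of the append idiom above: keep separate "entries" and
 * "allocated" counts and grow the array geometrically before storing the new
 * element.  This stands in for __wt_realloc_def and is not WiredTiger code;
 * the names are illustrative.
 */
#include <stdlib.h>
#include <string.h>

struct ex_cell_list {
	void **list;		/* Tracked elements. */
	size_t entries;		/* Elements in use. */
	size_t allocated;	/* Elements allocated. */
};

static int
ex_cell_list_add(struct ex_cell_list *t, void *cell)
{
	void **p;
	size_t n;

	if (t->entries + 1 > t->allocated) {
		n = t->allocated == 0 ? 8 : t->allocated * 2;
		if ((p = realloc(t->list, n * sizeof(void *))) == NULL)
			return (-1);
		/* Zero the new tail, as calloc-style callers expect. */
		memset(p + t->allocated, 0,
		    (n - t->allocated) * sizeof(void *));
		t->list = p;
		t->allocated = n;
	}
	t->list[t->entries++] = cell;
	return (0);
}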
/*
 * __ovfl_discard_wrapup --
 *	Resolve the page's overflow discard list after a page is written.
 */
static int
__ovfl_discard_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_CELL **cellp;
	WT_DECL_RET;
	WT_OVFL_TRACK *track;
	uint32_t i;

	track = page->modify->ovfl_track;
	for (i = 0, cellp = track->discard;
	    i < track->discard_entries; ++i, ++cellp) {
		if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
			WT_RET(__ovfl_discard_verbose(
			    session, page, *cellp, "free"));

		/* Discard each cell's overflow item. */
		WT_RET(__wt_ovfl_discard(session, *cellp));
	}

	__wt_free(session, track->discard);
	track->discard_entries = track->discard_allocated = 0;

	return (ret);
}
/* * __sync_file -- * Flush pages for a specific file. */ static int __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) { struct timespec end, start; WT_BTREE *btree; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; uint64_t oldest_id, saved_snap_min; uint32_t flags; conn = S2C(session); btree = S2BT(session); walk = NULL; txn = &session->txn; saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min; flags = WT_READ_CACHE | WT_READ_NO_GEN; internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) WT_RET(__wt_epoch(session, &start)); switch (syncop) { case WT_SYNC_WRITE_LEAVES: /* * Write all immediately available, dirty in-cache leaf pages. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. */ if (!btree->modified) return (0); __wt_spin_lock(session, &btree->flush_lock); if (!btree->modified) { __wt_spin_unlock(session, &btree->flush_lock); return (0); } /* * Save the oldest transaction ID we need to keep around. * Otherwise, in a busy system, we could be updating pages so * fast that write leaves never catches up. We deliberately * have no transaction running at this point that would keep * the oldest ID from moving forwards as we walk the tree. */ oldest_id = __wt_txn_oldest_id(session); flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* * Write dirty pages if nobody beat us to it. Don't * try to write hot pages (defined as pages that have * been updated since the write phase leaves started): * checkpoint will have to visit them anyway. */ page = walk->page; if (__wt_page_is_modified(page) && WT_TXNID_LT(page->modify->update_txn, oldest_id)) { if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } } break; case WT_SYNC_CHECKPOINT: /* * If we are flushing a file at read-committed isolation, which * is of particular interest for flushing the metadata to make * schema-changing operation durable, get a transactional * snapshot now. * * All changes committed up to this point should be included. * We don't update the snapshot in between pages because (a) * the metadata shouldn't be that big, and (b) if we do ever */ if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); /* * We cannot check the tree modified flag in the case of a * checkpoint, the checkpoint code has already cleared it. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. We're holding the schema lock, but need the * lower-level lock as well. */ __wt_spin_lock(session, &btree->flush_lock); /* * In the final checkpoint pass, child pages cannot be evicted * from underneath internal pages nor can underlying blocks be * freed until the checkpoint's block lists are stable. Also, * we cannot split child pages into parents unless we know the * final pass will write a consistent view of that namespace. * Set the checkpointing flag to block such actions and wait for * any problematic eviction or page splits to complete. 
*/ WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE); WT_ERR(__wt_evict_file_exclusive_on(session)); __wt_evict_file_exclusive_off(session); WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING); /* Write all dirty in-cache pages. */ flags |= WT_READ_NO_EVICT; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* Skip clean pages. */ if (!__wt_page_is_modified(walk->page)) continue; /* * Take a local reference to the page modify structure * now that we know the page is dirty. It needs to be * done in this order otherwise the page modify * structure could have been created between taking the * reference and checking modified. */ page = walk->page; mod = page->modify; /* * Write dirty pages, unless we can be sure they only * became dirty after the checkpoint started. * * We can skip dirty pages if: * (1) they are leaf pages; * (2) there is a snapshot transaction active (which * is the case in ordinary application checkpoints * but not all internal cases); and * (3) the first dirty update on the page is * sufficiently recent that the checkpoint * transaction would skip them. * * Mark the tree dirty: the checkpoint marked it clean * and we can't skip future checkpoints until this page * is written. */ if (!WT_PAGE_IS_INTERNAL(page) && F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) { __wt_page_modify_set(session, page); continue; } if (WT_PAGE_IS_INTERNAL(page)) { internal_bytes += page->memory_footprint; ++internal_pages; } else { leaf_bytes += page->memory_footprint; ++leaf_pages; } WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } break; case WT_SYNC_CLOSE: case WT_SYNC_DISCARD: WT_ILLEGAL_VALUE_ERR(session); } if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { WT_ERR(__wt_epoch(session, &end)); WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT, "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64 " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64 " bytes, %" PRIu64 " pages of internal\n\t" "Took: %" PRIu64 "ms", syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_bytes, leaf_pages, internal_bytes, internal_pages, WT_TIMEDIFF_MS(end, start))); } err: /* On error, clear any left-over tree walk. */ if (walk != NULL) WT_TRET(__wt_page_release(session, walk, flags)); /* * If we got a snapshot in order to write pages, and there was no * snapshot active when we started, release it. */ if (txn->isolation == WT_ISO_READ_COMMITTED && saved_snap_min == WT_TXN_NONE) __wt_txn_release_snapshot(session); if (btree->checkpointing != WT_CKPT_OFF) { /* * Update the checkpoint generation for this handle so visible * updates newer than the checkpoint can be evicted. * * This has to be published before eviction is enabled again, * so that eviction knows that the checkpoint has completed. */ WT_PUBLISH(btree->checkpoint_gen, conn->txn_global.checkpoint_gen); WT_STAT_FAST_DATA_SET(session, btree_checkpoint_generation, btree->checkpoint_gen); /* * Clear the checkpoint flag and push the change; not required, * but publishing the change means stalled eviction gets moving * as soon as possible. */ btree->checkpointing = WT_CKPT_OFF; WT_FULL_BARRIER(); /* * If this tree was being skipped by the eviction server during * the checkpoint, clear the wait. */ btree->evict_walk_period = 0; /* * Wake the eviction server, in case application threads have * stalled while the eviction server decided it couldn't make * progress. Without this, application threads will be stalled * until the eviction server next wakes. 
*/ WT_TRET(__wt_evict_server_wake(session)); } __wt_spin_unlock(session, &btree->flush_lock); /* * Leaves are written before a checkpoint (or as part of a file close, * before checkpointing the file). Start a flush to stable storage, * but don't wait for it. */ if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC)) WT_RET(btree->bm->sync(btree->bm, session, true)); return (ret); }
/* * __sync_file -- * Flush pages for a specific file. */ static int __sync_file(WT_SESSION_IMPL *session, int syncop) { struct timespec end, start; WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, leaf_bytes; uint64_t internal_pages, leaf_pages; uint32_t flags; bool evict_reset; btree = S2BT(session); flags = WT_READ_CACHE | WT_READ_NO_GEN; walk = NULL; txn = &session->txn; internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) WT_RET(__wt_epoch(session, &start)); switch (syncop) { case WT_SYNC_WRITE_LEAVES: /* * Write all immediately available, dirty in-cache leaf pages. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. */ if (!btree->modified) return (0); __wt_spin_lock(session, &btree->flush_lock); if (!btree->modified) { __wt_spin_unlock(session, &btree->flush_lock); return (0); } flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, NULL, flags)); if (walk == NULL) break; /* * Write dirty pages if nobody beat us to it. Don't * try to write the hottest pages: checkpoint will have * to visit them anyway. */ page = walk->page; if (__wt_page_is_modified(page) && __wt_txn_visible_all( session, page->modify->update_txn)) { if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } } break; case WT_SYNC_CHECKPOINT: /* * We cannot check the tree modified flag in the case of a * checkpoint, the checkpoint code has already cleared it. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. We're holding the schema lock, but need the * lower-level lock as well. */ __wt_spin_lock(session, &btree->flush_lock); /* * When internal pages are being reconciled by checkpoint their * child pages cannot disappear from underneath them or be split * into them, nor can underlying blocks be freed until the block * lists for the checkpoint are stable. Set the checkpointing * flag to block eviction of dirty pages until the checkpoint's * internal page pass is complete, then wait for any existing * eviction to complete. */ btree->checkpointing = 1; WT_FULL_BARRIER(); WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); if (evict_reset) __wt_evict_file_exclusive_off(session); /* Write all dirty in-cache pages. */ flags |= WT_READ_NO_EVICT; for (walk = NULL;;) { /* * If we have a page, and it was ever modified, track * the highest transaction ID in the tree. We do this * here because we want the value after reconciling * dirty pages. */ if (walk != NULL && walk->page != NULL && (mod = walk->page->modify) != NULL && WT_TXNID_LT(btree->rec_max_txn, mod->rec_max_txn)) btree->rec_max_txn = mod->rec_max_txn; WT_ERR(__wt_tree_walk(session, &walk, NULL, flags)); if (walk == NULL) break; page = walk->page; mod = page->modify; /* Skip clean pages. */ if (!__wt_page_is_modified(page)) continue; /* * Write dirty pages, unless we can be sure they only * became dirty after the checkpoint started. 
* * We can skip dirty pages if: * (1) they are leaf pages; * (2) there is a snapshot transaction active (which * is the case in ordinary application checkpoints * but not all internal cases); and * (3) the first dirty update on the page is * sufficiently recent that the checkpoint * transaction would skip them. * * Mark the tree dirty: the checkpoint marked it clean * and we can't skip future checkpoints until this page * is written. */ if (!WT_PAGE_IS_INTERNAL(page) && F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn) && mod->rec_result != WT_PM_REC_REWRITE) { __wt_page_modify_set(session, page); continue; } if (WT_PAGE_IS_INTERNAL(page)) { internal_bytes += page->memory_footprint; ++internal_pages; } else { leaf_bytes += page->memory_footprint; ++leaf_pages; } WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } break; } if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { WT_ERR(__wt_epoch(session, &end)); WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT, "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64 " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64 " bytes, %" PRIu64 " pages of internal\n\t" "Took: %" PRIu64 "ms", syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_bytes, leaf_pages, internal_bytes, internal_pages, WT_TIMEDIFF(end, start) / WT_MILLION)); } err: /* On error, clear any left-over tree walk. */ if (walk != NULL) WT_TRET(__wt_page_release(session, walk, flags)); if (txn->isolation == WT_ISO_READ_COMMITTED && session->ncursors == 0) __wt_txn_release_snapshot(session); if (btree->checkpointing) { /* * Update the checkpoint generation for this handle so visible * updates newer than the checkpoint can be evicted. * * This has to be published before eviction is enabled again, * so that eviction knows that the checkpoint has completed. */ WT_PUBLISH(btree->checkpoint_gen, S2C(session)->txn_global.checkpoint_gen); WT_STAT_FAST_DATA_SET(session, btree_checkpoint_generation, btree->checkpoint_gen); /* * Clear the checkpoint flag and push the change; not required, * but publishing the change means stalled eviction gets moving * as soon as possible. */ btree->checkpointing = 0; WT_FULL_BARRIER(); /* * If this tree was being skipped by the eviction server during * the checkpoint, clear the wait. */ btree->evict_walk_period = 0; /* * Wake the eviction server, in case application threads have * stalled while the eviction server decided it couldn't make * progress. Without this, application threads will be stalled * until the eviction server next wakes. */ WT_TRET(__wt_evict_server_wake(session)); } __wt_spin_unlock(session, &btree->flush_lock); /* * Leaves are written before a checkpoint (or as part of a file close, * before checkpointing the file). Start a flush to stable storage, * but don't wait for it. */ if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES) WT_RET(btree->bm->sync(btree->bm, session, true)); return (ret); }
/* * __open_verbose -- * Optionally output a verbose message on handle open. */ static inline int __open_verbose( WT_SESSION_IMPL *session, const char *name, int file_type, u_int flags) { #ifdef HAVE_VERBOSE WT_DECL_RET; WT_DECL_ITEM(tmp); const char *file_type_tag, *sep; if (!WT_VERBOSE_ISSET(session, WT_VERB_FILEOPS)) return (0); /* * It's useful to track file opens when debugging platforms, take some * effort to output good tracking information. */ switch (file_type) { case WT_FS_OPEN_FILE_TYPE_CHECKPOINT: file_type_tag = "checkpoint"; break; case WT_FS_OPEN_FILE_TYPE_DATA: file_type_tag = "data"; break; case WT_FS_OPEN_FILE_TYPE_DIRECTORY: file_type_tag = "directory"; break; case WT_FS_OPEN_FILE_TYPE_LOG: file_type_tag = "log"; break; case WT_FS_OPEN_FILE_TYPE_REGULAR: file_type_tag = "regular"; break; default: file_type_tag = "unknown open type"; break; } WT_RET(__wt_scr_alloc(session, 0, &tmp)); sep = " ("; #define WT_FS_OPEN_VERBOSE_FLAG(f, name) \ if (LF_ISSET(f)) { \ WT_ERR(__wt_buf_catfmt( \ session, tmp, "%s%s", sep, name)); \ sep = ", "; \ } WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_CREATE, "create"); WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_DIRECTIO, "direct-IO"); WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_EXCLUSIVE, "exclusive"); WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_FIXED, "fixed"); WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_READONLY, "readonly"); if (tmp->size != 0) WT_ERR(__wt_buf_catfmt(session, tmp, ")")); __wt_verbose(session, WT_VERB_FILEOPS, "%s: file-open: type %s%s", name, file_type_tag, tmp->size == 0 ? "" : (char *)tmp->data); err: __wt_scr_free(session, &tmp); return (ret); #else WT_UNUSED(session); WT_UNUSED(name); WT_UNUSED(file_type); WT_UNUSED(flags); return (0); #endif }
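/*
 * A self-contained version of the flag-decoding idiom in __open_verbose
 * above: append each set flag's name to a buffer, switching the separator
 * from " (" to ", " after the first match and closing the parenthesis only if
 * anything matched.  Plain strncat replaces the WT_ITEM buffer handling; the
 * flag values and names here are illustrative, not WiredTiger's.
 */
#include <string.h>

#define	EX_OPEN_CREATE		0x01u
#define	EX_OPEN_EXCLUSIVE	0x02u
#define	EX_OPEN_READONLY	0x04u

static void
ex_open_flags_string(unsigned flags, char *buf, size_t len)
{
	const char *sep;

	if (len == 0)
		return;
	buf[0] = '\0';
	sep = " (";

#define	EX_APPEND_FLAG(f, name) do {					\
	if ((flags & (f)) != 0) {					\
		(void)strncat(buf, sep, len - strlen(buf) - 1);		\
		(void)strncat(buf, name, len - strlen(buf) - 1);	\
		sep = ", ";						\
	}								\
} while (0)

	EX_APPEND_FLAG(EX_OPEN_CREATE, "create");
	EX_APPEND_FLAG(EX_OPEN_EXCLUSIVE, "exclusive");
	EX_APPEND_FLAG(EX_OPEN_READONLY, "readonly");
#undef EX_APPEND_FLAG

	if (buf[0] != '\0')
		(void)strncat(buf, ")", len - strlen(buf) - 1);
}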
/* * __wt_txn_update_oldest -- * Sweep the running transactions to update the oldest ID required. * !!! * If a data-source is calling the WT_EXTENSION_API.transaction_oldest * method (for the oldest transaction ID not yet visible to a running * transaction), and then comparing that oldest ID against committed * transactions to see if updates for a committed transaction are still * visible to running transactions, the oldest transaction ID may be * the same as the last committed transaction ID, if the transaction * state wasn't refreshed after the last transaction committed. Push * past the last committed transaction. */ void __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) { WT_CONNECTION_IMPL *conn; WT_SESSION_IMPL *oldest_session; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s; uint64_t current_id, id, last_running, oldest_id, prev_oldest_id; uint32_t i, session_cnt; int32_t count; int last_running_moved; conn = S2C(session); txn_global = &conn->txn_global; current_id = last_running = txn_global->current; oldest_session = NULL; prev_oldest_id = txn_global->oldest_id; /* * For pure read-only workloads, or if the update isn't forced and the * oldest ID isn't too far behind, avoid scanning. */ if (prev_oldest_id == current_id || (!force && WT_TXNID_LT(current_id, prev_oldest_id + 100))) return; /* * We're going to scan. Increment the count of scanners to prevent the * oldest ID from moving forwards. Spin if the count is negative, * which indicates that some thread is moving the oldest ID forwards. */ do { if ((count = txn_global->scan_count) < 0) WT_PAUSE(); } while (count < 0 || !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1)); /* The oldest ID cannot change until the scan count goes to zero. */ prev_oldest_id = txn_global->oldest_id; current_id = oldest_id = last_running = txn_global->current; /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { /* * Update the oldest ID. * * Ignore: IDs older than the oldest ID we saw. This can happen * if we race with a thread that is allocating an ID -- the ID * will not be used because the thread will keep spinning until * it gets a valid one. */ if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) && WT_TXNID_LT(id, last_running)) last_running = id; /* * !!! * Note: Don't ignore snap_min values older than the previous * oldest ID. Read-uncommitted operations publish snap_min * values without incrementing scan_count to protect the global * table. See the comment in __wt_txn_cursor_op for * more details. */ if ((id = s->snap_min) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) { oldest_id = id; oldest_session = &conn->sessions[i]; } } if (WT_TXNID_LT(last_running, oldest_id)) oldest_id = last_running; /* The oldest ID can't move past any named snapshots. */ if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) oldest_id = id; /* Update the last running ID. */ last_running_moved = WT_TXNID_LT(txn_global->last_running, last_running); /* Update the oldest ID. 
*/ if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) && WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) { WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LT(id, last_running)) last_running = id; if ((id = s->snap_min) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) oldest_id = id; } if (WT_TXNID_LT(last_running, oldest_id)) oldest_id = last_running; #ifdef HAVE_DIAGNOSTIC /* * Make sure the ID doesn't move past any named snapshots. * * Don't include the read/assignment in the assert statement. * Coverity complains if there are assignments only done in * diagnostic builds, and when the read is from a volatile. */ id = txn_global->nsnap_oldest_id; WT_ASSERT(session, id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); #endif if (WT_TXNID_LT(txn_global->last_running, last_running)) txn_global->last_running = last_running; if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) txn_global->oldest_id = oldest_id; WT_ASSERT(session, txn_global->scan_count == -1); txn_global->scan_count = 0; } else { if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && current_id - oldest_id > 10000 && last_running_moved && oldest_session != NULL) { (void)__wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 " pinned in session %d [%s]" " with snap_min %" PRIu64 "\n", oldest_id, oldest_session->id, oldest_session->lastop, oldest_session->txn.snap_min); } WT_ASSERT(session, txn_global->scan_count > 0); (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1); } }
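/*
 * A standalone sketch of the scan_count protocol described in
 * __wt_txn_update_oldest above, written with C11 atomics and no backoff or
 * memory-order tuning.  Scanners increment the counter, spinning while it is
 * negative; a scanner that wants to publish a new oldest value must be the
 * only active scanner, which it checks by compare-and-swapping the counter
 * from 1 to -1.  The names are illustrative, not WiredTiger's.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic int32_t ex_scan_count;
static _Atomic uint64_t ex_oldest_id;

/* Enter a scan: blocks out exclusive publishers until ex_scan_end. */
static void
ex_scan_begin(void)
{
	int32_t count;

	do {
		while ((count = atomic_load(&ex_scan_count)) < 0)
			;	/* An exclusive publisher is running. */
	} while (!atomic_compare_exchange_weak(
	    &ex_scan_count, &count, count + 1));
}

/* Leave a scan without publishing. */
static void
ex_scan_end(void)
{
	(void)atomic_fetch_sub(&ex_scan_count, 1);
}

/*
 * Try to publish a new oldest ID: succeeds only if we are the single active
 * scanner, i.e. the counter is exactly 1.
 */
static bool
ex_scan_publish(uint64_t new_oldest)
{
	int32_t one = 1;

	if (!atomic_compare_exchange_strong(&ex_scan_count, &one, -1))
		return (false);		/* Another scanner is active. */
	atomic_store(&ex_oldest_id, new_oldest);
	atomic_store(&ex_scan_count, 0);	/* Drop our own reference too. */
	return (true);
}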
/*
 * __wt_rec_track --
 *	Add an object to the page's list of tracked objects.
 */
int
__wt_rec_track(WT_SESSION_IMPL *session, WT_PAGE *page,
    const uint8_t *addr, uint32_t addr_size,
    const void *data, uint32_t data_size, uint32_t flags)
{
	WT_PAGE_MODIFY *mod;
	WT_PAGE_TRACK *empty, *track;
	uint8_t *p;
	uint32_t i;

	mod = page->modify;

	/* Find an empty slot. */
	empty = NULL;
	for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i)
		if (!F_ISSET(track, WT_TRK_OBJECT)) {
			empty = track;
			break;
		}

	/* Reallocate space as necessary. */
	if (empty == NULL) {
		WT_RET(__rec_track_extend(session, page));
		empty = &mod->track[mod->track_entries - 1];
	}
	track = empty;

	/*
	 * Minor optimization: allocate a single chunk of space instead of two
	 * separate ones: be careful when it's freed.
	 */
	WT_RET(__wt_calloc_def(session, addr_size + data_size, &p));

	/*
	 * Set the just-added flag so we clean up should reconciliation fail,
	 * except for cached overflow values, which don't get discarded, even
	 * if reconciliation fails.
	 */
	track->flags = (uint8_t)flags | WT_TRK_OBJECT;
	if (!F_ISSET(track, WT_TRK_OVFL_VALUE))
		F_SET(track, WT_TRK_JUST_ADDED);
	track->addr.addr = p;
	track->addr.size = addr_size;
	memcpy(track->addr.addr, addr, addr_size);
	if (data_size) {
		p += addr_size;
		track->data = p;
		track->size = data_size;
		memcpy(track->data, data, data_size);
	}

	/*
	 * Overflow items are potentially large and on-page items remain in the
	 * tracking list until the page is evicted. If we're tracking a lot of
	 * them, their memory might matter: increment the page and cache memory
	 * totals. This is unlikely to matter, but it's inexpensive (unless
	 * there are lots of them, in which case I guess the memory matters).
	 *
	 * If this reconciliation were to fail, we would reasonably perform the
	 * inverse operation in __wt_rec_track_wrapup_err. I'm not bothering
	 * with that because we'd have to crack the structure itself to figure
	 * out how much to decrement and I don't think it's worth the effort.
	 * The potential problem is repeatedly failing reconciliation of a page
	 * with a large number of overflow items, which causes the page's
	 * memory footprint to become incorrectly high, causing us to push the
	 * page out of cache unnecessarily. Like I said, not worth the effort.
	 */
	if (LF_ISSET(WT_TRK_ONPAGE))
		__wt_cache_page_inmem_incr(
		    session, page, addr_size + data_size);

	if (WT_VERBOSE_ISSET(session, reconcile))
		WT_RET(__track_msg(session, page, "add", track));
	return (0);
}
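/*
 * A tiny sketch of the "one allocation, two buffers" optimization noted in
 * the function above: allocate storage for the address cookie and the data in
 * a single chunk and point both fields into it, so a single free releases
 * everything.  This is not WiredTiger code; the names are illustrative.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct ex_track {
	uint8_t *addr;		/* Start of the single allocation. */
	size_t addr_size;
	uint8_t *data;		/* Points just past the address bytes. */
	size_t data_size;
};

static int
ex_track_set(struct ex_track *t,
    const void *addr, size_t addr_size, const void *data, size_t data_size)
{
	uint8_t *p;

	/* One chunk holds both the address cookie and the data. */
	if ((p = calloc(1, addr_size + data_size)) == NULL)
		return (-1);

	t->addr = p;
	t->addr_size = addr_size;
	memcpy(t->addr, addr, addr_size);

	t->data = data_size == 0 ? NULL : p + addr_size;
	t->data_size = data_size;
	if (data_size != 0)
		memcpy(t->data, data, data_size);

	/* Freeing t->addr later releases both buffers at once. */
	return (0);
}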
/* * __wt_lsm_merge -- * Merge a set of chunks of an LSM tree. */ int __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) { WT_BLOOM *bloom; WT_CURSOR *dest, *src; WT_DECL_RET; WT_ITEM key, value; WT_LSM_CHUNK *chunk; uint32_t generation; uint64_t insert_count, record_count; u_int dest_id, end_chunk, i, nchunks, start_chunk, start_id, verb; int tret; bool created_chunk, create_bloom, locked, in_sync; const char *cfg[3]; const char *drop_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL }; bloom = NULL; chunk = NULL; dest = src = NULL; start_id = 0; created_chunk = create_bloom = locked = in_sync = false; /* Fast path if it's obvious no merges could be done. */ if (lsm_tree->nchunks < lsm_tree->merge_min && lsm_tree->merge_aggressiveness < WT_LSM_AGGRESSIVE_THRESHOLD) return (WT_NOTFOUND); /* * Use the lsm_tree lock to read the chunks (so no switches occur), but * avoid holding it while the merge is in progress: that may take a * long time. */ WT_RET(__wt_lsm_tree_writelock(session, lsm_tree)); locked = true; WT_ERR(__lsm_merge_span(session, lsm_tree, id, &start_chunk, &end_chunk, &record_count)); nchunks = (end_chunk + 1) - start_chunk; WT_ASSERT(session, nchunks > 0); start_id = lsm_tree->chunk[start_chunk]->id; /* Find the merge generation. */ for (generation = 0, i = 0; i < nchunks; i++) generation = WT_MAX(generation, lsm_tree->chunk[start_chunk + i]->generation + 1); WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree)); locked = false; /* Allocate an ID for the merge. */ dest_id = __wt_atomic_add32(&lsm_tree->last, 1); /* * We only want to do the chunk loop if we're running with verbose, * so we wrap these statements in the conditional. Avoid the loop * in the normal path. */ if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) { WT_ERR(__wt_verbose(session, WT_VERB_LSM, "Merging %s chunks %u-%u into %u (%" PRIu64 " records)" ", generation %" PRIu32, lsm_tree->name, start_chunk, end_chunk, dest_id, record_count, generation)); for (verb = start_chunk; verb <= end_chunk; verb++) WT_ERR(__wt_verbose(session, WT_VERB_LSM, "Merging %s: Chunk[%u] id %u, gen: %" PRIu32 ", size: %" PRIu64 ", records: %" PRIu64, lsm_tree->name, verb, lsm_tree->chunk[verb]->id, lsm_tree->chunk[verb]->generation, lsm_tree->chunk[verb]->size, lsm_tree->chunk[verb]->count)); } WT_ERR(__wt_calloc_one(session, &chunk)); created_chunk = true; chunk->id = dest_id; if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) && (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) || start_chunk > 0) && record_count > 0) create_bloom = true; /* * Special setup for the merge cursor: * first, reset to open the dependent cursors; * then restrict the cursor to a specific number of chunks; * then set MERGE so the cursor doesn't track updates to the tree. */ WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src)); F_SET(src, WT_CURSTD_RAW); WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks)); WT_WITH_SCHEMA_LOCK(session, ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk)); WT_ERR(ret); if (create_bloom) { WT_ERR(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk)); WT_ERR(__wt_bloom_create(session, chunk->bloom_uri, lsm_tree->bloom_config, record_count, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom)); } /* Discard pages we read as soon as we're done with them. 
*/ F_SET(session, WT_SESSION_NO_CACHE); cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor); cfg[1] = "bulk,raw,skip_sort_check"; cfg[2] = NULL; WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest)); #define LSM_MERGE_CHECK_INTERVAL WT_THOUSAND for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) { if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) WT_ERR(EINTR); WT_STAT_FAST_CONN_INCRV(session, lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL); ++lsm_tree->merge_progressing; } WT_ERR(src->get_key(src, &key)); dest->set_key(dest, &key); WT_ERR(src->get_value(src, &value)); dest->set_value(dest, &value); WT_ERR(dest->insert(dest)); if (create_bloom) WT_ERR(__wt_bloom_insert(bloom, &key)); } WT_ERR_NOTFOUND_OK(ret); WT_STAT_FAST_CONN_INCRV(session, lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL); ++lsm_tree->merge_progressing; WT_ERR(__wt_verbose(session, WT_VERB_LSM, "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.", record_count, insert_count)); /* * Closing and syncing the files can take a while. Set the * merge_syncing field so that compact knows it is still in * progress. */ (void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1); in_sync = true; /* * We've successfully created the new chunk. Now install it. We need * to ensure that the NO_CACHE flag is cleared and the bloom filter * is closed (even if a step fails), so track errors but don't return * until we've cleaned up. */ WT_TRET(src->close(src)); WT_TRET(dest->close(dest)); src = dest = NULL; F_CLR(session, WT_SESSION_NO_CACHE); /* * We're doing advisory reads to fault the new trees into cache. * Don't block if the cache is full: our next unit of work may be to * discard some trees to free space. */ F_SET(session, WT_SESSION_NO_EVICTION); if (create_bloom) { if (ret == 0) WT_TRET(__wt_bloom_finalize(bloom)); /* * Read in a key to make sure the Bloom filters btree handle is * open before it becomes visible to application threads. * Otherwise application threads will stall while it is opened * and internal pages are read into cache. */ if (ret == 0) { WT_CLEAR(key); WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key)); } WT_TRET(__wt_bloom_close(bloom)); bloom = NULL; } WT_ERR(ret); /* * Open a handle on the new chunk before application threads attempt * to access it, opening it pre-loads internal pages into the file * system cache. */ cfg[1] = "checkpoint=" WT_CHECKPOINT; WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest)); WT_TRET(dest->close(dest)); dest = NULL; ++lsm_tree->merge_progressing; (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1); in_sync = false; WT_ERR_NOTFOUND_OK(ret); WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); locked = true; /* * Check whether we raced with another merge, and adjust the chunk * array offset as necessary. */ if (start_chunk >= lsm_tree->nchunks || lsm_tree->chunk[start_chunk]->id != start_id) for (start_chunk = 0; start_chunk < lsm_tree->nchunks; start_chunk++) if (lsm_tree->chunk[start_chunk]->id == start_id) break; /* * It is safe to error out here - since the update can only fail * prior to making updates to the tree. 
*/ WT_ERR(__wt_lsm_merge_update_tree( session, lsm_tree, start_chunk, nchunks, chunk)); if (create_bloom) F_SET(chunk, WT_LSM_CHUNK_BLOOM); chunk->count = insert_count; chunk->generation = generation; F_SET(chunk, WT_LSM_CHUNK_ONDISK); /* * We have no current way of continuing if the metadata update fails, * so we will panic in that case. Put some effort into cleaning up * after ourselves here - so things have a chance of shutting down. * * Any errors that happened after the tree was locked are * fatal - we can't guarantee the state of the tree. */ if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0) WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge"); lsm_tree->dsk_gen++; /* Update the throttling while holding the tree lock. */ __wt_lsm_tree_throttle(session, lsm_tree, true); /* Schedule a pass to discard old chunks */ WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_DROP, 0, lsm_tree)); err: if (locked) WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); if (in_sync) (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1); if (src != NULL) WT_TRET(src->close(src)); if (dest != NULL) WT_TRET(dest->close(dest)); if (bloom != NULL) WT_TRET(__wt_bloom_close(bloom)); if (ret != 0 && created_chunk) { /* Drop the newly-created files on error. */ if (chunk->uri != NULL) { WT_WITH_SCHEMA_LOCK(session, tret = __wt_schema_drop(session, chunk->uri, drop_cfg)); WT_TRET(tret); } if (create_bloom && chunk->bloom_uri != NULL) { WT_WITH_SCHEMA_LOCK(session, tret = __wt_schema_drop( session, chunk->bloom_uri, drop_cfg)); WT_TRET(tret); } __wt_free(session, chunk->bloom_uri); __wt_free(session, chunk->uri); __wt_free(session, chunk); if (ret == EINTR) WT_TRET(__wt_verbose(session, WT_VERB_LSM, "Merge aborted due to close")); else WT_TRET(__wt_verbose(session, WT_VERB_LSM, "Merge failed with %s", __wt_strerror(session, ret, NULL, 0))); } F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); return (ret); }
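/*
 * A sketch of the core merge copy loop above using only WiredTiger's public
 * cursor API: read every key/value pair from a source cursor and insert it
 * into a destination cursor (for an LSM merge, the destination would be a
 * bulk cursor on the new chunk).  It assumes both cursors were opened with
 * raw item access, as the merge cursor above is, so keys and values are plain
 * WT_ITEMs.  Error handling is reduced to returning the first failure; the
 * function name is illustrative.
 */
#include <wiredtiger.h>

static int
ex_copy_cursor(WT_CURSOR *src, WT_CURSOR *dest)
{
	WT_ITEM key, value;
	int ret;

	while ((ret = src->next(src)) == 0) {
		if ((ret = src->get_key(src, &key)) != 0)
			return (ret);
		if ((ret = src->get_value(src, &value)) != 0)
			return (ret);
		dest->set_key(dest, &key);
		dest->set_value(dest, &value);
		if ((ret = dest->insert(dest)) != 0)
			return (ret);
	}

	/* WT_NOTFOUND just means the source cursor was exhausted. */
	return (ret == WT_NOTFOUND ? 0 : ret);
}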
/* * __wt_block_checkpoint_load -- * Load a checkpoint. */ int __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr, size_t addr_size, uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint) { WT_BLOCK_CKPT *ci, _ci; WT_DECL_ITEM(tmp); WT_DECL_RET; uint8_t *endp; ci = NULL; /* * Sometimes we don't find a root page (we weren't given a checkpoint, * or the checkpoint was empty). In that case we return an empty root * address, set that up now. */ *root_addr_sizep = 0; #ifdef HAVE_VERBOSE if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { if (addr != NULL) { WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string(session, block, addr, tmp)); } __wt_verbose(session, WT_VERB_CHECKPOINT, "%s: load-checkpoint: %s", block->name, addr == NULL ? "[Empty]" : (const char *)tmp->data); } #endif /* * There's a single checkpoint in the file that can be written, all of * the others are read-only. We use the same initialization calls for * readonly checkpoints, but the information doesn't persist. */ if (checkpoint) { ci = &_ci; WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint")); } else { /* * We depend on the btree level for locking: things will go bad * fast if we open the live system in two handles, or salvage, * truncate or verify the live/running file. */ #ifdef HAVE_DIAGNOSTIC __wt_spin_lock(session, &block->live_lock); WT_ASSERT(session, block->live_open == false); block->live_open = true; __wt_spin_unlock(session, &block->live_lock); #endif ci = &block->live; WT_ERR(__wt_block_ckpt_init(session, ci, "live")); } /* * If the checkpoint has an on-disk root page, load it. Otherwise, size * the file past the description information. */ if (addr == NULL || addr_size == 0) ci->file_size = block->allocsize; else { /* Crack the checkpoint cookie. */ WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci)); /* Verify sets up next. */ if (block->verify) WT_ERR(__wt_verify_ckpt_load(session, block, ci)); /* Read any root page. */ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) { endp = root_addr; WT_ERR(__wt_block_addr_to_buffer(block, &endp, ci->root_offset, ci->root_size, ci->root_checksum)); *root_addr_sizep = WT_PTRDIFF(endp, root_addr); } /* * Rolling a checkpoint forward requires the avail list, the * blocks from which we can allocate. */ if (!checkpoint) WT_ERR(__wt_block_extlist_read_avail( session, block, &ci->avail, ci->file_size)); } /* * If the checkpoint can be written, that means anything written after * the checkpoint is no longer interesting, truncate the file. Don't * bother checking the avail list for a block at the end of the file, * that was done when the checkpoint was first written (re-writing the * checkpoint might possibly make it relevant here, but it's unlikely * enough I don't bother). */ if (!checkpoint) WT_ERR(__wt_block_truncate(session, block, ci->file_size)); if (0) { err: /* * Don't call checkpoint-unload: unload does real work including * file truncation. If we fail early enough that the checkpoint * information isn't correct, bad things would happen. The only * allocated memory was in the service of verify, clean that up. */ if (block->verify) WT_TRET(__wt_verify_ckpt_unload(session, block)); } /* Checkpoints don't need the original information, discard it. */ if (checkpoint && ci != NULL) __wt_block_ckpt_destroy(session, ci); __wt_scr_free(session, &tmp); return (ret); }
/* * __ckpt_process -- * Process the list of checkpoints. */ static int __ckpt_process( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) { WT_BLOCK_CKPT *a, *b, *ci; WT_CKPT *ckpt, *next_ckpt; WT_DECL_ITEM(tmp); WT_DECL_RET; uint64_t ckpt_size; int deleting, locked; ci = &block->live; locked = 0; /* * We've allocated our last page, update the checkpoint size. We need * to calculate the live system's checkpoint size before reading and * merging checkpoint allocation and discard information from the * checkpoints we're deleting, those operations change the underlying * byte counts. */ ckpt_size = ci->ckpt_size; ckpt_size += ci->alloc.bytes; ckpt_size -= ci->discard.bytes; /* * Extents newly available as a result of deleting previous checkpoints * are added to a list of extents. The list should be empty, but there * is no explicit "free the checkpoint information" call into the block * manager; if there was an error in an upper level resulting in some * previous checkpoint never being resolved, the list may not be empty. * * XXX * This isn't sufficient, actually: we're going to leak all the blocks * written as part of the last checkpoint because it was never resolved. */ __wt_block_extlist_free(session, &ci->ckpt_avail); WT_RET(__wt_block_extlist_init( session, &ci->ckpt_avail, "live", "ckpt_avail")); /* * To delete a checkpoint, we'll need checkpoint information for it and * the subsequent checkpoint into which it gets rolled; read them from * disk before we lock things down. */ deleting = 0; WT_CKPT_FOREACH(ckptbase, ckpt) { if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; deleting = 1; /* * Read the checkpoint and next checkpoint extent lists if we * haven't already read them (we may have already read these * extent blocks if there is more than one deleted checkpoint). */ if (ckpt->bpriv == NULL) WT_ERR(__ckpt_extlist_read(session, block, ckpt)); for (next_ckpt = ckpt + 1;; ++next_ckpt) if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) break; /* * The "next" checkpoint may be the live tree which has no * extent blocks to read. */ if (next_ckpt->bpriv == NULL && !F_ISSET(next_ckpt, WT_CKPT_ADD)) WT_ERR(__ckpt_extlist_read(session, block, next_ckpt)); } /* * Hold a lock so the live extent lists and the file size can't change * underneath us. I suspect we'll tighten this if checkpoints take too * much time away from real work: we read the historic checkpoint * information without a lock, but we could also merge and re-write the * delete checkpoint information without a lock, except for ranges * merged into the live tree. */ __wt_spin_lock(session, &block->live_lock); locked = 1; /* Skip the additional processing if we aren't deleting checkpoints. */ if (!deleting) goto live_update; /* * Delete any no-longer-needed checkpoints: we do this first as it frees * blocks to the live lists, and the freed blocks will then be included * when writing the live extent lists. */ WT_CKPT_FOREACH(ckptbase, ckpt) { if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; if (WT_VERBOSE_ISSET(session, ckpt)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string( session, block, ckpt->raw.data, tmp)); WT_VERBOSE_ERR(session, ckpt, "%s: delete-checkpoint: %s: %s", block->name, ckpt->name, (char *)tmp->data); } /* * Find the checkpoint into which we'll roll this checkpoint's * blocks: it's the next real checkpoint in the list, and it * better have been read in (if it's not the add slot). 
*/ for (next_ckpt = ckpt + 1;; ++next_ckpt) if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) break; /* * Set the from/to checkpoint structures, where the "to" value * may be the live tree. */ a = ckpt->bpriv; if (F_ISSET(next_ckpt, WT_CKPT_ADD)) b = &block->live; else b = next_ckpt->bpriv; /* * Free the root page: there's nothing special about this free, * the root page is allocated using normal rules, that is, it * may have been taken from the avail list, and was entered on * the live system's alloc list at that time. We free it into * the checkpoint's discard list, however, not the live system's * list because it appears on the checkpoint's alloc list and so * must be paired in the checkpoint. */ if (a->root_offset != WT_BLOCK_INVALID_OFFSET) WT_ERR(__wt_block_insert_ext(session, &a->discard, a->root_offset, a->root_size)); /* * Free the blocks used to hold the "from" checkpoint's extent * lists, including the avail list. */ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc)); WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail)); WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard)); /* * Roll the "from" alloc and discard extent lists into the "to" * checkpoint's lists. */ if (a->alloc.entries != 0) WT_ERR(__wt_block_extlist_merge( session, &a->alloc, &b->alloc)); if (a->discard.entries != 0) WT_ERR(__wt_block_extlist_merge( session, &a->discard, &b->discard)); /* * If the "to" checkpoint is also being deleted, we're done with * it, it's merged into some other checkpoint in the next loop. * This means the extent lists may aggregate over a number of * checkpoints, but that's OK, they're disjoint sets of ranges. */ if (F_ISSET(next_ckpt, WT_CKPT_DELETE)) continue; /* * Find blocks for re-use: wherever the "to" checkpoint's * allocate and discard lists overlap, move the range to * the live system's checkpoint available list. */ WT_ERR(__wt_block_extlist_overlap(session, block, b)); /* * If we're updating the live system's information, we're done. */ if (F_ISSET(next_ckpt, WT_CKPT_ADD)) continue; /* * We have to write the "to" checkpoint's extent lists out in * new blocks, and update its cookie. * * Free the blocks used to hold the "to" checkpoint's extent * lists; don't include the avail list, it's not changing. */ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc)); WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard)); F_SET(next_ckpt, WT_CKPT_UPDATE); } /* Update checkpoints marked for update. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_UPDATE)) { WT_ASSERT(session, !F_ISSET(ckpt, WT_CKPT_ADD)); WT_ERR(__ckpt_update( session, block, ckpt, ckpt->bpriv, 0, 0)); } live_update: ci = &block->live; /* Truncate the file if that's possible. */ WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail)); /* Update the final, added checkpoint based on the live system. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_ADD)) { WT_ERR(__ckpt_update( session, block, ckpt, ci, ckpt_size, 1)); /* * XXX * Our caller wants the final checkpoint size. Setting * the size here violates layering, but the alternative * is a call for the btree layer to crack the checkpoint * cookie into its components, and that's a fair amount * of work. */ ckpt->ckpt_size = ci->ckpt_size; } /* * Reset the live system's alloc and discard extent lists, leave the * avail list alone. 
*/ __wt_block_extlist_free(session, &ci->alloc); WT_ERR(__wt_block_extlist_init(session, &ci->alloc, "live", "alloc")); __wt_block_extlist_free(session, &ci->discard); WT_ERR( __wt_block_extlist_init(session, &ci->discard, "live", "discard")); #ifdef HAVE_DIAGNOSTIC /* * The first checkpoint in the system should always have an empty * discard list. If we've read that checkpoint and/or created it, * check. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (!F_ISSET(ckpt, WT_CKPT_DELETE)) break; if ((a = ckpt->bpriv) == NULL) a = &block->live; if (a->discard.entries != 0) { __wt_errx(session, "first checkpoint incorrectly has blocks on the discard " "list"); WT_ERR(WT_ERROR); } #endif err: if (locked) __wt_spin_unlock(session, &block->live_lock); /* Discard any checkpoint information we loaded. */ WT_CKPT_FOREACH(ckptbase, ckpt) if ((ci = ckpt->bpriv) != NULL) __wt_block_ckpt_destroy(session, ci); __wt_scr_free(&tmp); return (ret); }
/* * __sync_file -- * Flush pages for a specific file. */ static int __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) { WT_BTREE *btree; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF *prev, *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; uint64_t oldest_id, saved_pinned_id, time_start, time_stop; uint32_t flags; bool timer, tried_eviction; conn = S2C(session); btree = S2BT(session); prev = walk = NULL; txn = &session->txn; tried_eviction = false; time_start = time_stop = 0; /* Only visit pages in cache and don't bump page read generations. */ flags = WT_READ_CACHE | WT_READ_NO_GEN; /* * Skip all deleted pages. For a page to be marked deleted, it must * have been evicted from cache and marked clean. Checkpoint should * never instantiate deleted pages: if a truncate is not visible to the * checkpoint, the on-disk version is correct. If the truncate is * visible, we skip over the child page when writing its parent. We * check whether a truncate is visible in the checkpoint as part of * reconciling internal pages (specifically in __rec_child_modify). */ LF_SET(WT_READ_DELETED_SKIP); internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT); if (timer) time_start = __wt_clock(session); switch (syncop) { case WT_SYNC_WRITE_LEAVES: /* * Write all immediately available, dirty in-cache leaf pages. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. */ if (!btree->modified) return (0); __wt_spin_lock(session, &btree->flush_lock); if (!btree->modified) { __wt_spin_unlock(session, &btree->flush_lock); return (0); } /* * Save the oldest transaction ID we need to keep around. * Otherwise, in a busy system, we could be updating pages so * fast that write leaves never catches up. We deliberately * have no transaction running at this point that would keep * the oldest ID from moving forwards as we walk the tree. */ oldest_id = __wt_txn_oldest_id(session); LF_SET(WT_READ_NO_WAIT | WT_READ_SKIP_INTL); for (;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* * Write dirty pages if nobody beat us to it. Don't * try to write hot pages (defined as pages that have * been updated since the write phase leaves started): * checkpoint will have to visit them anyway. */ page = walk->page; if (__wt_page_is_modified(page) && WT_TXNID_LT(page->modify->update_txn, oldest_id)) { if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, WT_REC_CHECKPOINT, NULL)); } } break; case WT_SYNC_CHECKPOINT: /* * If we are flushing a file at read-committed isolation, which * is of particular interest for flushing the metadata to make * a schema-changing operation durable, get a transactional * snapshot now. * * All changes committed up to this point should be included. * We don't update the snapshot in between pages because the * metadata shouldn't have many pages. Instead, read-committed * isolation ensures that all metadata updates completed before * the checkpoint are included. */ if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); /* * We cannot check the tree modified flag in the case of a * checkpoint, the checkpoint code has already cleared it. 
* * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. We're holding the schema lock, but need the * lower-level lock as well. */ __wt_spin_lock(session, &btree->flush_lock); /* * In the final checkpoint pass, child pages cannot be evicted * from underneath internal pages nor can underlying blocks be * freed until the checkpoint's block lists are stable. Also, * we cannot split child pages into parents unless we know the * final pass will write a consistent view of that namespace. * Set the checkpointing flag to block such actions and wait for * any problematic eviction or page splits to complete. */ WT_ASSERT(session, btree->syncing == WT_BTREE_SYNC_OFF && btree->sync_session == NULL); btree->sync_session = session; btree->syncing = WT_BTREE_SYNC_WAIT; (void)__wt_gen_next_drain(session, WT_GEN_EVICT); btree->syncing = WT_BTREE_SYNC_RUNNING; /* Write all dirty in-cache pages. */ LF_SET(WT_READ_NO_EVICT); /* Read pages with lookaside entries and evict them asap. */ LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED); for (;;) { WT_ERR(__sync_dup_walk(session, walk, flags, &prev)); WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* * Skip clean pages, but need to make sure maximum * transaction ID is always updated. */ if (!__wt_page_is_modified(walk->page)) { if (((mod = walk->page->modify) != NULL) && mod->rec_max_txn > btree->rec_max_txn) btree->rec_max_txn = mod->rec_max_txn; if (mod != NULL && btree->rec_max_timestamp < mod->rec_max_timestamp) btree->rec_max_timestamp = mod->rec_max_timestamp; continue; } /* * Take a local reference to the page modify structure * now that we know the page is dirty. It needs to be * done in this order otherwise the page modify * structure could have been created between taking the * reference and checking modified. */ page = walk->page; /* * Write dirty pages, if we can't skip them. If we skip * a page, mark the tree dirty. The checkpoint marked it * clean and we can't skip future checkpoints until this * page is written. */ if (__sync_checkpoint_can_skip(session, page)) { __wt_tree_modify_set(session); continue; } if (WT_PAGE_IS_INTERNAL(page)) { internal_bytes += page->memory_footprint; ++internal_pages; } else { leaf_bytes += page->memory_footprint; ++leaf_pages; } /* * If the page was pulled into cache by our read, try * to evict it now. * * For eviction to have a chance, we first need to move * the walk point to the next page checkpoint will * visit. We want to avoid this code being too special * purpose, so try to reuse the ordinary eviction path. * * Regardless of whether eviction succeeds or fails, * the walk continues from the previous location. We * remember whether we tried eviction, and don't try * again. Even if eviction fails (the page may stay in * cache clean but with history that cannot be * discarded), that is not wasted effort because * checkpoint doesn't need to write the page again. */ if (!WT_PAGE_IS_INTERNAL(page) && page->read_gen == WT_READGEN_WONT_NEED && !tried_eviction) { WT_ERR_BUSY_OK( __wt_page_release_evict(session, walk)); walk = prev; prev = NULL; tried_eviction = true; continue; } tried_eviction = false; WT_ERR(__wt_reconcile( session, walk, NULL, WT_REC_CHECKPOINT, NULL)); /* * Update checkpoint IO tracking data if configured * to log verbose progress messages. 
*/ if (conn->ckpt_timer_start.tv_sec > 0) { conn->ckpt_write_bytes += page->memory_footprint; ++conn->ckpt_write_pages; /* Periodically log checkpoint progress. */ if (conn->ckpt_write_pages % 5000 == 0) __wt_checkpoint_progress( session, false); } } break; case WT_SYNC_CLOSE: case WT_SYNC_DISCARD: WT_ERR(__wt_illegal_value(session, syncop)); break; } if (timer) { time_stop = __wt_clock(session); __wt_verbose(session, WT_VERB_CHECKPOINT, "__sync_file WT_SYNC_%s wrote: %" PRIu64 " leaf pages (%" PRIu64 "B), %" PRIu64 " internal pages (%" PRIu64 "B), and took %" PRIu64 "ms", syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_pages, leaf_bytes, internal_pages, internal_bytes, WT_CLOCKDIFF_MS(time_stop, time_start)); } err: /* On error, clear any left-over tree walk. */ WT_TRET(__wt_page_release(session, walk, flags)); WT_TRET(__wt_page_release(session, prev, flags)); /* * If we got a snapshot in order to write pages, and there was no * snapshot active when we started, release it. */ if (txn->isolation == WT_ISO_READ_COMMITTED && saved_pinned_id == WT_TXN_NONE) __wt_txn_release_snapshot(session); /* Clear the checkpoint flag. */ btree->syncing = WT_BTREE_SYNC_OFF; btree->sync_session = NULL; __wt_spin_unlock(session, &btree->flush_lock); /* * Leaves are written before a checkpoint (or as part of a file close, * before checkpointing the file). Start a flush to stable storage, * but don't wait for it. */ if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC)) WT_RET(btree->bm->sync(btree->bm, session, false)); return (ret); }
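/*
 * A simplified, standalone sketch of the write-leaves decision above: skip
 * clean pages, skip "hot" pages (updated at or after the oldest running
 * transaction ID captured before the walk starts), and write the rest.
 * The types and helpers here are illustrative only, not WiredTiger's, and
 * the plain ">=" comparison stands in for the transaction-ID ordering
 * macros used by the real code.
 */
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct demo_page {
    bool modified;          /* page has unwritten changes */
    uint64_t update_txn;    /* ID of the newest update on the page */
    size_t footprint;       /* in-memory size, for statistics */
};

/* Return true if the write-leaves pass should write this page. */
static bool
demo_leaf_should_write(const struct demo_page *page, uint64_t oldest_id)
{
    if (!page->modified)                /* clean: nothing to do */
        return (false);
    if (page->update_txn >= oldest_id)  /* hot: checkpoint revisits it */
        return (false);
    return (true);
}

int
main(void)
{
    struct demo_page pages[] = {
        { false, 0, 4096 },     /* clean */
        { true, 90, 8192 },     /* dirty and stable: write it */
        { true, 150, 8192 },    /* dirty but hot: skip it */
    };
    uint64_t leaf_bytes, leaf_pages, oldest_id;
    size_t i;

    oldest_id = 100;            /* captured before walking the tree */
    leaf_bytes = leaf_pages = 0;
    for (i = 0; i < sizeof(pages) / sizeof(pages[0]); ++i)
        if (demo_leaf_should_write(&pages[i], oldest_id)) {
            leaf_bytes += pages[i].footprint;
            ++leaf_pages;
        }

    printf("would write %" PRIu64 " pages (%" PRIu64 "B)\n",
        leaf_pages, leaf_bytes);
    return (0);
}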
/* * __snapshot_process -- * Process the list of snapshots. */ static int __snapshot_process( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase) { WT_BLOCK_SNAPSHOT *a, *b, *si; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_SNAPSHOT *snap; uint64_t snapshot_size; int deleting, locked; si = &block->live; locked = 0; /* * We've allocated our last page, update the snapshot size. We need to * calculate the live system's snapshot size before reading and merging * snapshot allocation and discard information from the snapshots we're * deleting, those operations will change the underlying byte counts. */ snapshot_size = si->snapshot_size; snapshot_size += si->alloc.bytes; snapshot_size -= si->discard.bytes; /* * Extents that become newly available as a result of deleting previous * snapshots are added to a list of extents. The list should be empty, * but there's no explicit "free the snapshot information" call into the * block manager; if there was an error in an upper level resulting in * the snapshot never being "resolved", the list might not be empty. * * XXX * This isn't sufficient, actually: we're going to leak all the blocks * that were written as part of the last snapshot because it was never * resolved. */ __wt_block_extlist_free(session, &si->snapshot_avail); WT_RET(__wt_block_extlist_init( session, &si->snapshot_avail, "live", "snapshot_avail")); /* * To delete a snapshot, we'll need snapshot information for it, and we * have to read that from the disk. */ deleting = 0; WT_SNAPSHOT_FOREACH(snapbase, snap) { /* * To delete a snapshot, we'll need snapshot information for it * and the subsequent snapshot. The test is tricky, we have to * load the current snapshot's information if it's marked for * deletion, or if it follows a snapshot marked for deletion, * where the boundary cases are the first snapshot in the list * and the last snapshot in the list: if we're deleting the last * snapshot in the list, there's no next snapshot, the snapshot * will be merged into the live tree. */ if (!F_ISSET(snap, WT_SNAP_DELETE) && (snap == snapbase || F_ISSET(snap, WT_SNAP_ADD) || !F_ISSET(snap - 1, WT_SNAP_DELETE))) continue; deleting = 1; /* * Allocate a snapshot structure, crack the cookie and read the * snapshot's extent lists. * * Ignore the avail list: snapshot avail lists are only useful * if we are rolling forward from the particular snapshot and * they represent our best understanding of what blocks can be * allocated. If we are not operating on the live snapshot, * subsequent snapshots might have allocated those blocks, and * the avail list is useless. We don't discard it, because it * is useful as part of verification, but we don't re-write it * either. */ WT_ERR(__wt_calloc( session, 1, sizeof(WT_BLOCK_SNAPSHOT), &snap->bpriv)); si = snap->bpriv; WT_ERR(__wt_block_snap_init(session, block, si, snap->name, 0)); WT_ERR(__wt_block_buffer_to_snapshot( session, block, snap->raw.data, si)); WT_ERR(__wt_block_extlist_read(session, block, &si->alloc)); WT_ERR(__wt_block_extlist_read(session, block, &si->discard)); } /* * Hold a lock so the live extent lists and the file size can't change * underneath us. I suspect we'll tighten this if snapshots take too * much time away from real work: we read historic snapshot information * without a lock, but we could also merge and re-write the delete * snapshot information without a lock, except for ranges merged into * the live tree. */ __wt_spin_lock(session, &block->live_lock); locked = 1; /* Skip the additional processing if we aren't deleting snapshots. 
*/ if (!deleting) goto live_update; /* * Delete any no-longer-needed snapshots: we do this first as it frees * blocks to the live lists, and the freed blocks will then be included * when writing the live extent lists. */ WT_SNAPSHOT_FOREACH(snapbase, snap) { if (!F_ISSET(snap, WT_SNAP_DELETE)) continue; if (WT_VERBOSE_ISSET(session, snapshot)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__snapshot_string( session, block, snap->raw.data, tmp)); WT_VERBOSE_ERR(session, snapshot, "%s: delete-snapshot: %s: %s", block->name, snap->name, (char *)tmp->data); } /* * Set the from/to snapshot structures, where the "to" value * may be the live tree. */ a = snap->bpriv; if (F_ISSET(snap + 1, WT_SNAP_ADD)) b = &block->live; else b = (snap + 1)->bpriv; /* * Free the root page: there's nothing special about this free, * the root page is allocated using normal rules, that is, it * may have been taken from the avail list, and was entered on * the live system's alloc list at that time. We free it into * the snapshot's discard list, however, not the live system's * list because it appears on the snapshot's alloc list and so * must be paired in the snapshot. */ if (a->root_offset != WT_BLOCK_INVALID_OFFSET) WT_ERR(__wt_block_insert_ext(session, &a->discard, a->root_offset, a->root_size)); /* * Free the blocks used to hold the "from" snapshot's extent * lists directly to the live system's avail list, they were * never on any alloc list. Include the "from" snapshot's * avail list, it's going away. */ WT_ERR(__snapshot_extlist_fblocks(session, block, &a->alloc)); WT_ERR(__snapshot_extlist_fblocks(session, block, &a->avail)); WT_ERR(__snapshot_extlist_fblocks(session, block, &a->discard)); /* * Roll the "from" alloc and discard extent lists into the "to" * snapshot's lists. */ if (a->alloc.entries != 0) WT_ERR(__wt_block_extlist_merge( session, &a->alloc, &b->alloc)); if (a->discard.entries != 0) WT_ERR(__wt_block_extlist_merge( session, &a->discard, &b->discard)); /* * If the "to" snapshot is also being deleted, we're done with * it, it's merged into some other snapshot in the next loop. * This means the extent lists may aggregate over a number of * snapshots, but that's OK, they're disjoint sets of ranges. */ if (F_ISSET(snap + 1, WT_SNAP_DELETE)) continue; /* * Find blocks for re-use: wherever the "to" snapshot's allocate * and discard lists overlap is fair game, move ranges appearing * on both lists to the live snapshot's newly available list. */ WT_ERR(__wt_block_extlist_overlap(session, block, b)); /* * If we're updating the live system's information, we're done. */ if (F_ISSET(snap + 1, WT_SNAP_ADD)) continue; /* * We have to write the "to" snapshot's extent lists out in new * blocks, and update its cookie. * * Free the blocks used to hold the "to" snapshot's extent lists * directly to the live system's avail list, they were never on * any alloc list. Do not include the "to" snapshot's avail * list, it's not changing. */ WT_ERR(__snapshot_extlist_fblocks(session, block, &b->alloc)); WT_ERR(__snapshot_extlist_fblocks(session, block, &b->discard)); F_SET(snap + 1, WT_SNAP_UPDATE); } /* Update snapshots marked for update. */ WT_SNAPSHOT_FOREACH(snapbase, snap) if (F_ISSET(snap, WT_SNAP_UPDATE)) { WT_ASSERT(session, !F_ISSET(snap, WT_SNAP_ADD)); WT_ERR(__snapshot_update( session, block, snap, snap->bpriv, 0, 0)); } live_update: si = &block->live; /* Truncate the file if that's possible. 
*/ WT_ERR(__wt_block_extlist_truncate(session, block, &si->avail)); /* Update the final, added snapshot based on the live system. */ WT_SNAPSHOT_FOREACH(snapbase, snap) if (F_ISSET(snap, WT_SNAP_ADD)) { WT_ERR(__snapshot_update( session, block, snap, si, snapshot_size, 1)); /* * XXX * Our caller wants two pieces of information: the time * the snapshot was taken and the final snapshot size. * This violates layering but the alternative is a call * for the btree layer to crack the snapshot cookie into * its components, and that's a fair amount of work. * (We could just read the system time in the session * layer when updating the metadata file, but that won't * work for the snapshot size, and so we do both here.) */ snap->snapshot_size = si->snapshot_size; WT_ERR(__wt_epoch(session, &snap->sec, NULL)); } /* * Reset the live system's alloc and discard extent lists, leave the * avail list alone. */ __wt_block_extlist_free(session, &si->alloc); WT_ERR(__wt_block_extlist_init(session, &si->alloc, "live", "alloc")); __wt_block_extlist_free(session, &si->discard); WT_ERR( __wt_block_extlist_init(session, &si->discard, "live", "discard")); #ifdef HAVE_DIAGNOSTIC /* * The first snapshot in the system should always have an empty discard * list. If we've read that snapshot and/or created it, check. */ WT_SNAPSHOT_FOREACH(snapbase, snap) if (!F_ISSET(snap, WT_SNAP_DELETE)) break; if ((a = snap->bpriv) == NULL) a = &block->live; if (a->discard.entries != 0) { __wt_errx(session, "snapshot incorrectly has blocks on the discard list"); WT_ERR(WT_ERROR); } #endif err: if (locked) __wt_spin_unlock(session, &block->live_lock); /* Discard any snapshot information we loaded, we no longer need it. */ WT_SNAPSHOT_FOREACH(snapbase, snap) if ((si = snap->bpriv) != NULL) { __wt_block_extlist_free(session, &si->alloc); __wt_block_extlist_free(session, &si->avail); __wt_block_extlist_free(session, &si->discard); } __wt_scr_free(&tmp); return (ret); }
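/*
 * Deleting a snapshot above rolls its alloc and discard extent lists into
 * the following snapshot's (or the live system's) lists. A minimal,
 * self-contained sketch of that kind of merge: fold one offset-sorted
 * array of (offset, size) extents into another, keeping running entry and
 * byte counts. The structures are illustrative; WiredTiger's extent lists
 * are richer structures with more bookkeeping than shown here.
 */
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_ext {
    uint64_t off;       /* file offset */
    uint64_t size;      /* extent length in bytes */
};

struct demo_extlist {
    struct demo_ext *ext;   /* extents, sorted by offset */
    size_t entries;
    uint64_t bytes;
};

/* Merge the "from" list into the "to" list; both are sorted by offset. */
static int
demo_extlist_merge(const struct demo_extlist *from, struct demo_extlist *to)
{
    struct demo_ext *merged;
    size_t i, j, k;

    merged = malloc((from->entries + to->entries) * sizeof(*merged));
    if (merged == NULL)
        return (-1);

    for (i = j = k = 0; i < from->entries || j < to->entries;)
        if (j == to->entries ||
            (i < from->entries && from->ext[i].off < to->ext[j].off))
            merged[k++] = from->ext[i++];
        else
            merged[k++] = to->ext[j++];

    free(to->ext);
    to->ext = merged;
    to->entries = k;
    to->bytes += from->bytes;
    return (0);
}

int
main(void)
{
    struct demo_ext from_ext[] = { { 0, 512 }, { 2048, 512 } };
    struct demo_ext *to_ext;
    struct demo_extlist from, to;

    from.ext = from_ext;
    from.entries = 2;
    from.bytes = 1024;

    if ((to_ext = malloc(sizeof(*to_ext))) == NULL)
        return (1);
    to_ext[0].off = 1024;
    to_ext[0].size = 512;
    to.ext = to_ext;
    to.entries = 1;
    to.bytes = 512;

    if (demo_extlist_merge(&from, &to) != 0)
        return (1);
    printf("merged list: %zu entries, %" PRIu64 " bytes\n",
        to.entries, to.bytes);
    free(to.ext);
    return (0);
}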
/* * __snapshot_update -- * Update a snapshot. */ static int __snapshot_update( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snap, WT_BLOCK_SNAPSHOT *si, uint64_t snapshot_size, int is_live) { WT_DECL_ITEM(tmp); WT_DECL_RET; uint8_t *endp; #ifdef HAVE_DIAGNOSTIC /* Check the extent list combinations for overlaps. */ WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->avail)); WT_RET(__wt_block_extlist_check(session, &si->discard, &si->avail)); WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->discard)); #endif /* * Write the snapshot's extent lists; we only write an avail list for * the live system; other snapshots' avail lists are static and never * change. When we do write the avail list for the live system, it's * two lists: the current avail list plus the list of blocks that are * being made available as of the new snapshot. We can't merge that * second list into the real list yet: it's not truly available until * the new snapshot location has been saved to the metadata. */ WT_RET(__wt_block_extlist_write(session, block, &si->alloc, NULL)); if (is_live) WT_RET(__wt_block_extlist_write( session, block, &si->avail, &si->snapshot_avail)); WT_RET(__wt_block_extlist_write(session, block, &si->discard, NULL)); /* * Set the file size for the live system. * * XXX * We do NOT set the file size when re-writing snapshots because we want * to test the snapshot's blocks against a reasonable maximum file size * during verification. This is not good: imagine a snapshot appearing * early in the file, re-written, and then the snapshot requires blocks * at the end of the file, blocks after the listed file size. If the * application opens that snapshot for writing (discarding subsequent * snapshots), we would truncate the file to the early chunk, discarding * the re-written snapshot information. The alternative, updating the * file size, has its own problems: in that case we'd work correctly, but * we'd lose all of the blocks between the original snapshot and the * re-written snapshot. Currently, there's no API to roll forward * intermediate snapshots; if there ever is, this will need to be fixed. */ if (is_live) WT_RET(__wt_filesize(session, block->fh, &si->file_size)); /* Set the snapshot size for the live system. */ if (is_live) si->snapshot_size = snapshot_size; /* * Copy the snapshot information into the snapshot array's address * cookie. */ WT_RET(__wt_buf_init(session, &snap->raw, WT_BTREE_MAX_ADDR_COOKIE)); endp = snap->raw.mem; WT_RET(__wt_block_snapshot_to_buffer(session, block, &endp, si)); snap->raw.size = WT_PTRDIFF32(endp, snap->raw.mem); if (WT_VERBOSE_ISSET(session, snapshot)) { WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__snapshot_string(session, block, snap->raw.data, tmp)); WT_VERBOSE_ERR(session, snapshot, "%s: create-snapshot: %s: %s", block->name, snap->name, (char *)tmp->data); } err: __wt_scr_free(&tmp); return (ret); }
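/*
 * The diagnostic builds above verify that pairs of extent lists never
 * reference the same file ranges (a block can't be both allocated and
 * available, for example). A minimal sketch of such a check over two
 * offset-sorted arrays of extents; the demo_ext type is the same
 * illustrative structure as in the merge sketch, not WiredTiger's.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct demo_ext {
    uint64_t off;
    uint64_t size;
};

/* Return true if any extent in a overlaps any extent in b. */
static bool
demo_extlist_overlap_check(const struct demo_ext *a, size_t a_entries,
    const struct demo_ext *b, size_t b_entries)
{
    size_t i, j;

    for (i = j = 0; i < a_entries && j < b_entries;) {
        if (a[i].off + a[i].size <= b[j].off)
            ++i;            /* a[i] ends before b[j] starts */
        else if (b[j].off + b[j].size <= a[i].off)
            ++j;            /* b[j] ends before a[i] starts */
        else
            return (true);  /* the ranges intersect */
    }
    return (false);
}

int
main(void)
{
    struct demo_ext alloc[] = { { 0, 512 }, { 4096, 1024 } };
    struct demo_ext avail[] = { { 1024, 512 }, { 4608, 256 } };

    /* The second avail extent starts inside the second alloc extent. */
    return (demo_extlist_overlap_check(alloc, 2, avail, 2) ? 0 : 1);
}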
/* * __wt_block_snapshot_load -- * Load a snapshot. */ int __wt_block_snapshot_load(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size, int readonly) { WT_BLOCK_SNAPSHOT *si; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_UNUSED(addr_size); /* * Sometimes we don't find a root page (we weren't given a snapshot, * or the referenced snapshot was empty). In that case we return a * root page size of 0. Set that up now. */ dsk->size = 0; si = &block->live; WT_RET(__wt_block_snap_init(session, block, si, "live", 1)); if (WT_VERBOSE_ISSET(session, snapshot)) { if (addr != NULL) { WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__snapshot_string(session, block, addr, tmp)); } WT_VERBOSE_ERR(session, snapshot, "%s: load-snapshot: %s", block->name, addr == NULL ? "[Empty]" : (char *)tmp->data); } /* If not loading a snapshot from disk, we're done. */ if (addr == NULL || addr_size == 0) return (0); /* Crack the snapshot cookie. */ if (addr != NULL) WT_ERR(__wt_block_buffer_to_snapshot(session, block, addr, si)); /* Verify sets up next. */ if (block->verify) WT_ERR(__wt_verify_snap_load(session, block, si)); /* Read, and optionally verify, any root page. */ if (si->root_offset != WT_BLOCK_INVALID_OFFSET) { WT_ERR(__wt_block_read_off(session, block, dsk, si->root_offset, si->root_size, si->root_cksum)); if (block->verify) { if (tmp == NULL) { WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__snapshot_string( session, block, addr, tmp)); } WT_ERR( __wt_verify_dsk(session, (char *)tmp->data, dsk)); } } /* * Rolling a snapshot forward requires the avail list, the blocks from * which we can allocate. */ if (!readonly) WT_ERR(__wt_block_extlist_read(session, block, &si->avail)); /* * If the snapshot can be written, that means anything written after * the snapshot is no longer interesting. Truncate the file. */ if (!readonly) { WT_VERBOSE_ERR(session, snapshot, "truncate file to %" PRIuMAX, (uintmax_t)si->file_size); WT_ERR(__wt_ftruncate(session, block->fh, si->file_size)); } if (0) { err: (void)__wt_block_snapshot_unload(session, block); } __wt_scr_free(&tmp); return (ret); }
/* * __ovfl_reuse_wrapup -- * Resolve the page's overflow reuse list after a page is written. */ static int __ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BM *bm; WT_OVFL_REUSE **e, **head, *reuse; size_t decr; int i; bm = S2BT(session)->bm; head = page->modify->ovfl_track->ovfl_reuse; /* * Discard any overflow records that aren't in-use, freeing underlying * blocks. * * First, walk the overflow reuse lists (except for the lowest one), * fixing up skiplist links. */ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) for (e = &head[i]; (reuse = *e) != NULL;) { if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) { e = &reuse->next[i]; continue; } *e = reuse->next[i]; } /* * Second, discard any overflow record without an in-use flag, clear * the flags for the next run. * * As part of the pass through the lowest level, figure out how much * space we added/subtracted from the page, and update its footprint. * We don't get it exactly correct because we don't know the depth of * the skiplist here, but it's close enough, and figuring out the * memory footprint change in the reconciliation wrapup code means * fewer atomic updates and less code overall. */ decr = 0; for (e = &head[0]; (reuse = *e) != NULL;) { if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) { F_CLR(reuse, WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED); e = &reuse->next[0]; continue; } *e = reuse->next[0]; WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)); if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) WT_RET( __ovfl_reuse_verbose(session, page, reuse, "free")); WT_RET(bm->free( bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size)); decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE); __wt_free(session, reuse); } if (decr != 0) __wt_cache_page_inmem_decr(session, page, decr); return (0); }
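/*
 * The wrapup code above removes skiplist nodes that aren't flagged in-use:
 * the upper levels are unlinked first, then the lowest level both unlinks
 * and frees the nodes, clearing flags on the survivors for the next run.
 * A self-contained sketch of that pattern on a minimal skiplist; the node
 * layout and the single flag are illustrative, not WiredTiger's
 * WT_OVFL_REUSE structure.
 */
#include <stdbool.h>
#include <stdlib.h>

#define DEMO_SKIP_MAXDEPTH 5

struct demo_node {
    bool in_use;                                /* keep if set */
    struct demo_node *next[DEMO_SKIP_MAXDEPTH]; /* per-level links */
};

/* Remove (and free) every node whose in-use flag isn't set. */
static void
demo_skip_prune(struct demo_node **head)
{
    struct demo_node **e, *node;
    int i;

    /* Upper levels: unlink only, the node is still reachable below. */
    for (i = DEMO_SKIP_MAXDEPTH - 1; i > 0; --i)
        for (e = &head[i]; (node = *e) != NULL;)
            if (node->in_use)
                e = &node->next[i];
            else
                *e = node->next[i];

    /* Lowest level: unlink and free; clear flags for the next run. */
    for (e = &head[0]; (node = *e) != NULL;)
        if (node->in_use) {
            node->in_use = false;
            e = &node->next[0];
        } else {
            *e = node->next[0];
            free(node);
        }
}

int
main(void)
{
    struct demo_node *head[DEMO_SKIP_MAXDEPTH] = { NULL };
    struct demo_node *a, *b;

    /* Two nodes linked at level 0 only; keep the first, prune the second. */
    if ((a = calloc(1, sizeof(*a))) == NULL ||
        (b = calloc(1, sizeof(*b))) == NULL)
        return (1);
    a->in_use = true;
    a->next[0] = b;
    head[0] = a;

    demo_skip_prune(head);  /* frees b, clears a's in-use flag */
    free(a);
    return (0);
}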
/* * __ckpt_process -- * Process the list of checkpoints. */ static int __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) { WT_BLOCK_CKPT *a, *b, *ci; WT_CKPT *ckpt, *next_ckpt; WT_DECL_ITEM(tmp); WT_DECL_RET; uint64_t ckpt_size; bool deleting, fatal, locked; ci = &block->live; fatal = locked = false; #ifdef HAVE_DIAGNOSTIC WT_RET(__ckpt_verify(session, ckptbase)); #endif /* * Checkpoints are a two-step process: first, write a new checkpoint to * disk (including all the new extent lists for modified checkpoints * and the live system). As part of this, create a list of file blocks * newly available for reallocation, based on checkpoints being deleted. * We then return the locations of the new checkpoint information to our * caller. Our caller has to write that information into some kind of * stable storage, and once that's done, we can actually allocate from * that list of newly available file blocks. (We can't allocate from * that list immediately because the allocation might happen before our * caller saves the new checkpoint information, and if we crashed before * the new checkpoint location was saved, we'd have overwritten blocks * still referenced by checkpoints in the system.) In summary, there is * a second step: after our caller saves the checkpoint information, we * are called to add the newly available blocks into the live system's * available list. * * This function is the first step, the second step is in the resolve * function. * * If we're called to checkpoint the same file twice (without the second * resolution step), or re-entered for any reason, it's an error in our * caller, and our choices are all bad: leak blocks or potentially crash * with our caller not yet having saved previous checkpoint information * to stable storage. */ __wt_spin_lock(session, &block->live_lock); if (block->ckpt_inprogress) ret = __wt_block_panic(session, EINVAL, "%s: unexpected checkpoint ordering", block->name); else block->ckpt_inprogress = true; __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); /* * Extents newly available as a result of deleting previous checkpoints * are added to a list of extents. The list should be empty, but as * described above, there is no "free the checkpoint information" call * into the block manager; if there was an error in an upper level that * resulted in some previous checkpoint never being resolved, the list * may not be empty. We should have caught that with the "checkpoint * in progress" test, but it doesn't cost us anything to be cautious. * * We free the checkpoint's allocation and discard extent lists as part * of the resolution step, not because they're needed at that time, but * because it's potentially a lot of work, and waiting allows the btree * layer to continue eviction sooner. As for the checkpoint-available * list, make sure they get cleaned out. */ __wt_block_extlist_free(session, &ci->ckpt_avail); WT_RET(__wt_block_extlist_init( session, &ci->ckpt_avail, "live", "ckpt_avail", true)); __wt_block_extlist_free(session, &ci->ckpt_alloc); __wt_block_extlist_free(session, &ci->ckpt_discard); /* * To delete a checkpoint, we'll need checkpoint information for it and * the subsequent checkpoint into which it gets rolled; read them from * disk before we lock things down. 
*/ deleting = false; WT_CKPT_FOREACH(ckptbase, ckpt) { if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; deleting = true; /* * Read the checkpoint and next checkpoint extent lists if we * haven't already read them (we may have already read these * extent blocks if there is more than one deleted checkpoint). */ if (ckpt->bpriv == NULL) WT_ERR(__ckpt_extlist_read(session, block, ckpt)); for (next_ckpt = ckpt + 1;; ++next_ckpt) if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) break; /* * The "next" checkpoint may be the live tree which has no * extent blocks to read. */ if (next_ckpt->bpriv == NULL && !F_ISSET(next_ckpt, WT_CKPT_ADD)) WT_ERR(__ckpt_extlist_read(session, block, next_ckpt)); } /* * Failures are now fatal: we can't currently back out the merge of any * deleted checkpoint extent lists into the live system's extent lists, * so continuing after error would leave the live system's extent lists * corrupted for any subsequent checkpoint (and potentially, should a * subsequent checkpoint succeed, for recovery). */ fatal = true; /* * Hold a lock so the live extent lists and the file size can't change * underneath us. I suspect we'll tighten this if checkpoints take too * much time away from real work: we read the historic checkpoint * information without a lock, but we could also merge and re-write the * deleted and merged checkpoint information without a lock, except for * the final merge of ranges into the live tree. */ __wt_spin_lock(session, &block->live_lock); locked = true; /* * We've allocated our last page, update the checkpoint size. We need * to calculate the live system's checkpoint size before merging * checkpoint allocation and discard information from the checkpoints * we're deleting, those operations change the underlying byte counts. */ ckpt_size = ci->ckpt_size; ckpt_size += ci->alloc.bytes; ckpt_size -= ci->discard.bytes; /* Skip the additional processing if we aren't deleting checkpoints. */ if (!deleting) goto live_update; /* * Delete any no-longer-needed checkpoints: we do this first as it frees * blocks to the live lists, and the freed blocks will then be included * when writing the live extent lists. */ WT_CKPT_FOREACH(ckptbase, ckpt) { if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; #ifdef HAVE_VERBOSE if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string( session, block, ckpt->raw.data, tmp)); __wt_verbose(session, WT_VERB_CHECKPOINT, "%s: delete-checkpoint: %s: %s", block->name, ckpt->name, (const char *)tmp->data); } #endif /* * Find the checkpoint into which we'll roll this checkpoint's * blocks: it's the next real checkpoint in the list, and it * better have been read in (if it's not the add slot). */ for (next_ckpt = ckpt + 1;; ++next_ckpt) if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) break; /* * Set the from/to checkpoint structures, where the "to" value * may be the live tree. */ a = ckpt->bpriv; if (F_ISSET(next_ckpt, WT_CKPT_ADD)) b = &block->live; else b = next_ckpt->bpriv; /* * Free the root page: there's nothing special about this free, * the root page is allocated using normal rules, that is, it * may have been taken from the avail list, and was entered on * the live system's alloc list at that time. We free it into * the checkpoint's discard list, however, not the live system's * list because it appears on the checkpoint's alloc list and so * must be paired in the checkpoint. 
*/ if (a->root_offset != WT_BLOCK_INVALID_OFFSET) WT_ERR(__wt_block_insert_ext(session, block, &a->discard, a->root_offset, a->root_size)); /* * Free the blocks used to hold the "from" checkpoint's extent * lists, including the avail list. */ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc)); WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail)); WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard)); /* * Roll the "from" alloc and discard extent lists into the "to" * checkpoint's lists. */ if (a->alloc.entries != 0) WT_ERR(__wt_block_extlist_merge( session, block, &a->alloc, &b->alloc)); if (a->discard.entries != 0) WT_ERR(__wt_block_extlist_merge( session, block, &a->discard, &b->discard)); /* * If the "to" checkpoint is also being deleted, we're done with * it, it's merged into some other checkpoint in the next loop. * This means the extent lists may aggregate over a number of * checkpoints, but that's OK, they're disjoint sets of ranges. */ if (F_ISSET(next_ckpt, WT_CKPT_DELETE)) continue; /* * Find blocks for re-use: wherever the "to" checkpoint's * allocate and discard lists overlap, move the range to * the live system's checkpoint available list. */ WT_ERR(__wt_block_extlist_overlap(session, block, b)); /* * If we're updating the live system's information, we're done. */ if (F_ISSET(next_ckpt, WT_CKPT_ADD)) continue; /* * We have to write the "to" checkpoint's extent lists out in * new blocks, and update its cookie. * * Free the blocks used to hold the "to" checkpoint's extent * lists; don't include the avail list, it's not changing. */ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc)); WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard)); F_SET(next_ckpt, WT_CKPT_UPDATE); } /* Update checkpoints marked for update. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_UPDATE)) WT_ERR(__ckpt_update( session, block, ckpt, ckpt->bpriv, false)); live_update: /* Truncate the file if that's possible. */ WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail)); /* Update the final, added checkpoint based on the live system. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_ADD)) { /* * !!! * Our caller wants the final checkpoint size. Setting * the size here violates layering, but the alternative * is a call for the btree layer to crack the checkpoint * cookie into its components, and that's a fair amount * of work. */ ckpt->ckpt_size = ckpt_size; /* * Set the rolling checkpoint size for the live system. * The current size includes the current checkpoint's * root page size (root pages are on the checkpoint's * block allocation list as root pages are allocated * with the usual block allocation functions). That's * correct, but we don't want to include it in the size * for the next checkpoint. */ ckpt_size -= ci->root_size; /* * Additionally, we had a bug for a while where the live * checkpoint size grew without bound. We can't * sanity-check the value; that would require walking * the tree as part of the checkpoint. Bound any such * bug at the size of the file. It isn't practical to * assert that the value is within bounds since * databases created with older versions of WiredTiger * (2.8.0) would likely see an error. */ ci->ckpt_size = WT_MIN(ckpt_size, (uint64_t)block->size); WT_ERR(__ckpt_update(session, block, ckpt, ci, true)); } /* * Reset the live system's alloc and discard extent lists, leave the * avail list alone.
This includes freeing a lot of extents, so do it * outside of the system's lock by copying and resetting the original, * then doing the work later. */ ci->ckpt_alloc = ci->alloc; WT_ERR(__wt_block_extlist_init( session, &ci->alloc, "live", "alloc", false)); ci->ckpt_discard = ci->discard; WT_ERR(__wt_block_extlist_init( session, &ci->discard, "live", "discard", false)); #ifdef HAVE_DIAGNOSTIC /* * The first checkpoint in the system should always have an empty * discard list. If we've read that checkpoint and/or created it, * check. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (!F_ISSET(ckpt, WT_CKPT_DELETE)) break; if ((a = ckpt->bpriv) == NULL) a = &block->live; if (a->discard.entries != 0) WT_ERR_MSG(session, WT_ERROR, "first checkpoint incorrectly has blocks on the discard " "list"); #endif err: if (ret != 0 && fatal) ret = __wt_block_panic(session, ret, "%s: fatal checkpoint failure", block->name); if (locked) __wt_spin_unlock(session, &block->live_lock); /* Discard any checkpoint information we loaded. */ WT_CKPT_FOREACH(ckptbase, ckpt) if ((ci = ckpt->bpriv) != NULL) __wt_block_ckpt_destroy(session, ci); __wt_scr_free(session, &tmp); return (ret); }
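/*
 * The block manager's checkpoint is a two-step protocol: first write the
 * new checkpoint and collect the blocks that will become free, and only
 * after the caller has durably saved the new checkpoint's location fold
 * those blocks into the live available list. A self-contained sketch of
 * the caller-side ordering and the in-progress guard; every demo_* name
 * here is hypothetical, this is not the WiredTiger API.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_block {
    bool ckpt_inprogress;   /* step one done, step two pending */
};

/* Step one: write the checkpoint, remember the newly freed blocks. */
static int
demo_ckpt_process(struct demo_block *block)
{
    if (block->ckpt_inprogress) /* re-entered without resolving */
        return (EINVAL);
    /* ... write extent lists, collect newly available blocks ... */
    block->ckpt_inprogress = true;
    return (0);
}

/* Step two: the checkpoint is durable, the freed blocks are safe to reuse. */
static int
demo_ckpt_resolve(struct demo_block *block)
{
    if (!block->ckpt_inprogress)
        return (EINVAL);
    /* ... merge the saved "newly available" list into the live avail list ... */
    block->ckpt_inprogress = false;
    return (0);
}

int
main(void)
{
    struct demo_block block = { false };

    if (demo_ckpt_process(&block) != 0)
        return (1);
    /*
     * The caller must make the new checkpoint location durable here (for
     * example, by updating and syncing its metadata) before resolving;
     * otherwise a crash could leave reused blocks still referenced by the
     * previous checkpoint.
     */
    if (demo_ckpt_resolve(&block) != 0)
        return (1);
    printf("checkpoint processed and resolved\n");
    return (0);
}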
/* * __wt_txn_update_oldest -- * Sweep the running transactions to update the oldest ID required. */ int __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *oldest_session; WT_TXN_GLOBAL *txn_global; uint64_t current_id, last_running, oldest_id; uint64_t prev_last_running, prev_oldest_id; bool strict, wait; conn = S2C(session); txn_global = &conn->txn_global; strict = LF_ISSET(WT_TXN_OLDEST_STRICT); wait = LF_ISSET(WT_TXN_OLDEST_WAIT); current_id = last_running = txn_global->current; prev_last_running = txn_global->last_running; prev_oldest_id = txn_global->oldest_id; /* * For pure read-only workloads, or if the update isn't forced and the * oldest ID isn't too far behind, avoid scanning. */ if (prev_oldest_id == current_id || (!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100))) return (0); /* First do a read-only scan. */ if (wait) __wt_readlock(session, txn_global->scan_rwlock); else if ((ret = __wt_try_readlock(session, txn_global->scan_rwlock)) != 0) return (ret == EBUSY ? 0 : ret); __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session); __wt_readunlock(session, txn_global->scan_rwlock); /* * If the state hasn't changed (or hasn't moved far enough for * non-forced updates), give up. */ if ((oldest_id == prev_oldest_id || (!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) && ((last_running == prev_last_running) || (!strict && WT_TXNID_LT(last_running, prev_last_running + 100)))) return (0); /* It looks like an update is necessary, wait for exclusive access. */ if (wait) __wt_writelock(session, txn_global->scan_rwlock); else if ((ret = __wt_try_writelock(session, txn_global->scan_rwlock)) != 0) return (ret == EBUSY ? 0 : ret); /* * If the oldest ID has been updated while we waited, don't bother * scanning. */ if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) && WT_TXNID_LE(last_running, txn_global->last_running)) goto done; /* * Re-scan now that we have exclusive access. This is necessary because * threads get transaction snapshots with read locks, and we have to be * sure that there isn't a thread that has got a snapshot locally but * not yet published its snap_min. */ __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session); #ifdef HAVE_DIAGNOSTIC { /* * Make sure the ID doesn't move past any named snapshots. * * Don't include the read/assignment in the assert statement. Coverity * complains if there are assignments only done in diagnostic builds, * and when the read is from a volatile. */ uint64_t id = txn_global->nsnap_oldest_id; WT_ASSERT(session, id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); } #endif /* Update the oldest ID. */ if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) txn_global->oldest_id = oldest_id; if (WT_TXNID_LT(txn_global->last_running, last_running)) { txn_global->last_running = last_running; #ifdef HAVE_VERBOSE /* Output a verbose message about long-running transactions, * but only when some progress is being made. */ if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && current_id - oldest_id > 10000 && oldest_session != NULL) { __wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 " pinned in session %" PRIu32 " [%s]" " with snap_min %" PRIu64 "\n", oldest_id, oldest_session->id, oldest_session->lastop, oldest_session->txn.snap_min); } #endif } done: __wt_writeunlock(session, txn_global->scan_rwlock); return (ret); }
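/*
 * Updating the oldest ID above follows a common pattern: scan under a
 * shared (read) lock, and only if the result suggests an update is needed,
 * take the exclusive (write) lock, re-check, and publish. A self-contained
 * sketch of that pattern with a plain pthread rwlock; the "oldest" value
 * and the scan are stand-ins, not WiredTiger's transaction-global state.
 */
#include <inttypes.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t demo_lock = PTHREAD_RWLOCK_INITIALIZER;
static uint64_t demo_oldest;    /* updated only under the write lock */

/* Stand-in for scanning running transactions for the oldest needed ID. */
static uint64_t
demo_scan(void)
{
    return (demo_oldest + 1);   /* pretend the oldest can advance by one */
}

static int
demo_update_oldest(void)
{
    uint64_t candidate;

    /* First pass: read-only scan, cheap and concurrent. */
    if (pthread_rwlock_rdlock(&demo_lock) != 0)
        return (-1);
    candidate = demo_scan();
    (void)pthread_rwlock_unlock(&demo_lock);

    if (candidate <= demo_oldest)   /* nothing worth publishing */
        return (0);

    /* Second pass: exclusive access, re-check before publishing. */
    if (pthread_rwlock_wrlock(&demo_lock) != 0)
        return (-1);
    candidate = demo_scan();
    if (candidate > demo_oldest)
        demo_oldest = candidate;
    (void)pthread_rwlock_unlock(&demo_lock);
    return (0);
}

int
main(void)
{
    if (demo_update_oldest() != 0)
        return (1);
    printf("oldest advanced to %" PRIu64 "\n", demo_oldest);
    return (0);
}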
/* * __ckpt_update -- * Update a checkpoint. */ static int __ckpt_update(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci, bool is_live) { WT_DECL_ITEM(tmp); WT_DECL_RET; uint8_t *endp; #ifdef HAVE_DIAGNOSTIC /* Check the extent list combinations for overlaps. */ WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail)); WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail)); WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard)); #endif /* * Write the checkpoint's alloc and discard extent lists. After each * write, remove any allocated blocks from the system's allocation * list: checkpoint extent blocks don't appear on any extent lists. */ WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL)); WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL)); /* * We only write an avail list for the live system; other checkpoints' * avail lists are static and never change. * * Write the avail list last so it reflects changes due to allocating * blocks for the alloc and discard lists. Also, when we write the * live system's avail list, it's two lists: the current avail list * plus the list of blocks to be made available when the new checkpoint * completes. We can't merge that second list into the real list yet: * it's not truly available until the new checkpoint locations have been * saved to the metadata. */ if (is_live) WT_RET(__wt_block_extlist_write( session, block, &ci->avail, &ci->ckpt_avail)); /* * Set the file size for the live system. * * !!! * We do NOT set the file size when re-writing checkpoints because we * want to test the checkpoint's blocks against a reasonable maximum * file size during verification. This is bad: imagine a checkpoint * appearing early in the file, re-written, and then the checkpoint * requires blocks at the end of the file, blocks after the listed file * size. If the application opens that checkpoint for writing * (discarding subsequent checkpoints), we would truncate the file to * the early chunk, discarding the re-written checkpoint information. * The alternative, updating the file size, has its own problems: in * that case we'd work correctly, but we'd lose all of the blocks * between the original checkpoint and the re-written checkpoint. * Currently, there's no API to roll forward intermediate checkpoints; * if there ever is, this will need to be fixed. */ if (is_live) ci->file_size = block->size; /* * Copy the checkpoint information into the checkpoint array's address * cookie. */ WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE)); endp = ckpt->raw.mem; WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci)); ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem); if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp)); __wt_verbose(session, WT_VERB_CHECKPOINT, "%s: create-checkpoint: %s: %s", block->name, ckpt->name, (const char *)tmp->data); } err: __wt_scr_free(session, &tmp); return (ret); }
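/*
 * The checkpoint information is returned to the caller as an opaque
 * "address cookie": values packed into a small byte buffer, with the final
 * size computed from how far the end pointer advanced (the WT_PTRDIFF step
 * above). A minimal sketch of that packing style using fixed-width
 * big-endian encoding; WiredTiger's actual cookie format and pack
 * functions are different, this only illustrates the end-pointer idiom.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_COOKIE_MAX 64

/* Append a 64-bit value to the buffer in big-endian order. */
static void
demo_pack_uint64(uint8_t **pp, uint64_t v)
{
    int i;

    for (i = 7; i >= 0; --i)
        *(*pp)++ = (uint8_t)(v >> (i * 8));
}

int
main(void)
{
    uint8_t cookie[DEMO_COOKIE_MAX], *endp;
    uint64_t root_offset, root_size, root_cksum;
    size_t cookie_size;

    root_offset = 4096;
    root_size = 512;
    root_cksum = 0xdeadbeef;

    endp = cookie;
    demo_pack_uint64(&endp, root_offset);
    demo_pack_uint64(&endp, root_size);
    demo_pack_uint64(&endp, root_cksum);

    /* The cookie's size is simply how far the end pointer advanced. */
    cookie_size = (size_t)(endp - cookie);
    printf("cookie is %zu bytes\n", cookie_size);
    return (0);
}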
/* * __wt_block_checkpoint_load -- * Load a checkpoint. */ int __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size, int readonly) { WT_BLOCK_CKPT *ci; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_UNUSED(addr_size); /* * Sometimes we don't find a root page (we weren't given a checkpoint, * or the referenced checkpoint was empty). In that case we return a * root page size of 0. Set that up now. */ dsk->size = 0; ci = &block->live; WT_RET(__wt_block_ckpt_init(session, block, ci, "live", 1)); if (WT_VERBOSE_ISSET(session, ckpt)) { if (addr != NULL) { WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string(session, block, addr, tmp)); } WT_VERBOSE_ERR(session, ckpt, "%s: load-checkpoint: %s", block->name, addr == NULL ? "[Empty]" : (char *)tmp->data); } /* If not loading a checkpoint from disk, we're done. */ if (addr == NULL || addr_size == 0) return (0); /* Crack the checkpoint cookie. */ if (addr != NULL) WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci)); /* Verify sets up next. */ if (block->verify) WT_ERR(__wt_verify_ckpt_load(session, block, ci)); /* Read, and optionally verify, any root page. */ if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) { WT_ERR(__wt_block_read_off(session, block, dsk, ci->root_offset, ci->root_size, ci->root_cksum)); if (block->verify) { if (tmp == NULL) { WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string( session, block, addr, tmp)); } WT_ERR( __wt_verify_dsk(session, (char *)tmp->data, dsk)); } } /* * Rolling a checkpoint forward requires the avail list, the blocks from * which we can allocate. */ if (!readonly) WT_ERR( __wt_block_extlist_read_avail(session, block, &ci->avail)); /* * If the checkpoint can be written, that means anything written after * the checkpoint is no longer interesting, truncate the file. Don't * bother checking the avail list for a block at the end of the file, * that was done when the checkpoint was first written (re-writing the * checkpoint might possibly make it relevant here, but it's unlikely * enough that I'm not bothering). */ if (!readonly) { WT_VERBOSE_ERR(session, ckpt, "truncate file to %" PRIuMAX, (uintmax_t)ci->file_size); WT_ERR(__wt_ftruncate(session, block->fh, ci->file_size)); } if (0) { err: (void)__wt_block_checkpoint_unload(session, block); } __wt_scr_free(&tmp); return (ret); }
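/*
 * The "if (0) { err: ... }" construct above is a cleanup idiom: the
 * labelled block is unreachable on the success path and only entered via
 * goto when an earlier step fails, so the partially loaded state is
 * unloaded on error while the shared cleanup (freeing the scratch buffer)
 * runs on both paths. A self-contained sketch of the idiom; the demo_*
 * helpers are placeholders, not WiredTiger functions.
 */
#include <stdio.h>
#include <stdlib.h>

static int
demo_step(int fail)
{
    return (fail ? -1 : 0);
}

static int
demo_load(int fail_second_step)
{
    char *scratch;
    int ret;

    if ((scratch = malloc(64)) == NULL)
        return (-1);

    if ((ret = demo_step(0)) != 0)
        goto err;
    if ((ret = demo_step(fail_second_step)) != 0)
        goto err;

    if (0) {
err:        /* Error path only: undo the work the first step did. */
        printf("unloading partially loaded state\n");
    }

    /* Shared cleanup, on both the success and error paths. */
    free(scratch);
    return (ret);
}

int
main(void)
{
    printf("success path returned %d\n", demo_load(0));
    printf("error path returned %d\n", demo_load(1));
    return (0);
}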