/*
 * __wt_btcur_update_check --
 *     Check whether an update would conflict.
 *
 * This can be used to replace WT_CURSOR::insert or WT_CURSOR::update, so
 * they only check for conflicts without updating the tree. It is used to
 * maintain snapshot isolation for transactions that span multiple chunks
 * in an LSM tree.
 */
int
__wt_btcur_update_check(WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    cursor = &cbt->iface;
    btree = cbt->btree;
    session = (WT_SESSION_IMPL *)cursor->session;

retry:
    WT_RET(__cursor_func_init(cbt, 1));

    switch (btree->type) {
    case BTREE_ROW:
        WT_ERR(__cursor_row_search(session, cbt, NULL, 1));

        /* Just check for conflicts. */
        ret = __curfile_update_check(cbt);
        break;
    case BTREE_COL_FIX:
    case BTREE_COL_VAR:
        WT_ILLEGAL_VALUE_ERR(session);
    }

err:
    if (ret == WT_RESTART)
        goto retry;
    WT_TRET(__curfile_leave(cbt));
    if (ret != 0)
        WT_TRET(__cursor_reset(cbt));
    return (ret);
}
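/*
 * For illustration only -- a minimal sketch of the call pattern described
 * above. Everything here except __wt_btcur_update_check itself is
 * hypothetical: an LSM-style writer that has already positioned the same
 * key in a btree cursor for each older chunk runs the conflict check
 * against those chunks before modifying the newest one, so a concurrent
 * update in an older chunk fails this transaction instead of being lost.
 */
static int
lsm_check_older_chunks(WT_CURSOR_BTREE **chunk_cbts, u_int nchunks)
{
    u_int i;

    /* Chunk 0 is the newest; only the older chunks are checked. */
    for (i = 1; i < nchunks; i++)
        WT_RET(__wt_btcur_update_check(chunk_cbts[i]));
    return (0);
}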
/*
 * __wt_btcur_update --
 *     Update a record in the tree.
 */
int
__wt_btcur_update(WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    btree = cbt->btree;
    cursor = &cbt->iface;
    session = (WT_SESSION_IMPL *)cursor->session;

    WT_BSTAT_INCR(session, cursor_updates);

    if (btree->type == BTREE_ROW)
        WT_RET(__cursor_size_chk(session, &cursor->key));
    WT_RET(__cursor_size_chk(session, &cursor->value));

retry:
    __cursor_func_init(cbt, 1);

    switch (btree->type) {
    case BTREE_COL_FIX:
        if (cursor->value.size != 1)
            WT_RET_MSG(session, EINVAL,
                "item size of %" PRIu32 " does not match "
                "fixed-length file requirement of 1 byte",
                cursor->value.size);
        /* FALLTHROUGH */
    case BTREE_COL_VAR:
        WT_ERR(__wt_col_search(session, cbt, 1));

        /*
         * Update the record if it exists. Creating a record past the
         * end of the tree in a fixed-length column-store implicitly
         * fills the gap with empty records. Update the record in that
         * case, the record exists.
         */
        if ((cbt->compare != 0 || __cursor_invalid(cbt)) &&
            !__cursor_fix_implicit(btree, cbt))
            ret = WT_NOTFOUND;
        else if ((ret = __wt_col_modify(session, cbt, 3)) == WT_RESTART)
            goto retry;
        break;
    case BTREE_ROW:
        /* Update the record if it exists. */
        WT_ERR(__wt_row_search(session, cbt, 1));
        if (cbt->compare != 0 || __cursor_invalid(cbt))
            ret = WT_NOTFOUND;
        else if ((ret = __wt_row_modify(session, cbt, 0)) == WT_RESTART)
            goto retry;
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

err:
    __cursor_func_resolve(cbt, ret);
    return (ret);
}
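/*
 * For context, a sketch of how this path is reached from the public API
 * (the URI and the assumption of string key/value formats are
 * illustrative): with "overwrite=false", an update of a missing key fails
 * with WT_NOTFOUND, matching the not-found checks above.
 */
#include <wiredtiger.h>

static int
update_existing(WT_SESSION *session)
{
    WT_CURSOR *cursor;
    int ret;

    if ((ret = session->open_cursor(session,
        "table:example", NULL, "overwrite=false", &cursor)) != 0)
        return (ret);

    cursor->set_key(cursor, "some-key");
    cursor->set_value(cursor, "new-value");
    ret = cursor->update(cursor);    /* WT_NOTFOUND if the key is absent. */

    (void)cursor->close(cursor);
    return (ret);
}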
/*
 * __wt_bt_cache_op --
 *     Cache operations: compaction, discard, sync/checkpoint.
 */
int
__wt_bt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op)
{
    WT_DECL_RET;
    WT_BTREE *btree;

    btree = session->btree;

    /*
     * Compaction and sync/checkpoint reconcile dirty pages from the cache
     * to the backing block manager. Reconciliation is just another reader
     * of the page, so with some care, it can be done in the current
     * thread, leaving the eviction thread to keep freeing space if the
     * cache is full. Sync and eviction cannot operate on the same page at
     * the same time, and there are different modes inside __wt_tree_walk
     * to make sure they don't trip over each other.
     *
     * The current thread cannot evict pages from the cache, so discard is
     * done by calling the eviction server for service.
     *
     * XXX
     * Set the checkpoint reference for reconciliation -- this is ugly,
     * but there's no data structure path from here to reconciliation.
     *
     * Publish: there must be a barrier to ensure the structure fields are
     * set before the eviction thread can see the request.
     */
    WT_PUBLISH(btree->ckpt, ckptbase);

    switch (op) {
    case WT_SYNC_CHECKPOINT:
    case WT_SYNC_COMPACT:
    case WT_SYNC_WRITE_LEAVES:
        WT_ERR(__wt_sync_file(session, op));
        break;
    case WT_SYNC_DISCARD:
    case WT_SYNC_DISCARD_NOWRITE:
        /*
         * Schedule and wake the eviction server, then wait for the
         * eviction server to wake us.
         */
        WT_ERR(__wt_sync_file_serial(session, op));
        WT_ERR(__wt_evict_server_wake(session));
        WT_ERR(__wt_cond_wait(session, session->cond, 0));
        ret = session->syncop_ret;

        /* If discarding the tree, the root page should be gone. */
        WT_ASSERT(session, ret != 0 || btree->root_page == NULL);
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

err:
    btree->ckpt = NULL;
    return (ret);
}
/*
 * __wt_btcur_remove --
 *     Remove a record from the tree.
 */
int
__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    btree = cbt->btree;
    cursor = &cbt->iface;
    session = (WT_SESSION_IMPL *)cursor->session;

    WT_BSTAT_INCR(session, cursor_removes);

    if (btree->type == BTREE_ROW)
        WT_RET(__cursor_size_chk(session, &cursor->key));

retry:
    __cursor_func_init(cbt, 1);

    switch (btree->type) {
    case BTREE_COL_FIX:
    case BTREE_COL_VAR:
        WT_ERR(__wt_col_search(session, cbt, 1));

        /*
         * Remove the record if it exists. Creating a record past the
         * end of the tree in a fixed-length column-store implicitly
         * fills the gap with empty records. Return success in that
         * case, the record was deleted successfully.
         */
        if (cbt->compare != 0 || __cursor_invalid(cbt))
            ret = __cursor_fix_implicit(btree, cbt) ? 0 : WT_NOTFOUND;
        else if ((ret = __wt_col_modify(session, cbt, 2)) == WT_RESTART)
            goto retry;
        break;
    case BTREE_ROW:
        /* Remove the record if it exists. */
        WT_ERR(__wt_row_search(session, cbt, 1));
        if (cbt->compare != 0 || __cursor_invalid(cbt))
            ret = WT_NOTFOUND;
        else if ((ret = __wt_row_modify(session, cbt, 1)) == WT_RESTART)
            goto retry;
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

err:
    __cursor_func_resolve(cbt, ret);
    return (ret);
}
/*
 * __wt_cache_op --
 *     Cache operations.
 */
int
__wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op)
{
    WT_DECL_RET;
    WT_BTREE *btree;

    btree = S2BT(session);

    switch (op) {
    case WT_SYNC_CHECKPOINT:
    case WT_SYNC_CLOSE:
        /*
         * Set the checkpoint reference for reconciliation; it's ugly,
         * but drilling a function parameter path from our callers to
         * the reconciliation of the tree's root page is going to be
         * worse.
         */
        WT_ASSERT(session, btree->ckpt == NULL);
        btree->ckpt = ckptbase;
        break;
    }

    switch (op) {
    case WT_SYNC_CHECKPOINT:
    case WT_SYNC_WRITE_LEAVES:
        WT_ERR(__sync_file(session, op));
        break;
    case WT_SYNC_CLOSE:
    case WT_SYNC_DISCARD:
    case WT_SYNC_DISCARD_FORCE:
        WT_ERR(__wt_evict_file(session, op));
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

err:
    switch (op) {
    case WT_SYNC_CHECKPOINT:
    case WT_SYNC_CLOSE:
        btree->ckpt = NULL;
        break;
    }
    return (ret);
}
/*
 * __wt_btcur_prev --
 *     Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int discard)
{
    WT_DECL_RET;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    uint32_t flags;
    int newpage;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    WT_DSTAT_INCR(session, cursor_prev);

    flags = WT_TREE_SKIP_INTL | WT_TREE_PREV;    /* Tree walk flags. */
    if (discard)
        LF_SET(WT_TREE_DISCARD);

retry:
    WT_RET(__cursor_func_init(cbt, 0));
    __cursor_position_clear(cbt);

    /*
     * If we aren't already iterating in the right direction, there's
     * some setup to do.
     */
    if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
        __wt_btcur_iterate_setup(cbt, 0);

    /*
     * If this is a modification, we're about to read information from
     * the page, save the write generation.
     */
    page = cbt->page;
    if (discard && page != NULL) {
        WT_ERR(__wt_page_modify_init(session, page));
        WT_ORDERED_READ(cbt->write_gen, page->modify->write_gen);
    }

    /*
     * Walk any page we're holding until the underlying call returns
     * not-found. Then, move to the previous page, until we reach the
     * start of the file.
     */
    for (newpage = 0;; newpage = 1) {
        if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_append_prev(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_append_prev(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret == 0)
                break;
            F_CLR(cbt, WT_CBT_ITERATE_APPEND);
            if (ret != WT_NOTFOUND)
                break;
            newpage = 1;
        }
        if (page != NULL) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_prev(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_prev(cbt, newpage);
                break;
            case WT_PAGE_ROW_LEAF:
                ret = __cursor_row_prev(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret != WT_NOTFOUND)
                break;
        }

        cbt->page = NULL;
        WT_ERR(__wt_tree_walk(session, &page, flags));
        WT_ERR_TEST(page == NULL, WT_NOTFOUND);
        WT_ASSERT(session,
            page->type != WT_PAGE_COL_INT &&
            page->type != WT_PAGE_ROW_INT);
        cbt->page = page;

        /* Initialize the page's modification information. */
        if (discard) {
            WT_ERR(__wt_page_modify_init(session, page));
            WT_ORDERED_READ(cbt->write_gen, page->modify->write_gen);
        }

        /*
         * The last page in a column-store has appended entries.
         * We handle it separately from the usual cursor code:
         * it's only that one page and it's in a simple format.
         */
        if (page->type != WT_PAGE_ROW_LEAF &&
            (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
            F_SET(cbt, WT_CBT_ITERATE_APPEND);
    }

err:
    if (ret == WT_RESTART)
        goto retry;
    WT_TRET(__cursor_func_resolve(cbt, ret));
    return (ret);
}
/*
 * __wt_btcur_prev --
 *     Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
{
    WT_DECL_RET;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    uint32_t flags;
    bool newpage;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    WT_STAT_FAST_CONN_INCR(session, cursor_prev);
    WT_STAT_FAST_DATA_INCR(session, cursor_prev);

    flags = WT_READ_PREV | WT_READ_SKIP_INTL;    /* Tree walk flags. */
    if (truncating)
        LF_SET(WT_READ_TRUNCATE);

    WT_RET(__cursor_func_init(cbt, false));

    /*
     * If we aren't already iterating in the right direction, there's
     * some setup to do.
     */
    if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
        __wt_btcur_iterate_setup(cbt);

    /*
     * Walk any page we're holding until the underlying call returns
     * not-found. Then, move to the previous page, until we reach the
     * start of the file.
     */
    for (newpage = false;; newpage = true) {
        page = cbt->ref == NULL ? NULL : cbt->ref->page;
        WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page));

        /*
         * The last page in a column-store has appended entries.
         * We handle it separately from the usual cursor code:
         * it's only that one page and it's in a simple format.
         */
        if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF &&
            (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
            F_SET(cbt, WT_CBT_ITERATE_APPEND);

        if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_append_prev(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_append_prev(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret == 0)
                break;
            F_CLR(cbt, WT_CBT_ITERATE_APPEND);
            if (ret != WT_NOTFOUND)
                break;
            newpage = true;
        }
        if (page != NULL) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_prev(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_prev(cbt, newpage);
                break;
            case WT_PAGE_ROW_LEAF:
                ret = __cursor_row_prev(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret != WT_NOTFOUND)
                break;
        }

        /*
         * If we saw a lot of deleted records on this page, or we went
         * all the way through a page and only saw deleted records, try
         * to evict the page when we release it. Otherwise repeatedly
         * deleting from the beginning of a tree can have quadratic
         * performance. Take care not to force eviction of pages that
         * are genuinely empty, in new trees.
         */
        if (page != NULL &&
            (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD ||
            (newpage && cbt->page_deleted_count > 0)))
            __wt_page_evict_soon(page);
        cbt->page_deleted_count = 0;

        WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
        WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
    }

err:
    if (ret != 0)
        WT_TRET(__cursor_reset(cbt));
    return (ret);
}
/*
 * __wt_btcur_remove --
 *     Remove a record from the tree.
 */
int
__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    btree = cbt->btree;
    cursor = &cbt->iface;
    session = (WT_SESSION_IMPL *)cursor->session;

    WT_STAT_FAST_CONN_INCR(session, cursor_remove);
    WT_STAT_FAST_DATA_INCR(session, cursor_remove);
    WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);

    if (btree->type == BTREE_ROW)
        WT_RET(__cursor_size_chk(session, &cursor->key));

retry:
    WT_RET(__cursor_func_init(cbt, 1));

    switch (btree->type) {
    case BTREE_COL_FIX:
    case BTREE_COL_VAR:
        WT_ERR(__cursor_col_search(session, cbt));

        /* Remove the record if it exists. */
        if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) {
            if (!__cursor_fix_implicit(btree, cbt))
                WT_ERR(WT_NOTFOUND);
            /*
             * Creating a record past the end of the tree in a
             * fixed-length column-store implicitly fills the
             * gap with empty records. Return success in that
             * case, the record was deleted successfully.
             *
             * Correct the btree cursor's location: the search
             * will have pointed us at the previous/next item,
             * and that's not correct.
             */
            cbt->recno = cursor->recno;
        } else
            ret = __cursor_col_modify(session, cbt, 1);
        break;
    case BTREE_ROW:
        /* Remove the record if it exists. */
        WT_ERR(__cursor_row_search(session, cbt, 0));
        if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
            WT_ERR(WT_NOTFOUND);

        ret = __cursor_row_modify(session, cbt, 1);
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

err:
    if (ret == WT_RESTART)
        goto retry;
    /*
     * If the cursor is configured to overwrite and the record is not
     * found, that is exactly what we want.
     */
    if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND)
        ret = 0;

    if (ret != 0)
        WT_TRET(__cursor_reset(cbt));
    return (ret);
}
/*
 * __wt_btcur_next --
 *     Move to the next record in the tree.
 */
int
__wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
{
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    uint32_t flags;
    bool newpage;

    cursor = &cbt->iface;
    session = (WT_SESSION_IMPL *)cbt->iface.session;

    WT_STAT_CONN_INCR(session, cursor_next);
    WT_STAT_DATA_INCR(session, cursor_next);

    F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);

    WT_RET(__cursor_func_init(cbt, false));

    /*
     * If we aren't already iterating in the right direction, there's
     * some setup to do.
     */
    if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT))
        __wt_btcur_iterate_setup(cbt);

    /*
     * Walk any page we're holding until the underlying call returns
     * not-found. Then, move to the next page, until we reach the end
     * of the file.
     */
    flags = WT_READ_SKIP_INTL;    /* Tree walk flags. */
    if (truncating)
        LF_SET(WT_READ_TRUNCATE);
    for (newpage = false;; newpage = true) {
        page = cbt->ref == NULL ? NULL : cbt->ref->page;

        if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_append_next(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_append_next(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret == 0)
                break;
            F_CLR(cbt, WT_CBT_ITERATE_APPEND);
            if (ret != WT_NOTFOUND)
                break;
        } else if (page != NULL) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_next(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_next(cbt, newpage);
                break;
            case WT_PAGE_ROW_LEAF:
                ret = __cursor_row_next(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret != WT_NOTFOUND)
                break;

            /*
             * Column-store pages may have appended entries. Handle
             * it separately from the usual cursor code, it's in a
             * simple format.
             */
            if (page->type != WT_PAGE_ROW_LEAF &&
                (cbt->ins_head = WT_COL_APPEND(page)) != NULL) {
                F_SET(cbt, WT_CBT_ITERATE_APPEND);
                continue;
            }
        }

        /*
         * If we saw a lot of deleted records on this page, or we went
         * all the way through a page and only saw deleted records, try
         * to evict the page when we release it. Otherwise repeatedly
         * deleting from the beginning of a tree can have quadratic
         * performance. Take care not to force eviction of pages that
         * are genuinely empty, in new trees.
         */
        if (page != NULL &&
            (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD ||
            (newpage && cbt->page_deleted_count > 0)))
            __wt_page_evict_soon(session, cbt->ref);
        cbt->page_deleted_count = 0;

        WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
        WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
    }

#ifdef HAVE_DIAGNOSTIC
    if (ret == 0)
        WT_ERR(__wt_cursor_key_order_check(session, cbt, true));
#endif

    if (ret == 0)
        F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);

err:
    if (ret != 0)
        WT_TRET(__cursor_reset(cbt));
    return (ret);
}
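/*
 * For context, how this function is reached from the public API -- a
 * minimal full-table scan (the URI is illustrative and the table is
 * assumed to use string key/value formats). WT_CURSOR::next maps to
 * __wt_btcur_next for btree cursors, and WT_NOTFOUND marks the end of
 * the table rather than an error.
 */
#include <stdio.h>
#include <wiredtiger.h>

static int
scan_table(WT_SESSION *session)
{
    WT_CURSOR *cursor;
    const char *key, *value;
    int ret;

    if ((ret = session->open_cursor(
        session, "table:example", NULL, NULL, &cursor)) != 0)
        return (ret);

    while ((ret = cursor->next(cursor)) == 0) {
        if ((ret = cursor->get_key(cursor, &key)) != 0 ||
            (ret = cursor->get_value(cursor, &value)) != 0)
            break;
        printf("%s -> %s\n", key, value);
    }

    (void)cursor->close(cursor);
    return (ret == WT_NOTFOUND ? 0 : ret);
}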
/*
 * __sync_file --
 *     Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
    struct timespec end, start;
    WT_BTREE *btree;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    WT_REF *walk;
    WT_TXN *txn;
    uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
    uint64_t oldest_id, saved_snap_min;
    uint32_t flags;

    conn = S2C(session);
    btree = S2BT(session);
    walk = NULL;
    txn = &session->txn;
    saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min;
    flags = WT_READ_CACHE | WT_READ_NO_GEN;

    internal_bytes = leaf_bytes = 0;
    internal_pages = leaf_pages = 0;
    if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
        WT_RET(__wt_epoch(session, &start));

    switch (syncop) {
    case WT_SYNC_WRITE_LEAVES:
        /*
         * Write all immediately available, dirty in-cache leaf pages.
         *
         * Writing the leaf pages is done without acquiring a high-level
         * lock, serialize so multiple threads don't walk the tree at
         * the same time.
         */
        if (!btree->modified)
            return (0);
        __wt_spin_lock(session, &btree->flush_lock);
        if (!btree->modified) {
            __wt_spin_unlock(session, &btree->flush_lock);
            return (0);
        }

        /*
         * Save the oldest transaction ID we need to keep around.
         * Otherwise, in a busy system, we could be updating pages so
         * fast that write leaves never catches up. We deliberately
         * have no transaction running at this point that would keep
         * the oldest ID from moving forwards as we walk the tree.
         */
        oldest_id = __wt_txn_oldest_id(session);

        flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
        for (walk = NULL;;) {
            WT_ERR(__wt_tree_walk(session, &walk, flags));
            if (walk == NULL)
                break;

            /*
             * Write dirty pages if nobody beat us to it. Don't
             * try to write hot pages (defined as pages that have
             * been updated since the write-leaves phase started):
             * checkpoint will have to visit them anyway.
             */
            page = walk->page;
            if (__wt_page_is_modified(page) &&
                WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
                if (txn->isolation == WT_ISO_READ_COMMITTED)
                    __wt_txn_get_snapshot(session);
                leaf_bytes += page->memory_footprint;
                ++leaf_pages;
                WT_ERR(__wt_reconcile(session, walk, NULL, 0));
            }
        }
        break;
    case WT_SYNC_CHECKPOINT:
        /*
         * If we are flushing a file at read-committed isolation, which
         * is of particular interest for flushing the metadata to make
         * a schema-changing operation durable, get a transactional
         * snapshot now.
         *
         * All changes committed up to this point should be included.
         * We don't update the snapshot in between pages because the
         * metadata shouldn't be that big.
         */
        if (txn->isolation == WT_ISO_READ_COMMITTED)
            __wt_txn_get_snapshot(session);

        /*
         * We cannot check the tree modified flag in the case of a
         * checkpoint, the checkpoint code has already cleared it.
         *
         * Writing the leaf pages is done without acquiring a high-level
         * lock, serialize so multiple threads don't walk the tree at
         * the same time. We're holding the schema lock, but need the
         * lower-level lock as well.
         */
        __wt_spin_lock(session, &btree->flush_lock);

        /*
         * In the final checkpoint pass, child pages cannot be evicted
         * from underneath internal pages nor can underlying blocks be
         * freed until the checkpoint's block lists are stable. Also,
         * we cannot split child pages into parents unless we know the
         * final pass will write a consistent view of that namespace.
         * Set the checkpointing flag to block such actions and wait
         * for any problematic eviction or page splits to complete.
         */
        WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE);

        WT_ERR(__wt_evict_file_exclusive_on(session));
        __wt_evict_file_exclusive_off(session);

        WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING);

        /* Write all dirty in-cache pages. */
        flags |= WT_READ_NO_EVICT;
        for (walk = NULL;;) {
            WT_ERR(__wt_tree_walk(session, &walk, flags));
            if (walk == NULL)
                break;

            /* Skip clean pages. */
            if (!__wt_page_is_modified(walk->page))
                continue;

            /*
             * Take a local reference to the page modify structure
             * now that we know the page is dirty. It needs to be
             * done in this order otherwise the page modify
             * structure could have been created between taking the
             * reference and checking modified.
             */
            page = walk->page;
            mod = page->modify;

            /*
             * Write dirty pages, unless we can be sure they only
             * became dirty after the checkpoint started.
             *
             * We can skip dirty pages if:
             * (1) they are leaf pages;
             * (2) there is a snapshot transaction active (which
             *     is the case in ordinary application checkpoints
             *     but not all internal cases); and
             * (3) the first dirty update on the page is
             *     sufficiently recent that the checkpoint
             *     transaction would skip them.
             *
             * Mark the tree dirty: the checkpoint marked it clean
             * and we can't skip future checkpoints until this page
             * is written.
             */
            if (!WT_PAGE_IS_INTERNAL(page) &&
                F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
                WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
                __wt_page_modify_set(session, page);
                continue;
            }

            if (WT_PAGE_IS_INTERNAL(page)) {
                internal_bytes += page->memory_footprint;
                ++internal_pages;
            } else {
                leaf_bytes += page->memory_footprint;
                ++leaf_pages;
            }
            WT_ERR(__wt_reconcile(session, walk, NULL, 0));
        }
        break;
    case WT_SYNC_CLOSE:
    case WT_SYNC_DISCARD:
    WT_ILLEGAL_VALUE_ERR(session);
    }

    if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
        WT_ERR(__wt_epoch(session, &end));
        WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
            "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
            " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
            " bytes, %" PRIu64 " pages of internal\n\t"
            "Took: %" PRIu64 "ms",
            syncop == WT_SYNC_WRITE_LEAVES ?
            "WRITE_LEAVES" : "CHECKPOINT",
            leaf_bytes, leaf_pages, internal_bytes, internal_pages,
            WT_TIMEDIFF_MS(end, start)));
    }

err:
    /* On error, clear any left-over tree walk. */
    if (walk != NULL)
        WT_TRET(__wt_page_release(session, walk, flags));

    /*
     * If we got a snapshot in order to write pages, and there was no
     * snapshot active when we started, release it.
     */
    if (txn->isolation == WT_ISO_READ_COMMITTED &&
        saved_snap_min == WT_TXN_NONE)
        __wt_txn_release_snapshot(session);

    if (btree->checkpointing != WT_CKPT_OFF) {
        /*
         * Update the checkpoint generation for this handle so visible
         * updates newer than the checkpoint can be evicted.
         *
         * This has to be published before eviction is enabled again,
         * so that eviction knows that the checkpoint has completed.
         */
        WT_PUBLISH(btree->checkpoint_gen,
            conn->txn_global.checkpoint_gen);
        WT_STAT_FAST_DATA_SET(session,
            btree_checkpoint_generation, btree->checkpoint_gen);

        /*
         * Clear the checkpoint flag and push the change; not required,
         * but publishing the change means stalled eviction gets moving
         * as soon as possible.
         */
        btree->checkpointing = WT_CKPT_OFF;
        WT_FULL_BARRIER();

        /*
         * If this tree was being skipped by the eviction server during
         * the checkpoint, clear the wait.
         */
        btree->evict_walk_period = 0;

        /*
         * Wake the eviction server, in case application threads have
         * stalled while the eviction server decided it couldn't make
         * progress. Without this, application threads will be stalled
         * until the eviction server next wakes.
         */
        WT_TRET(__wt_evict_server_wake(session));
    }

    __wt_spin_unlock(session, &btree->flush_lock);

    /*
     * Leaves are written before a checkpoint (or as part of a file close,
     * before checkpointing the file). Start a flush to stable storage,
     * but don't wait for it.
     */
    if (ret == 0 &&
        syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC))
        WT_RET(btree->bm->sync(btree->bm, session, true));

    return (ret);
}
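/*
 * For reference, the transaction-ID comparisons above (skipping hot pages
 * in the write-leaves pass, and skipping pages dirtied after the
 * checkpoint's snapshot) rest on WiredTiger allocating transaction IDs
 * monotonically; the comparison macro is, to the best of my reading, a
 * plain unsigned compare:
 *
 *     #define WT_TXNID_LT(t1, t2)    ((t1) < (t2))
 *
 * so WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn) asks whether the
 * page's first dirtying update began after every transaction visible to
 * the checkpoint, in which case the checkpoint can safely skip the page.
 */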
/*
 * __wt_btcur_next --
 *     Move the btree cursor to the next record.
 */
int
__wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating)
{
    WT_DECL_RET;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    uint32_t flags;
    int newpage;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    WT_STAT_FAST_CONN_INCR(session, cursor_next);
    WT_STAT_FAST_DATA_INCR(session, cursor_next);

    /* Tree walk flags. */
    flags = WT_READ_SKIP_INTL;
    if (truncating)
        LF_SET(WT_READ_TRUNCATE);

    /* Activate the btree cursor. */
    WT_RET(__cursor_func_init(cbt, 0));

    /* Set the cursor up for forward iteration if it isn't already. */
    if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT))
        __wt_btcur_iterate_setup(cbt, 1);

    /* Walk the btree. */
    for (newpage = 0;; newpage = 1) {
        page = cbt->ref == NULL ? NULL : cbt->ref->page;
        WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page));

        /* Column-store append: scan the insert (append) list. */
        if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_append_next(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_append_next(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret == 0)
                break;

            /* Clear the column-store append flag. */
            F_CLR(cbt, WT_CBT_ITERATE_APPEND);
            if (ret != WT_NOTFOUND)
                break;
        } else if (page != NULL) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_next(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_next(cbt, newpage);
                break;
            case WT_PAGE_ROW_LEAF:
                ret = __cursor_row_next(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }

            /* Found a record, or hit a real error: we're done. */
            if (ret != WT_NOTFOUND)
                break;

            /*
             * For column stores, check whether the page's append
             * (insert header) list still needs to be scanned.
             */
            if (page->type != WT_PAGE_ROW_LEAF &&
                (cbt->ins_head = WT_COL_APPEND(page)) != NULL) {
                F_SET(cbt, WT_CBT_ITERATE_APPEND);
                continue;
            }
        }

        /*
         * Too many deleted records were seen on this page: urge
         * eviction of the page so it is rewritten more densely.
         */
        if (page != NULL &&
            (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD ||
            (newpage && cbt->page_deleted_count > 0)))
            __wt_page_evict_soon(page);
        cbt->page_deleted_count = 0;

        /* Move the cursor on to the next page. */
        WT_ERR(__wt_tree_walk(session, &cbt->ref, NULL, flags));
        WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
    }

err:
    /* On failure, reset the cursor's state. */
    if (ret != 0)
        WT_TRET(__cursor_reset(cbt));
    return (ret);
}
/*
 * __wt_btcur_next --
 *     Move to the next record in the tree.
 */
int
__wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating)
{
    WT_DECL_RET;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    uint32_t flags;
    int skipped, newpage;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    WT_STAT_FAST_CONN_INCR(session, cursor_next);
    WT_STAT_FAST_DATA_INCR(session, cursor_next);

    flags = WT_READ_SKIP_INTL;    /* Tree walk flags. */
    if (truncating)
        LF_SET(WT_READ_TRUNCATE);

    WT_RET(__cursor_func_init(cbt, 0));

    /*
     * If we aren't already iterating in the right direction, there's
     * some setup to do.
     */
    if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT))
        __wt_btcur_iterate_setup(cbt, 1);

    /*
     * Walk any page we're holding until the underlying call returns
     * not-found. Then, move to the next page, until we reach the end
     * of the file.
     */
    for (skipped = newpage = 0;; skipped = 0, newpage = 1) {
        page = cbt->ref == NULL ? NULL : cbt->ref->page;
        WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page));

        if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_append_next(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_append_next(
                    cbt, newpage, &skipped);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret == 0)
                break;
            F_CLR(cbt, WT_CBT_ITERATE_APPEND);
            if (ret != WT_NOTFOUND)
                break;
        } else if (page != NULL) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_next(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_next(cbt, newpage, &skipped);
                break;
            case WT_PAGE_ROW_LEAF:
                ret = __cursor_row_next(cbt, newpage, &skipped);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret != WT_NOTFOUND)
                break;

            /*
             * The last page in a column-store has appended entries.
             * We handle it separately from the usual cursor code:
             * it's only that one page and it's in a simple format.
             */
            if (page->type != WT_PAGE_ROW_LEAF &&
                (cbt->ins_head = WT_COL_APPEND(page)) != NULL) {
                F_SET(cbt, WT_CBT_ITERATE_APPEND);
                continue;
            }
        }

        /*
         * If we scanned all the way through a page and only saw
         * deleted records, try to evict the page as we release it.
         * Otherwise repeatedly deleting from the beginning of a tree
         * can have quadratic performance.
         */
        if (newpage && skipped)
            page->read_gen = WT_READGEN_OLDEST;

        WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
        WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
    }

err:
    if (ret != 0)
        WT_TRET(__cursor_reset(cbt));
    return (ret);
}
/*
 * __wt_txn_checkpoint_log --
 *     Write a log record for a checkpoint operation.
 */
int
__wt_txn_checkpoint_log(
    WT_SESSION_IMPL *session, int full, uint32_t flags, WT_LSN *lsnp)
{
    WT_DECL_ITEM(logrec);
    WT_DECL_RET;
    WT_ITEM *ckpt_snapshot, empty;
    WT_LSN *ckpt_lsn;
    WT_TXN *txn;
    uint8_t *end, *p;
    size_t recsize;
    uint32_t i, rectype = WT_LOGREC_CHECKPOINT;
    const char *fmt = WT_UNCHECKED_STRING(IIQIU);

    txn = &session->txn;
    ckpt_lsn = &txn->ckpt_lsn;

    /*
     * If this is a file sync, log it unless there is a full checkpoint
     * in progress.
     */
    if (!full) {
        if (txn->full_ckpt) {
            if (lsnp != NULL)
                *lsnp = *ckpt_lsn;
            return (0);
        }
        return (__txn_log_file_sync(session, flags, lsnp));
    }

    switch (flags) {
    case WT_TXN_LOG_CKPT_PREPARE:
        txn->full_ckpt = 1;
        WT_ERR(__wt_log_ckpt_lsn(session, ckpt_lsn));
        /*
         * We need to make sure that the log records in the checkpoint
         * LSN are on disk. In particular, to make sure that the
         * current log file exists.
         */
        WT_ERR(__wt_log_force_sync(session, ckpt_lsn));
        break;
    case WT_TXN_LOG_CKPT_START:
        /* Take a copy of the transaction snapshot. */
        txn->ckpt_nsnapshot = txn->snapshot_count;
        recsize = txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
        WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
        p = txn->ckpt_snapshot->mem;
        end = p + recsize;
        for (i = 0; i < txn->snapshot_count; i++)
            WT_ERR(__wt_vpack_uint(
                &p, WT_PTRDIFF(end, p), txn->snapshot[i]));
        break;
    case WT_TXN_LOG_CKPT_STOP:
        /*
         * During a clean connection close, we get here without the
         * prepare or start steps. In that case, log the current LSN
         * as the checkpoint LSN.
         */
        if (!txn->full_ckpt) {
            txn->ckpt_nsnapshot = 0;
            WT_CLEAR(empty);
            ckpt_snapshot = &empty;
            WT_ERR(__wt_log_ckpt_lsn(session, ckpt_lsn));
        } else
            ckpt_snapshot = txn->ckpt_snapshot;

        /* Write the checkpoint log record. */
        WT_ERR(__wt_struct_size(session, &recsize, fmt,
            rectype, ckpt_lsn->file, ckpt_lsn->offset,
            txn->ckpt_nsnapshot, ckpt_snapshot));
        WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
        WT_ERR(__wt_struct_pack(session,
            (uint8_t *)logrec->data + logrec->size, recsize, fmt,
            rectype, ckpt_lsn->file, ckpt_lsn->offset,
            txn->ckpt_nsnapshot, ckpt_snapshot));
        logrec->size += (uint32_t)recsize;
        WT_ERR(__wt_log_write(session, logrec, lsnp,
            F_ISSET(S2C(session), WT_CONN_CKPT_SYNC) ?
            WT_LOG_FSYNC : 0));

        /*
         * If this full checkpoint completed successfully and there is
         * no hot backup in progress, tell the logging subsystem the
         * checkpoint LSN so that it can archive.
         */
        if (!S2C(session)->hot_backup)
            WT_ERR(__wt_log_ckpt(session, ckpt_lsn));

        /* FALLTHROUGH */
    case WT_TXN_LOG_CKPT_CLEANUP:
        /* Cleanup any allocated resources. */
        WT_INIT_LSN(ckpt_lsn);
        txn->ckpt_nsnapshot = 0;
        __wt_scr_free(session, &txn->ckpt_snapshot);
        txn->full_ckpt = 0;
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

err:
    __wt_logrec_free(session, &logrec);
    return (ret);
}
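/*
 * A note on the snapshot-buffer sizing above: the WT_TXN_LOG_CKPT_START
 * case reserves WT_INTPACK64_MAXSIZE bytes per snapshot entry, the
 * worst-case size of WiredTiger's variable-length integer packing (a
 * 64-bit value packs into at most sizeof(int64_t) + 2 == 10 bytes, if my
 * reading of the intpack code is right). Small transaction IDs pack into
 * fewer bytes, which is why __wt_vpack_uint advances p by a variable
 * amount while the buffer is sized for the maximum.
 */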
/*
 * __wt_btcur_insert --
 *     Insert a record into the tree.
 */
int
__wt_btcur_insert(WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    btree = cbt->btree;
    cursor = &cbt->iface;
    session = (WT_SESSION_IMPL *)cursor->session;

    WT_BSTAT_INCR(session, cursor_inserts);

    if (btree->type == BTREE_ROW)
        WT_RET(__cursor_size_chk(session, &cursor->key));
    WT_RET(__cursor_size_chk(session, &cursor->value));

retry:
    __cursor_func_init(cbt, 1);

    switch (btree->type) {
    case BTREE_COL_FIX:
    case BTREE_COL_VAR:
        /*
         * If WT_CURSTD_APPEND is set, insert a new record (ignoring
         * the application's record number). First we search for the
         * maximum possible record number so the search ends on the
         * last page. The real record number is assigned by the
         * serialized append operation, __wt_col_append_serial_func.
         */
        if (F_ISSET(cursor, WT_CURSTD_APPEND))
            cbt->iface.recno = UINT64_MAX;
        WT_ERR(__wt_col_search(session, cbt, 1));
        if (F_ISSET(cursor, WT_CURSTD_APPEND))
            cbt->iface.recno = 0;

        /*
         * If WT_CURSTD_OVERWRITE is set, insert/update the key/value
         * pair.
         *
         * If WT_CURSTD_OVERWRITE is not set, fail if the key exists,
         * else insert the key/value pair. Creating a record past the
         * end of the tree in a fixed-length column-store implicitly
         * fills the gap with empty records. Fail in that case, the
         * record exists.
         */
        if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
            ((cbt->compare == 0 && !__cursor_invalid(cbt)) ||
            (cbt->compare != 0 &&
            __cursor_fix_implicit(btree, cbt)))) {
            ret = WT_DUPLICATE_KEY;
            break;
        }

        if ((ret = __wt_col_modify(session, cbt, 3)) == WT_RESTART)
            goto retry;
        if (F_ISSET(cursor, WT_CURSTD_APPEND) && ret == 0)
            cbt->iface.recno = cbt->recno;
        break;
    case BTREE_ROW:
        /*
         * If WT_CURSTD_OVERWRITE is not set, fail if the key exists,
         * else insert the key/value pair.
         *
         * If WT_CURSTD_OVERWRITE is set, insert/update the key/value
         * pair.
         */
        WT_ERR(__wt_row_search(session, cbt, 1));
        if (cbt->compare == 0 && !__cursor_invalid(cbt) &&
            !F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
            ret = WT_DUPLICATE_KEY;
            break;
        }

        if ((ret = __wt_row_modify(session, cbt, 0)) == WT_RESTART)
            goto retry;
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

err:
    __cursor_func_resolve(cbt, ret);
    return (ret);
}
/*
 * __wt_btcur_prev --
 *     Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating)
{
    WT_DECL_RET;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    uint32_t flags;
    int newpage;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    WT_STAT_FAST_CONN_INCR(session, cursor_prev);
    WT_STAT_FAST_DATA_INCR(session, cursor_prev);

    flags = WT_READ_PREV | WT_READ_SKIP_INTL;    /* Tree walk flags. */
    if (truncating)
        LF_SET(WT_READ_TRUNCATE);

    WT_RET(__cursor_func_init(cbt, 0));

    /*
     * If we aren't already iterating in the right direction, there's
     * some setup to do.
     */
    if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
        __wt_btcur_iterate_setup(cbt, 0);

    /*
     * Walk any page we're holding until the underlying call returns
     * not-found. Then, move to the previous page, until we reach the
     * start of the file.
     */
    page = cbt->ref == NULL ? NULL : cbt->ref->page;
    for (newpage = 0;; newpage = 1) {
        if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_append_prev(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_append_prev(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret == 0)
                break;
            F_CLR(cbt, WT_CBT_ITERATE_APPEND);
            if (ret != WT_NOTFOUND)
                break;
            newpage = 1;
        }
        if (page != NULL) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_prev(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_prev(cbt, newpage);
                break;
            case WT_PAGE_ROW_LEAF:
                ret = __cursor_row_prev(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret != WT_NOTFOUND)
                break;
        }

        WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
        WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
        page = cbt->ref->page;
        WT_ASSERT(session,
            page->type != WT_PAGE_COL_INT &&
            page->type != WT_PAGE_ROW_INT);

        /*
         * The last page in a column-store has appended entries.
         * We handle it separately from the usual cursor code:
         * it's only that one page and it's in a simple format.
         */
        if (page->type != WT_PAGE_ROW_LEAF &&
            (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
            F_SET(cbt, WT_CBT_ITERATE_APPEND);
    }

err:
    if (ret != 0)
        WT_TRET(__cursor_error_resolve(cbt));
    return (ret);
}
/*
 * __wt_evict_file --
 *     Discard pages for a specific file.
 */
int
__wt_evict_file(WT_SESSION_IMPL *session, int syncop)
{
    WT_DECL_RET;
    WT_PAGE *page;
    WT_REF *next_ref, *ref;
    bool evict_reset;

    /*
     * We need exclusive access to the file -- disable ordinary eviction
     * and drain any blocks already queued.
     */
    WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset));

    /* Make sure the oldest transaction ID is up-to-date. */
    __wt_txn_update_oldest(session, true);

    /* Walk the tree, discarding pages. */
    next_ref = NULL;
    WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
        WT_READ_CACHE | WT_READ_NO_EVICT));
    while ((ref = next_ref) != NULL) {
        page = ref->page;

        /*
         * Eviction can fail when a page in the evicted page's subtree
         * switches state. For example, if we don't evict a page marked
         * empty, because we expect it to be merged into its parent, it
         * might no longer be empty after it's reconciled, in which case
         * eviction of its parent would fail. We can either walk the
         * tree multiple times (until it's finally empty), or reconcile
         * each page to get it to its final state before considering if
         * it's an eviction target or will be merged into its parent.
         *
         * Don't limit this test to any particular page type, that tends
         * to introduce bugs when the reconciliation of other page types
         * changes, and there's no advantage to doing so.
         *
         * Eviction can also fail because an update cannot be written.
         * If sessions have disjoint sets of files open, updates in a
         * no-longer-referenced file may not yet be globally visible,
         * and the write will fail with EBUSY. Our caller handles that
         * error, retrying later.
         */
        if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
            WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING));

        /*
         * We can't evict the page just returned to us (it marks our
         * place in the tree), so move the walk to one page ahead of
         * the page being evicted. Note, we reconciled the returned
         * page first: if reconciliation of that page were to change
         * the shape of the tree, and we did the next walk call before
         * the reconciliation, the next walk call could miss a page in
         * the tree.
         */
        WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
            WT_READ_CACHE | WT_READ_NO_EVICT));

        switch (syncop) {
        case WT_SYNC_CLOSE:
            /* Evict the page. */
            WT_ERR(__wt_evict(session, ref, 1));
            break;
        case WT_SYNC_DISCARD:
            /*
             * Dead handles may reference dirty pages; clean the
             * page, both to keep statistics correct, and to let
             * the page-discard function assert no dirty page is
             * ever discarded.
             */
            if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
                __wt_page_modify_clear(session, page);

            WT_ASSERT(session,
                F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
                __wt_page_can_evict(session, ref, false, NULL));
            __wt_evict_page_clean_update(session, ref, 1);
            break;
        WT_ILLEGAL_VALUE_ERR(session);
        }
    }

    if (0) {
err:        /* On error, clear any left-over tree walk. */
        if (next_ref != NULL)
            WT_TRET(__wt_page_release(
                session, next_ref, WT_READ_NO_EVICT));
    }

    if (evict_reset)
        __wt_evict_file_exclusive_off(session);

    return (ret);
}
/*
 * __wt_btcur_prev --
 *     Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    int newpage;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    WT_BSTAT_INCR(session, cursor_read_prev);

    __cursor_func_init(cbt, 0);

    /*
     * If we aren't already iterating in the right direction, there's
     * some setup to do.
     */
    if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
        __wt_btcur_iterate_setup(cbt, 0);

    /*
     * Walk any page we're holding until the underlying call returns
     * not-found. Then, move to the previous page, until we reach the
     * start of the file.
     */
    for (newpage = 0;; newpage = 1) {
        if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
            switch (cbt->page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_append_prev(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_append_prev(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret == 0)
                break;
            F_CLR(cbt, WT_CBT_ITERATE_APPEND);
            if (ret != WT_NOTFOUND)
                break;
            newpage = 1;
        }
        if (cbt->page != NULL) {
            switch (cbt->page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_prev(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_prev(cbt, newpage);
                break;
            case WT_PAGE_ROW_LEAF:
                ret = __cursor_row_prev(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret != WT_NOTFOUND)
                break;
        }

        do {
            WT_ERR(__wt_tree_np(session, &cbt->page, 0, 0));
            WT_ERR_TEST(cbt->page == NULL, WT_NOTFOUND);
        } while (
            cbt->page->type == WT_PAGE_COL_INT ||
            cbt->page->type == WT_PAGE_ROW_INT);

        /*
         * The last page in a column-store has appended entries.
         * We handle it separately from the usual cursor code:
         * it's only that one page and it's in a simple format.
         */
        if (cbt->page->type != WT_PAGE_ROW_LEAF &&
            (cbt->ins_head = WT_COL_APPEND(cbt->page)) != NULL)
            F_SET(cbt, WT_CBT_ITERATE_APPEND);
    }

err:
    __cursor_func_resolve(cbt, ret);
    return (ret);
}
/*
 * __las_page_instantiate --
 *     Instantiate lookaside update records in a recently read page.
 */
static int
__las_page_instantiate(WT_SESSION_IMPL *session,
    WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size)
{
    WT_CURSOR *cursor;
    WT_CURSOR_BTREE cbt;
    WT_DECL_ITEM(current_key);
    WT_DECL_ITEM(las_addr);
    WT_DECL_ITEM(las_key);
    WT_DECL_ITEM(las_value);
    WT_DECL_RET;
    WT_PAGE *page;
    WT_UPDATE *first_upd, *last_upd, *upd;
    size_t incr, total_incr;
    uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid;
    uint32_t las_id, upd_size, session_flags;
    int exact;
    const uint8_t *p;

    cursor = NULL;
    page = ref->page;
    first_upd = last_upd = upd = NULL;
    total_incr = 0;
    current_recno = recno = WT_RECNO_OOB;
    session_flags = 0;    /* [-Werror=maybe-uninitialized] */

    __wt_btcur_init(session, &cbt);
    __wt_btcur_open(&cbt);

    WT_ERR(__wt_scr_alloc(session, 0, &current_key));
    WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
    WT_ERR(__wt_scr_alloc(session, 0, &las_key));
    WT_ERR(__wt_scr_alloc(session, 0, &las_value));

    /* Open a lookaside table cursor. */
    WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

    /*
     * The lookaside records are in key and update order, that is, there
     * will be a set of in-order updates for a key, then another set of
     * in-order updates for a subsequent key. We process all of the
     * updates for a key and then insert those updates into the page,
     * then all the updates for the next key, and so on.
     *
     * Search for the block's unique prefix, stepping through any matching
     * records.
     */
    las_addr->data = addr;
    las_addr->size = addr_size;
    las_key->size = 0;
    cursor->set_key(
        cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
    if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
        ret = cursor->next(cursor);
    for (; ret == 0; ret = cursor->next(cursor)) {
        WT_ERR(cursor->get_key(cursor,
            &las_id, las_addr, &las_counter, &las_txnid, las_key));

        /*
         * Confirm the search using the unique prefix; if not a match,
         * we're done searching for records for this page.
         */
        if (las_id != read_id ||
            las_addr->size != addr_size ||
            memcmp(las_addr->data, addr, addr_size) != 0)
            break;

        /*
         * If the on-page value has become globally visible, this
         * record is no longer needed.
         */
        if (__wt_txn_visible_all(session, las_txnid))
            continue;

        /* Allocate the WT_UPDATE structure. */
        WT_ERR(cursor->get_value(
            cursor, &upd_txnid, &upd_size, las_value));
        WT_ERR(__wt_update_alloc(session,
            (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value,
            &upd, &incr));
        total_incr += incr;
        upd->txnid = upd_txnid;

        switch (page->type) {
        case WT_PAGE_COL_FIX:
        case WT_PAGE_COL_VAR:
            p = las_key->data;
            WT_ERR(__wt_vunpack_uint(&p, 0, &recno));
            if (current_recno == recno)
                break;
            WT_ASSERT(session, current_recno < recno);

            if (first_upd != NULL) {
                WT_ERR(__col_instantiate(session,
                    current_recno, ref, &cbt, first_upd));
                first_upd = NULL;
            }
            current_recno = recno;
            break;
        case WT_PAGE_ROW_LEAF:
            if (current_key->size == las_key->size &&
                memcmp(current_key->data,
                las_key->data, las_key->size) == 0)
                break;

            if (first_upd != NULL) {
                WT_ERR(__row_instantiate(session,
                    current_key, ref, &cbt, first_upd));
                first_upd = NULL;
            }
            WT_ERR(__wt_buf_set(session,
                current_key, las_key->data, las_key->size));
            break;
        WT_ILLEGAL_VALUE_ERR(session);
        }

        /* Append the latest update to the list. */
        if (first_upd == NULL)
            first_upd = last_upd = upd;
        else {
            last_upd->next = upd;
            last_upd = upd;
        }
        upd = NULL;
    }
    WT_ERR_NOTFOUND_OK(ret);

    /* Insert the last set of updates, if any. */
    if (first_upd != NULL)
        switch (page->type) {
        case WT_PAGE_COL_FIX:
        case WT_PAGE_COL_VAR:
            WT_ERR(__col_instantiate(session,
                current_recno, ref, &cbt, first_upd));
            first_upd = NULL;
            break;
        case WT_PAGE_ROW_LEAF:
            WT_ERR(__row_instantiate(session,
                current_key, ref, &cbt, first_upd));
            first_upd = NULL;
            break;
        WT_ILLEGAL_VALUE_ERR(session);
        }

    /* Discard the cursor. */
    WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags));

    if (total_incr != 0) {
        __wt_cache_page_inmem_incr(session, page, total_incr);

        /*
         * We've modified/dirtied the page, but that's not necessary
         * and if we keep the page clean, it's easier to evict. We
         * leave the lookaside table updates in place, so if we evict
         * this page without dirtying it, any future instantiation of
         * it will find the records it needs. If the page is dirtied
         * before eviction, then we'll write any needed lookaside table
         * records for the new location of the page.
         */
        __wt_page_modify_clear(session, page);
    }

err:
    WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
    WT_TRET(__wt_btcur_close(&cbt, 1));

    /*
     * On error, upd points to a single unlinked WT_UPDATE structure,
     * first_upd points to a list.
     */
    if (upd != NULL)
        __wt_free(session, upd);
    if (first_upd != NULL)
        __wt_free_update_list(session, first_upd);

    __wt_scr_free(session, &current_key);
    __wt_scr_free(session, &las_addr);
    __wt_scr_free(session, &las_key);
    __wt_scr_free(session, &las_value);

    return (ret);
}
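/*
 * For orientation, the lookaside cursor's key as used above has five
 * columns, matching the set_key/get_key calls in this function:
 * (file ID, block address, counter, transaction ID, data-store key).
 * The leading (file ID, block address) pair is the "unique prefix" the
 * loop checks to decide when it has run past the records belonging to
 * this page, and the counter preserves update order within a single key.
 */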
/*
 * __wt_page_inmem --
 *     Build in-memory page information.
 */
int
__wt_page_inmem(
    WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *parent_ref,
    WT_PAGE_HEADER *dsk, int disk_not_alloc, WT_PAGE **pagep)
{
    WT_DECL_RET;
    WT_PAGE *page;
    uint32_t alloc_entries;
    size_t size;

    alloc_entries = 0;
    *pagep = NULL;

    /*
     * Figure out how many underlying objects the page references so
     * we can allocate them along with the page.
     */
    switch (dsk->type) {
    case WT_PAGE_COL_FIX:
        break;
    case WT_PAGE_COL_INT:
        /*
         * Column-store internal page entries map one-to-one to the
         * number of physical entries on the page (each physical entry
         * is an offset object).
         */
        alloc_entries = dsk->u.entries;
        break;
    case WT_PAGE_COL_VAR:
        /*
         * Column-store leaf page entries map one-to-one to the number
         * of physical entries on the page (each physical entry is a
         * data item).
         */
        alloc_entries = dsk->u.entries;
        break;
    case WT_PAGE_ROW_INT:
        /*
         * Row-store internal page entries map one-to-two to the number
         * of physical entries on the page (each in-memory entry is a
         * key item and location cookie).
         */
        alloc_entries = dsk->u.entries / 2;
        break;
    case WT_PAGE_ROW_LEAF:
        /*
         * Row-store leaf page entries map in an indeterminate way to
         * the physical entries on the page, we have to walk the page
         * to figure it out.
         */
        WT_RET(__inmem_row_leaf_entries(session, dsk, &alloc_entries));
        break;
    WT_ILLEGAL_VALUE(session);
    }

    /* Allocate and initialize a new WT_PAGE. */
    WT_RET(__wt_page_alloc(session, dsk->type, alloc_entries, &page));
    page->dsk = dsk;
    page->read_gen = WT_READ_GEN_NOTSET;
    if (disk_not_alloc)
        F_SET_ATOMIC(page, WT_PAGE_DISK_NOT_ALLOC);

    /*
     * Track the memory allocated to build this page so we can update
     * the cache statistics in a single call.
     */
    size = disk_not_alloc ? 0 : dsk->mem_size;

    switch (page->type) {
    case WT_PAGE_COL_FIX:
        page->entries = dsk->u.entries;
        page->u.col_fix.recno = dsk->recno;
        __inmem_col_fix(session, page);
        break;
    case WT_PAGE_COL_INT:
        page->entries = dsk->u.entries;
        page->u.intl.recno = dsk->recno;
        __inmem_col_int(session, page);
        break;
    case WT_PAGE_COL_VAR:
        page->entries = dsk->u.entries;
        page->u.col_var.recno = dsk->recno;
        WT_ERR(__inmem_col_var(session, page, &size));
        break;
    case WT_PAGE_ROW_INT:
        page->entries = dsk->u.entries / 2;
        WT_ERR(__inmem_row_int(session, page, &size));
        break;
    case WT_PAGE_ROW_LEAF:
        page->entries = alloc_entries;
        WT_ERR(__inmem_row_leaf(session, page));
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

    /* Update the page's in-memory size and the cache statistics. */
    __wt_cache_page_inmem_incr(session, page, size);

    /* Link the new page into the parent. */
    if (parent_ref != NULL)
        WT_LINK_PAGE(parent, parent_ref, page);

    *pagep = page;
    return (0);

err:
    __wt_page_out(session, &page);
    return (ret);
}
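/*
 * A worked example of the entry arithmetic above: a row-store internal
 * disk image stores its cells as alternating key and location-cookie
 * items, so a page with dsk->u.entries == 200 physical cells builds 100
 * in-memory entries (200 / 2); a column-store internal or variable-length
 * leaf image with 200 cells builds 200 entries, one per cell; and a
 * fixed-length column-store page allocates no per-entry objects at all,
 * which is why its case falls through with alloc_entries left at 0.
 */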
/*
 * __txn_op_apply --
 *     Apply a transactional operation during recovery.
 */
static int
__txn_op_apply(
    WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
{
    WT_CURSOR *cursor, *start, *stop;
    WT_DECL_RET;
    WT_ITEM key, start_key, stop_key, value;
    WT_SESSION_IMPL *session;
    uint64_t recno, start_recno, stop_recno;
    uint32_t fileid, mode, optype, opsize;

    session = r->session;
    cursor = NULL;

    /* Peek at the size and the type. */
    WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize));
    end = *pp + opsize;

    switch (optype) {
    case WT_LOGOP_COL_MODIFY:
        WT_ERR(__wt_logop_col_modify_unpack(session, pp, end,
            &fileid, &recno, &value));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
        cursor->set_key(cursor, recno);
        if ((ret = cursor->search(cursor)) != 0)
            WT_ERR_NOTFOUND_OK(ret);
        else {
            /*
             * Build/insert a complete value during recovery rather
             * than using cursor modify to create a partial update
             * (for no particular reason other than simplicity).
             */
            WT_ERR(__wt_modify_apply(session, cursor, value.data));
            WT_ERR(cursor->insert(cursor));
        }
        break;
    case WT_LOGOP_COL_PUT:
        WT_ERR(__wt_logop_col_put_unpack(session, pp, end,
            &fileid, &recno, &value));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
        cursor->set_key(cursor, recno);
        __wt_cursor_set_raw_value(cursor, &value);
        WT_ERR(cursor->insert(cursor));
        break;
    case WT_LOGOP_COL_REMOVE:
        WT_ERR(__wt_logop_col_remove_unpack(session, pp, end,
            &fileid, &recno));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
        cursor->set_key(cursor, recno);
        WT_ERR(cursor->remove(cursor));
        break;
    case WT_LOGOP_COL_TRUNCATE:
        WT_ERR(__wt_logop_col_truncate_unpack(session, pp, end,
            &fileid, &start_recno, &stop_recno));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);

        /* Set up the cursors. */
        if (start_recno == WT_RECNO_OOB) {
            start = NULL;
            stop = cursor;
        } else if (stop_recno == WT_RECNO_OOB) {
            start = cursor;
            stop = NULL;
        } else {
            start = cursor;
            WT_ERR(__recovery_cursor(
                session, r, lsnp, fileid, true, &stop));
        }

        /* Set the keys. */
        if (start != NULL)
            start->set_key(start, start_recno);
        if (stop != NULL)
            stop->set_key(stop, stop_recno);

        WT_TRET(session->iface.truncate(
            &session->iface, NULL, start, stop, NULL));
        /* If we opened a duplicate cursor, close it now. */
        if (stop != NULL && stop != cursor)
            WT_TRET(stop->close(stop));
        WT_ERR(ret);
        break;
    case WT_LOGOP_ROW_MODIFY:
        WT_ERR(__wt_logop_row_modify_unpack(session, pp, end,
            &fileid, &key, &value));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
        __wt_cursor_set_raw_key(cursor, &key);
        if ((ret = cursor->search(cursor)) != 0)
            WT_ERR_NOTFOUND_OK(ret);
        else {
            /*
             * Build/insert a complete value during recovery rather
             * than using cursor modify to create a partial update
             * (for no particular reason other than simplicity).
             */
            WT_ERR(__wt_modify_apply(session, cursor, value.data));
            WT_ERR(cursor->insert(cursor));
        }
        break;
    case WT_LOGOP_ROW_PUT:
        WT_ERR(__wt_logop_row_put_unpack(session, pp, end,
            &fileid, &key, &value));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
        __wt_cursor_set_raw_key(cursor, &key);
        __wt_cursor_set_raw_value(cursor, &value);
        WT_ERR(cursor->insert(cursor));
        break;
    case WT_LOGOP_ROW_REMOVE:
        WT_ERR(__wt_logop_row_remove_unpack(session, pp, end,
            &fileid, &key));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
        __wt_cursor_set_raw_key(cursor, &key);
        WT_ERR(cursor->remove(cursor));
        break;
    case WT_LOGOP_ROW_TRUNCATE:
        WT_ERR(__wt_logop_row_truncate_unpack(session, pp, end,
            &fileid, &start_key, &stop_key, &mode));
        GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);

        /* Set up the cursors. */
        start = stop = NULL;
        switch (mode) {
        case WT_TXN_TRUNC_ALL:
            /* Both cursors stay NULL. */
            break;
        case WT_TXN_TRUNC_BOTH:
            start = cursor;
            WT_ERR(__recovery_cursor(
                session, r, lsnp, fileid, true, &stop));
            break;
        case WT_TXN_TRUNC_START:
            start = cursor;
            break;
        case WT_TXN_TRUNC_STOP:
            stop = cursor;
            break;
        WT_ILLEGAL_VALUE_ERR(session, mode);
        }

        /* Set the keys. */
        if (start != NULL)
            __wt_cursor_set_raw_key(start, &start_key);
        if (stop != NULL)
            __wt_cursor_set_raw_key(stop, &stop_key);

        WT_TRET(session->iface.truncate(
            &session->iface, NULL, start, stop, NULL));
        /* If we opened a duplicate cursor, close it now. */
        if (stop != NULL && stop != cursor)
            WT_TRET(stop->close(stop));
        WT_ERR(ret);
        break;
    WT_ILLEGAL_VALUE_ERR(session, optype);
    }

    /* Reset the cursor so it doesn't block eviction. */
    if (cursor != NULL)
        WT_ERR(cursor->reset(cursor));
    return (0);

err:
    __wt_err(session, ret,
        "operation apply failed during recovery: operation type %"
        PRIu32 " at LSN %" PRIu32 "/%" PRIu32,
        optype, lsnp->l.file, lsnp->l.offset);
    return (ret);
}
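/*
 * A note on GET_RECOVERY_CURSOR, which is not shown in this excerpt: as I
 * read the recovery code, it wraps __recovery_cursor to map the log
 * record's file ID to a cached cursor, and when the target file's
 * checkpoint LSN shows the operation is already durable it leaves the
 * cursor NULL and breaks out of the enclosing switch, so the operation
 * is skipped rather than re-applied.
 */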
/*
 * __wt_txn_checkpoint_log --
 *     Write a log record for a checkpoint operation.
 */
int
__wt_txn_checkpoint_log(
    WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_ITEM(logrec);
    WT_DECL_RET;
    WT_ITEM *ckpt_snapshot, empty;
    WT_LSN *ckpt_lsn;
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    uint8_t *end, *p;
    size_t recsize;
    uint32_t i, rectype;
    const char *fmt;

    conn = S2C(session);
    txn_global = &conn->txn_global;
    txn = &session->txn;
    ckpt_lsn = &txn->ckpt_lsn;

    /*
     * If this is a file sync, log it unless there is a full checkpoint
     * in progress.
     */
    if (!full) {
        if (txn->full_ckpt) {
            if (lsnp != NULL)
                *lsnp = *ckpt_lsn;
            return (0);
        }
        return (__txn_log_file_sync(session, flags, lsnp));
    }

    switch (flags) {
    case WT_TXN_LOG_CKPT_PREPARE:
        txn->full_ckpt = true;

        if (conn->compat_major >= WT_LOG_V2) {
            /*
             * Write the system log record containing a checkpoint
             * start operation.
             */
            rectype = WT_LOGREC_SYSTEM;
            fmt = WT_UNCHECKED_STRING(I);
            WT_ERR(__wt_struct_size(
                session, &recsize, fmt, rectype));
            WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
            WT_ERR(__wt_struct_pack(session,
                (uint8_t *)logrec->data + logrec->size,
                recsize, fmt, rectype));
            logrec->size += (uint32_t)recsize;
            WT_ERR(__wt_logop_checkpoint_start_pack(
                session, logrec));
            WT_ERR(__wt_log_write(session, logrec, ckpt_lsn, 0));
        } else {
            WT_ERR(__wt_log_printf(session,
                "CHECKPOINT: Starting record"));
            WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
        }

        /*
         * We take and immediately release the visibility lock.
         * Acquiring the write lock guarantees that any transaction
         * that has written to the log has also made its transaction
         * visible at this time.
         */
        __wt_writelock(session, &txn_global->visibility_rwlock);
        __wt_writeunlock(session, &txn_global->visibility_rwlock);

        /*
         * We need to make sure that the log records in the checkpoint
         * LSN are on disk. In particular, to make sure that the
         * current log file exists.
         */
        WT_ERR(__wt_log_force_sync(session, ckpt_lsn));
        break;
    case WT_TXN_LOG_CKPT_START:
        /* Take a copy of the transaction snapshot. */
        txn->ckpt_nsnapshot = txn->snapshot_count;
        recsize = (size_t)txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
        WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
        p = txn->ckpt_snapshot->mem;
        end = p + recsize;
        for (i = 0; i < txn->snapshot_count; i++)
            WT_ERR(__wt_vpack_uint(
                &p, WT_PTRDIFF(end, p), txn->snapshot[i]));
        break;
    case WT_TXN_LOG_CKPT_STOP:
        /*
         * During a clean connection close, we get here without the
         * prepare or start steps. In that case, log the current LSN
         * as the checkpoint LSN.
         */
        if (!txn->full_ckpt) {
            txn->ckpt_nsnapshot = 0;
            WT_CLEAR(empty);
            ckpt_snapshot = &empty;
            WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
        } else
            ckpt_snapshot = txn->ckpt_snapshot;

        /* Write the checkpoint log record. */
        rectype = WT_LOGREC_CHECKPOINT;
        fmt = WT_UNCHECKED_STRING(IIIIu);
        WT_ERR(__wt_struct_size(session, &recsize, fmt,
            rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
            txn->ckpt_nsnapshot, ckpt_snapshot));
        WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
        WT_ERR(__wt_struct_pack(session,
            (uint8_t *)logrec->data + logrec->size, recsize, fmt,
            rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
            txn->ckpt_nsnapshot, ckpt_snapshot));
        logrec->size += (uint32_t)recsize;
        WT_ERR(__wt_log_write(session, logrec, lsnp,
            F_ISSET(conn, WT_CONN_CKPT_SYNC) ? WT_LOG_FSYNC : 0));

        /*
         * If this full checkpoint completed successfully and there is
         * no hot backup in progress and this is not an unclean
         * recovery, tell the logging subsystem the checkpoint LSN so
         * that it can archive. Do not update the logging checkpoint
         * LSN if this is during a clean connection close, only during
         * a full checkpoint. A clean close may not update any
         * metadata LSN and we do not want to archive in that case.
         */
        if (!conn->hot_backup &&
            (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY) ||
            FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) &&
            txn->full_ckpt)
            __wt_log_ckpt(session, ckpt_lsn);

        /* FALLTHROUGH */
    case WT_TXN_LOG_CKPT_CLEANUP:
        /* Cleanup any allocated resources. */
        WT_INIT_LSN(ckpt_lsn);
        txn->ckpt_nsnapshot = 0;
        __wt_scr_free(session, &txn->ckpt_snapshot);
        txn->full_ckpt = false;
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

err:
    __wt_logrec_free(session, &logrec);
    return (ret);
}
/*
 * __wt_btcur_insert --
 *     Insert a record into the tree.
 */
int
__wt_btcur_insert(WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    btree = cbt->btree;
    cursor = &cbt->iface;
    session = (WT_SESSION_IMPL *)cursor->session;

    WT_STAT_FAST_CONN_INCR(session, cursor_insert);
    WT_STAT_FAST_DATA_INCR(session, cursor_insert);
    WT_STAT_FAST_DATA_INCRV(session,
        cursor_insert_bytes, cursor->key.size + cursor->value.size);

    if (btree->type == BTREE_ROW)
        WT_RET(__cursor_size_chk(session, &cursor->key));
    WT_RET(__cursor_size_chk(session, &cursor->value));

    /*
     * The tree is no longer empty: eviction should pay attention to
     * it, and it's no longer possible to bulk-load into it.
     */
    if (btree->bulk_load_ok) {
        btree->bulk_load_ok = 0;
        __wt_btree_evictable(session, 1);
    }

retry:
    WT_RET(__cursor_func_init(cbt, 1));

    switch (btree->type) {
    case BTREE_COL_FIX:
    case BTREE_COL_VAR:
        /*
         * If WT_CURSTD_APPEND is set, insert a new record (ignoring
         * the application's record number). First we search for the
         * maximum possible record number so the search ends on the
         * last page. The real record number is assigned by the
         * serialized append operation.
         */
        if (F_ISSET(cursor, WT_CURSTD_APPEND))
            cbt->iface.recno = UINT64_MAX;

        WT_ERR(__cursor_col_search(session, cbt));

        if (F_ISSET(cursor, WT_CURSTD_APPEND))
            cbt->iface.recno = 0;

        /*
         * If not overwriting, fail if the key exists. Creating a
         * record past the end of the tree in a fixed-length
         * column-store implicitly fills the gap with empty records.
         * Fail in that case, the record exists.
         */
        if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
            ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) ||
            (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt))))
            WT_ERR(WT_DUPLICATE_KEY);

        WT_ERR(__cursor_col_modify(session, cbt, 0));
        if (F_ISSET(cursor, WT_CURSTD_APPEND))
            cbt->iface.recno = cbt->recno;
        break;
    case BTREE_ROW:
        WT_ERR(__cursor_row_search(session, cbt, 1));

        /*
         * If not overwriting, fail if the key exists, else insert the
         * key/value pair.
         */
        if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
            cbt->compare == 0 && __cursor_valid(cbt, NULL))
            WT_ERR(WT_DUPLICATE_KEY);

        ret = __cursor_row_modify(session, cbt, 0);
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

err:
    if (ret == WT_RESTART)
        goto retry;
    /* Insert doesn't maintain a position across calls, clear resources. */
    if (ret == 0)
        WT_TRET(__curfile_leave(cbt));
    if (ret != 0)
        WT_TRET(__cursor_reset(cbt));
    return (ret);
}
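/*
 * For context, the WT_DUPLICATE_KEY paths above are what an application
 * sees through the public API when it disables overwrite (the URI and
 * the string key/value formats are illustrative):
 */
#include <wiredtiger.h>

static int
insert_unless_exists(WT_SESSION *session)
{
    WT_CURSOR *cursor;
    int ret;

    if ((ret = session->open_cursor(session,
        "table:example", NULL, "overwrite=false", &cursor)) != 0)
        return (ret);

    cursor->set_key(cursor, "some-key");
    cursor->set_value(cursor, "some-value");

    /* With overwrite=false, an existing key fails the insert. */
    if ((ret = cursor->insert(cursor)) == WT_DUPLICATE_KEY)
        ret = 0;    /* The caller may treat this as "already present". */

    (void)cursor->close(cursor);
    return (ret);
}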
/* * __wt_evict_file -- * Discard pages for a specific file. */ int __wt_evict_file(WT_SESSION_IMPL *session, int syncop) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_REF *next_ref, *ref; int eviction_enabled; btree = S2BT(session); eviction_enabled = !F_ISSET(btree, WT_BTREE_NO_EVICTION); /* * We need exclusive access to the file -- disable ordinary eviction * and drain any blocks already queued. */ if (eviction_enabled) WT_RET(__wt_evict_file_exclusive_on(session)); /* Make sure the oldest transaction ID is up-to-date. */ __wt_txn_update_oldest(session); /* Walk the tree, discarding pages. */ next_ref = NULL; WT_ERR(__wt_tree_walk( session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT)); while ((ref = next_ref) != NULL) { page = ref->page; /* * Eviction can fail when a page in the evicted page's subtree * switches state. For example, if we don't evict a page marked * empty, because we expect it to be merged into its parent, it * might no longer be empty after it's reconciled, in which case * eviction of its parent would fail. We can either walk the * tree multiple times (until it's finally empty), or reconcile * each page to get it to its final state before considering if * it's an eviction target or will be merged into its parent. * * Don't limit this test to any particular page type, that tends * to introduce bugs when the reconciliation of other page types * changes, and there's no advantage to doing so. * * Eviction can also fail because an update cannot be written. * If sessions have disjoint sets of files open, updates in a * no-longer-referenced file may not yet be globally visible, * and the write will fail with EBUSY. Our caller handles that * error, retrying later. */ if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page)) WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING)); /* * We can't evict the page just returned to us (it marks our * place in the tree), so move the walk to one page ahead of * the page being evicted. Note, we reconciled the returned * page first: if reconciliation of that page were to change * the shape of the tree, and we did the next walk call before * the reconciliation, the next walk call could miss a page in * the tree. */ WT_ERR(__wt_tree_walk( session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT)); switch (syncop) { case WT_SYNC_CLOSE: /* * Evict the page. * Do not attempt to evict pages expected to be merged * into their parents, with the exception that the root * page can't be merged, it must be written. */ if (__wt_ref_is_root(ref) || page->modify == NULL || !F_ISSET(page->modify, WT_PM_REC_EMPTY)) WT_ERR(__wt_evict(session, ref, 1)); break; case WT_SYNC_DISCARD: /* * Ordinary discard of the page, whether clean or dirty. * If we see a dirty page in an ordinary discard (e.g., * from sweep), give up: an update must have happened * since the file was selected for sweeping. */ if (__wt_page_is_modified(page)) WT_ERR(EBUSY); /* * If the page contains an update that is too recent to * evict, stop. This should never happen during * connection close, but in other paths our caller * should be prepared to deal with this case. */ if (page->modify != NULL && !__wt_txn_visible_all(session, page->modify->rec_max_txn)) WT_ERR(EBUSY); __wt_evict_page_clean_update(session, ref); break; case WT_SYNC_DISCARD_FORCE: /* * Forced discard of the page, whether clean or dirty. * If we see a dirty page in a forced discard, clean * the page, both to keep statistics correct, and to * let the page-discard function assert no dirty page * is ever discarded. 
*/ if (__wt_page_is_modified(page)) { page->modify->write_gen = 0; __wt_cache_dirty_decr(session, page); } F_SET(session, WT_SESSION_DISCARD_FORCE); __wt_evict_page_clean_update(session, ref); F_CLR(session, WT_SESSION_DISCARD_FORCE); break; WT_ILLEGAL_VALUE_ERR(session); } } if (0) { err: /* On error, clear any left-over tree walk. */ if (next_ref != NULL) WT_TRET(__wt_page_release( session, next_ref, WT_READ_NO_EVICT)); } if (eviction_enabled) __wt_evict_file_exclusive_off(session); return (ret); }
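/*
 * drop_with_retry --
 *	Callers of the discard path above must be prepared for EBUSY when a
 * page holds updates that aren't yet globally visible. A sketch of the
 * retry loop such a caller might use around WT_SESSION::drop; the bound
 * of five attempts is an arbitrary assumption.
 */
#include <errno.h>
#include <wiredtiger.h>

int
drop_with_retry(WT_SESSION *session, const char *uri)
{
	int i, ret;

	/* Dropping the object discards its in-cache pages. */
	for (i = 0; i < 5; ++i)
		if ((ret = session->drop(session, uri, NULL)) != EBUSY)
			return (ret);
	return (EBUSY);
}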
/* * __wt_btcur_update -- * Update a record in the tree. */ int __wt_btcur_update(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; WT_STAT_FAST_CONN_INCR(session, cursor_update); WT_STAT_FAST_DATA_INCR(session, cursor_update); WT_STAT_FAST_DATA_INCRV( session, cursor_update_bytes, cursor->value.size); if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); /* * The tree is no longer empty: eviction should pay attention to it, * and it's no longer possible to bulk-load into it. */ if (btree->bulk_load_ok) { btree->bulk_load_ok = 0; __wt_btree_evictable(session, 1); } retry: WT_RET(__cursor_func_init(cbt, 1)); switch (btree->type) { case BTREE_COL_FIX: case BTREE_COL_VAR: WT_ERR(__cursor_col_search(session, cbt)); /* * If not overwriting, fail if the key doesn't exist. Update * the record if it exists. Creating a record past the end of * the tree in a fixed-length column-store implicitly fills the * gap with empty records. Update the record in that case, the * record exists. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) && !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); ret = __cursor_col_modify(session, cbt, 0); break; case BTREE_ROW: WT_ERR(__cursor_row_search(session, cbt, 1)); /* * If not overwriting, fail if the key does not exist. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && (cbt->compare != 0 || !__cursor_valid(cbt, NULL))) WT_ERR(WT_NOTFOUND); ret = __cursor_row_modify(session, cbt, 0); break; WT_ILLEGAL_VALUE_ERR(session); } err: if (ret == WT_RESTART) goto retry; /* * If successful, point the cursor at internal copies of the data. We * could shuffle memory in the cursor so the key/value pair is in local * buffer memory, but that's a data copy. We don't want to do another * search (and we might get a different update structure if we race). * To make this work, we add a field to the btree cursor to pass back a * pointer to the modify function's allocated update structure. */ if (ret == 0) WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update)); if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
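/*
 * update_example --
 *	A sketch of the not-found behavior __wt_btcur_update implements:
 * with overwrite configured off, updating a nonexistent key returns
 * WT_NOTFOUND. The table name is an assumption and is expected to exist.
 */
#include <wiredtiger.h>

int
update_example(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	int ret;

	if ((ret = session->open_cursor(session,
	    "table:example", NULL, "overwrite=false", &cursor)) != 0)
		return (ret);

	cursor->set_key(cursor, "no-such-key");
	cursor->set_value(cursor, "value");
	if ((ret = cursor->update(cursor)) == WT_NOTFOUND)
		ret = 0;

	/* On success, the cursor is left positioned on the updated pair. */
	(void)cursor->close(cursor);
	return (ret);
}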
/* * __wt_page_inmem -- * Build in-memory page information. */ int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep) { WT_DECL_RET; WT_PAGE *page; const WT_PAGE_HEADER *dsk; uint32_t alloc_entries; size_t size; *pagep = NULL; dsk = image; alloc_entries = 0; /* * Figure out how many underlying objects the page references so we can * allocate them along with the page. */ switch (dsk->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_COL_VAR: /* * Column-store leaf page entries map one-to-one to the number * of physical entries on the page (each physical entry is a * value item). * * Column-store internal page entries map one-to-one to the * number of physical entries on the page (each entry is a * location cookie). */ alloc_entries = dsk->u.entries; break; case WT_PAGE_ROW_INT: /* * Row-store internal page entries map one-to-two to the number * of physical entries on the page (each entry is a key and * location cookie pair). */ alloc_entries = dsk->u.entries / 2; break; case WT_PAGE_ROW_LEAF: /* * If the "all empty values" flag is set, the page contains * only keys, so row-store leaf page entries map one-to-one to * the number of physical entries. If the "no empty values" * flag is set, keys and values alternate, so entries map * one-to-two to the number of physical entries. If neither * flag is set, there are more keys than values and we have to * walk the page to figure it out. */ if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL)) alloc_entries = dsk->u.entries; else if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE)) alloc_entries = dsk->u.entries / 2; else WT_RET(__inmem_row_leaf_entries( session, dsk, &alloc_entries)); break; WT_ILLEGAL_VALUE(session); } /* Allocate and initialize a new WT_PAGE. */ WT_RET(__wt_page_alloc( session, dsk->type, dsk->recno, alloc_entries, 1, &page)); page->dsk = dsk; F_SET_ATOMIC(page, flags); /* * Track the memory allocated to build this page so we can update the * cache statistics in a single call. */ size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0; switch (page->type) { case WT_PAGE_COL_FIX: __inmem_col_fix(session, page); break; case WT_PAGE_COL_INT: __inmem_col_int(session, page); break; case WT_PAGE_COL_VAR: WT_ERR(__inmem_col_var(session, page, &size)); break; case WT_PAGE_ROW_INT: WT_ERR(__inmem_row_int(session, page, &size)); break; case WT_PAGE_ROW_LEAF: WT_ERR(__inmem_row_leaf(session, page)); break; WT_ILLEGAL_VALUE_ERR(session); } /* Update the page's in-memory size and the cache statistics. */ __wt_cache_page_inmem_incr(session, page, size); /* Link the new page to the parent. */ if (ref != NULL) { switch (page->type) { case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: page->pg_intl_parent_ref = ref; break; } ref->page = page; } *pagep = page; return (0); err: __wt_page_out(session, &page); return (ret); }
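/*
 * alloc_entries_for --
 *	A standalone sketch of the entry-count mapping __wt_page_inmem
 * performs, using simplified stand-ins for the page types and header
 * flags (the real WT_PAGE_HEADER carries more state, and the mixed
 * row-store leaf case requires walking the page).
 */
#include <stdint.h>

enum page_type { COL_FIX, COL_INT, COL_VAR, ROW_INT, ROW_LEAF };

static uint32_t
alloc_entries_for(enum page_type type,
    uint32_t disk_entries, int all_values_empty, int no_values_empty)
{
	switch (type) {
	case COL_FIX:
	case COL_INT:
	case COL_VAR:
		return (disk_entries);		/* One-to-one. */
	case ROW_INT:
		return (disk_entries / 2);	/* Key/cookie pairs. */
	case ROW_LEAF:
		if (all_values_empty)		/* Keys only. */
			return (disk_entries);
		if (no_values_empty)		/* Key/value pairs. */
			return (disk_entries / 2);
		return (0);	/* Mixed: the real code walks the page. */
	}
	return (0);
}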
/* * __txn_op_apply -- * Apply a transactional operation during recovery. */ static int __txn_op_apply( WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end) { WT_CURSOR *cursor, *start, *stop; WT_DECL_RET; WT_ITEM key, start_key, stop_key, value; WT_SESSION_IMPL *session; uint64_t recno, start_recno, stop_recno; uint32_t fileid, mode, optype, opsize; session = r->session; cursor = NULL; /* Peek at the size and the type. */ WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize)); end = *pp + opsize; switch (optype) { case WT_LOGOP_COL_PUT: WT_ERR(__wt_logop_col_put_unpack(session, pp, end, &fileid, &recno, &value)); GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); cursor->set_key(cursor, recno); __wt_cursor_set_raw_value(cursor, &value); WT_ERR(cursor->insert(cursor)); break; case WT_LOGOP_COL_REMOVE: WT_ERR(__wt_logop_col_remove_unpack(session, pp, end, &fileid, &recno)); GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); cursor->set_key(cursor, recno); WT_ERR(cursor->remove(cursor)); break; case WT_LOGOP_COL_TRUNCATE: WT_ERR(__wt_logop_col_truncate_unpack(session, pp, end, &fileid, &start_recno, &stop_recno)); GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); /* Set up the cursors. */ if (start_recno == WT_RECNO_OOB) { start = NULL; stop = cursor; } else if (stop_recno == WT_RECNO_OOB) { start = cursor; stop = NULL; } else { start = cursor; WT_ERR(__recovery_cursor( session, r, lsnp, fileid, true, &stop)); } /* Set the keys. */ if (start != NULL) start->set_key(start, start_recno); if (stop != NULL) stop->set_key(stop, stop_recno); WT_TRET(session->iface.truncate(&session->iface, NULL, start, stop, NULL)); /* If we opened a duplicate cursor, close it now. */ if (stop != NULL && stop != cursor) WT_TRET(stop->close(stop)); WT_ERR(ret); break; case WT_LOGOP_ROW_PUT: WT_ERR(__wt_logop_row_put_unpack(session, pp, end, &fileid, &key, &value)); GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); __wt_cursor_set_raw_key(cursor, &key); __wt_cursor_set_raw_value(cursor, &value); WT_ERR(cursor->insert(cursor)); break; case WT_LOGOP_ROW_REMOVE: WT_ERR(__wt_logop_row_remove_unpack(session, pp, end, &fileid, &key)); GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); __wt_cursor_set_raw_key(cursor, &key); WT_ERR(cursor->remove(cursor)); break; case WT_LOGOP_ROW_TRUNCATE: WT_ERR(__wt_logop_row_truncate_unpack(session, pp, end, &fileid, &start_key, &stop_key, &mode)); GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); /* Set up the cursors. */ start = stop = NULL; switch (mode) { case WT_TXN_TRUNC_ALL: /* Both cursors stay NULL. */ break; case WT_TXN_TRUNC_BOTH: start = cursor; WT_ERR(__recovery_cursor( session, r, lsnp, fileid, true, &stop)); break; case WT_TXN_TRUNC_START: start = cursor; break; case WT_TXN_TRUNC_STOP: stop = cursor; break; WT_ILLEGAL_VALUE_ERR(session); } /* Set the keys. */ if (start != NULL) __wt_cursor_set_raw_key(start, &start_key); if (stop != NULL) __wt_cursor_set_raw_key(stop, &stop_key); WT_TRET(session->iface.truncate(&session->iface, NULL, start, stop, NULL)); /* If we opened a duplicate cursor, close it now. */ if (stop != NULL && stop != cursor) WT_TRET(stop->close(stop)); WT_ERR(ret); break; WT_ILLEGAL_VALUE_ERR(session); } /* Reset the cursor so it doesn't block eviction. */ if (cursor != NULL) WT_ERR(cursor->reset(cursor)); err: if (ret != 0) __wt_err(session, ret, "Operation failed during recovery"); return (ret); }
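/*
 * logged_ops_example --
 *	The operations recovery replays above are generated by ordinary
 * transactional updates against a logged table. A sketch with an assumed
 * table name: the committed insert and remove below each become a log
 * record (WT_LOGOP_ROW_PUT, WT_LOGOP_ROW_REMOVE) that __txn_op_apply
 * re-applies after a crash.
 */
#include <wiredtiger.h>

int
logged_ops_example(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	int ret;

	if ((ret = session->open_cursor(session,
	    "table:example", NULL, NULL, &cursor)) != 0)
		return (ret);
	if ((ret = session->begin_transaction(session, NULL)) != 0)
		goto err;

	cursor->set_key(cursor, "key1");
	cursor->set_value(cursor, "value1");
	if ((ret = cursor->insert(cursor)) != 0)	/* WT_LOGOP_ROW_PUT */
		goto rollback;
	cursor->set_key(cursor, "key1");
	if ((ret = cursor->remove(cursor)) != 0)	/* WT_LOGOP_ROW_REMOVE */
		goto rollback;

	ret = session->commit_transaction(session, NULL);
	goto err;

rollback:
	(void)session->rollback_transaction(session, NULL);
err:	(void)cursor->close(cursor);
	return (ret);
}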
/* * __sync_file -- * Flush pages for a specific file. */ static int __sync_file(WT_SESSION_IMPL *session, int syncop) { struct timespec end, start; WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, leaf_bytes; uint64_t internal_pages, leaf_pages; uint32_t flags; btree = S2BT(session); flags = WT_READ_CACHE | WT_READ_NO_GEN; walk = NULL; txn = &session->txn; internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) WT_RET(__wt_epoch(session, &start)); switch (syncop) { case WT_SYNC_WRITE_LEAVES: /* * Write all immediately available, dirty in-cache leaf pages. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. */ if (!btree->modified) return (0); __wt_spin_lock(session, &btree->flush_lock); if (!btree->modified) { __wt_spin_unlock(session, &btree->flush_lock); return (0); } flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* Write dirty pages if nobody beat us to it. */ page = walk->page; if (__wt_page_is_modified(page)) { if (txn->isolation == TXN_ISO_READ_COMMITTED) __wt_txn_refresh(session, 1); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } } break; case WT_SYNC_CHECKPOINT: /* * We cannot check the tree modified flag in the case of a * checkpoint, the checkpoint code has already cleared it. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. We're holding the schema lock, but need the * lower-level lock as well. */ __wt_spin_lock(session, &btree->flush_lock); /* * When internal pages are being reconciled by checkpoint their * child pages cannot disappear from underneath them or be split * into them, nor can underlying blocks be freed until the block * lists for the checkpoint are stable. Set the checkpointing * flag to block eviction of dirty pages until the checkpoint's * internal page pass is complete, then wait for any existing * eviction to complete. */ btree->checkpointing = 1; if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { WT_ERR(__wt_evict_file_exclusive_on(session)); __wt_evict_file_exclusive_off(session); } /* Write all dirty in-cache pages. */ flags |= WT_READ_NO_EVICT; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* * Write dirty pages, unless we can be sure they only * became dirty after the checkpoint started. * * We can skip dirty pages if: * (1) they are leaf pages; * (2) there is a snapshot transaction active (which * is the case in ordinary application checkpoints * but not all internal cases); and * (3) the first dirty update on the page is * sufficiently recent that the checkpoint * transaction would skip them. 
*/ page = walk->page; mod = page->modify; if (__wt_page_is_modified(page) && (WT_PAGE_IS_INTERNAL(page) || !F_ISSET(txn, TXN_HAS_SNAPSHOT) || TXNID_LE(mod->first_dirty_txn, txn->snap_max))) { if (WT_PAGE_IS_INTERNAL(page)) { internal_bytes += page->memory_footprint; ++internal_pages; } else { leaf_bytes += page->memory_footprint; ++leaf_pages; } WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } } break; WT_ILLEGAL_VALUE_ERR(session); } if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { WT_ERR(__wt_epoch(session, &end)); WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT, "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64 " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64 " bytes, %" PRIu64 " pages of internal\n\t" "Took: %" PRIu64 "ms", syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_bytes, leaf_pages, internal_bytes, internal_pages, WT_TIMEDIFF(end, start) / WT_MILLION)); } err: /* On error, clear any left-over tree walk. */ if (walk != NULL) WT_TRET(__wt_page_release(session, walk, flags)); if (txn->isolation == TXN_ISO_READ_COMMITTED && session->ncursors == 0) __wt_txn_release_snapshot(session); if (btree->checkpointing) { /* * Clear the checkpoint flag and push the change; not required, * but publishing the change means stalled eviction gets moving * as soon as possible. */ btree->checkpointing = 0; WT_FULL_BARRIER(); /* * Wake the eviction server, in case application threads have * stalled while the eviction server decided it couldn't make * progress. Without this, application threads will be stalled * until the eviction server next wakes. */ WT_TRET(__wt_evict_server_wake(session)); } __wt_spin_unlock(session, &btree->flush_lock); /* * Leaves are written before a checkpoint (or as part of a file close, * before checkpointing the file). Start a flush to stable storage, * but don't wait for it. */ if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES) WT_RET(btree->bm->sync(btree->bm, session, 1)); return (ret); }
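/*
 * verbose_sync_example --
 *	A sketch of surfacing the per-pass statistics __sync_file prints
 * under WT_VERB_CHECKPOINT: open the connection with checkpoint
 * verbosity enabled and force a checkpoint. The home directory is an
 * assumption, and verbose output requires a build with verbose support.
 */
#include <wiredtiger.h>

int
verbose_sync_example(void)
{
	WT_CONNECTION *conn;
	WT_SESSION *session;
	int ret;

	if ((ret = wiredtiger_open("WT_HOME", NULL,
	    "create,verbose=[checkpoint]", &conn)) != 0)
		return (ret);
	if ((ret = conn->open_session(conn, NULL, NULL, &session)) == 0)
		ret = session->checkpoint(session, NULL);

	/* Closing the connection checkpoints and discards the cache. */
	(void)conn->close(conn, NULL);
	return (ret);
}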