/*
 * __cursor_key_order_check_col --
 *     Check key ordering for column-store cursor movements.
 */
static int
__cursor_key_order_check_col(
    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next)
{
    int cmp;

    cmp = 0;                        /* -Werror=maybe-uninitialized */

    if (cbt->lastrecno != WT_RECNO_OOB) {
        if (cbt->lastrecno < cbt->recno)
            cmp = -1;
        if (cbt->lastrecno > cbt->recno)
            cmp = 1;
    }

    if (cbt->lastrecno == WT_RECNO_OOB ||
        (next && cmp < 0) || (!next && cmp > 0)) {
        cbt->lastrecno = cbt->recno;
        return (0);
    }

    WT_PANIC_RET(session, EINVAL,
        "WT_CURSOR.%s out-of-order returns: returned key %" PRIu64
        " then key %" PRIu64,
        next ? "next" : "prev", cbt->lastrecno, cbt->recno);
}
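/*
 * The check above reduces to a monotonicity assertion on record numbers:
 * "next" must only ever return increasing recnos and "prev" decreasing
 * ones. The following is a minimal, self-contained sketch of the same
 * invariant with a hypothetical toy_cursor type (not WiredTiger's); it
 * returns false where the real code panics.
 */
#include <stdbool.h>
#include <stdint.h>

#define TOY_RECNO_OOB 0             /* "no previous position" sentinel */

struct toy_cursor {
    uint64_t lastrecno;             /* previously returned record number */
    uint64_t recno;                 /* record number about to be returned */
};

static bool
toy_key_order_ok(struct toy_cursor *c, bool next)
{
    /* First return from the cursor: nothing to compare against. */
    if (c->lastrecno == TOY_RECNO_OOB ||
        (next && c->lastrecno < c->recno) ||
        (!next && c->lastrecno > c->recno)) {
        c->lastrecno = c->recno;
        return (true);
    }
    return (false);                 /* out-of-order: corruption suspected */
}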
/*
 * __wt_lsm_tree_readunlock --
 *     Release a shared lock on an LSM tree.
 */
int
__wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;

    F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);

    if ((ret = __wt_readunlock(session, lsm_tree->rwlock)) != 0)
        WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
    return (0);
}
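/*
 * Unlock failure is treated as fatal because a lock left in an unknown
 * state cannot be recovered from. A minimal sketch of the same policy
 * over a plain pthread rwlock (hypothetical helper, not WiredTiger API):
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static void
toy_readunlock_or_die(pthread_rwlock_t *lock)
{
    int ret;

    /* pthread_rwlock_unlock returns 0 on success, an errno value on error. */
    if ((ret = pthread_rwlock_unlock(lock)) != 0) {
        fprintf(stderr, "rwlock unlock failed: %d\n", ret);
        abort();                    /* stand-in for WT_PANIC_RET */
    }
}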
/*
 * __wt_hazard_clear --
 *     Clear a hazard pointer.
 */
int
__wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;
    WT_HAZARD *hp;

    btree = S2BT(session);

    /* If a file can never be evicted, hazard pointers aren't required. */
    if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
        return (0);

    /*
     * Clear the caller's hazard pointer.
     * The common pattern is LIFO, so do a reverse search.
     */
    for (hp = session->hazard + session->hazard_size - 1;
        hp >= session->hazard; --hp)
        if (hp->page == page) {
            /*
             * We don't publish the hazard pointer clear in the
             * general case. It's not required for correctness;
             * it would give an eviction thread faster access to
             * the page if the page were selected for eviction,
             * but the page's generation number was just set, so
             * it's unlikely the page will be selected soon.
             */
            hp->page = NULL;

            /*
             * If this was the last hazard pointer in the session,
             * reset the size so that checks can skip this session.
             */
            if (--session->nhazard == 0)
                WT_PUBLISH(session->hazard_size, 0);
            return (0);
        }

    /*
     * A serious error, we should always find the hazard pointer. Panic,
     * because using a page we didn't have pinned down implies corruption.
     */
    WT_PANIC_RET(session, EINVAL,
        "session %p: clear hazard pointer: %p: not found",
        (void *)session, (void *)page);
}
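/*
 * A minimal sketch of the hazard-pointer pattern used above, with
 * hypothetical toy types (toy_session, toy_page are not WiredTiger's):
 * readers publish the page they are using in a per-session slot, and a
 * clear walks the array in reverse because acquire/release is LIFO in
 * the common case, so the live entry is usually found immediately.
 */
#include <stdbool.h>
#include <stddef.h>

#define TOY_HAZARD_MAX 16

struct toy_page;                    /* opaque page type */

struct toy_session {
    struct toy_page *hazard[TOY_HAZARD_MAX]; /* published pointers */
    size_t hazard_size;             /* high-water slot count */
    size_t nhazard;                 /* live entries */
};

static bool
toy_hazard_clear(struct toy_session *s, struct toy_page *page)
{
    size_t i;

    /* Reverse scan: the most recently set pointer is cleared first. */
    for (i = s->hazard_size; i > 0; --i)
        if (s->hazard[i - 1] == page) {
            s->hazard[i - 1] = NULL;
            if (--s->nhazard == 0)
                s->hazard_size = 0; /* let checkers skip this session */
            return (true);
        }
    return (false);                 /* not found: caller panics */
}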
/*
 * __wt_lsm_tree_switch --
 *     Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    uint32_t nchunks, new_id;
    int first_switch;

    WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));

    nchunks = lsm_tree->nchunks;
    first_switch = nchunks == 0 ? 1 : 0;

    /*
     * Check if a switch is still needed: we may have raced while waiting
     * for a lock.
     */
    chunk = NULL;
    if (!first_switch &&
        (chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
        !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
        !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
        goto err;

    /* Set the switch transaction in the previous chunk, if necessary. */
    if (chunk != NULL && chunk->switch_txn == WT_TXN_NONE)
        chunk->switch_txn = __wt_txn_new_id(session);

    /* Update the throttle time. */
    __wt_lsm_tree_throttle(session, lsm_tree, 0);

    new_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);

    WT_ERR(__wt_realloc_def(session,
        &lsm_tree->chunk_alloc, nchunks + 1, &lsm_tree->chunk));

    WT_ERR(__wt_verbose(session, WT_VERB_LSM,
        "Tree %s switch to: %" PRIu32 ", checkpoint throttle %ld, "
        "merge throttle %ld", lsm_tree->name, new_id,
        lsm_tree->ckpt_throttle, lsm_tree->merge_throttle));

    WT_ERR(__wt_calloc_def(session, 1, &chunk));
    chunk->id = new_id;
    chunk->switch_txn = WT_TXN_NONE;
    lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
    WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

    WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
    F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
    ++lsm_tree->dsk_gen;

    lsm_tree->modified = 1;

err:
    WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
    /*
     * Errors that happen during a tree switch leave the tree in a state
     * where we can't make progress. Error out of WiredTiger.
     */
    if (ret != 0)
        WT_PANIC_RET(session, ret, "Failed doing LSM switch");
    else if (!first_switch)
        WT_RET(__wt_lsm_manager_push_entry(
            session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
    return (ret);
}
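/*
 * The early "goto err" above is a check-again-under-the-lock pattern:
 * several threads may decide a switch is needed, but only the first one
 * to acquire the write lock should perform it. A minimal sketch of the
 * same shape with a pthread mutex and hypothetical toy types (not
 * WiredTiger API):
 */
#include <pthread.h>
#include <stdbool.h>

struct toy_tree {
    pthread_mutex_t lock;
    bool need_switch;               /* set by writers noticing a full chunk */
};

static void
toy_switch_if_needed(struct toy_tree *t)
{
    pthread_mutex_lock(&t->lock);
    /* Re-check: another thread may have switched while we waited. */
    if (!t->need_switch) {
        pthread_mutex_unlock(&t->lock);
        return;
    }
    /* ... create and install the new chunk here ... */
    t->need_switch = false;
    pthread_mutex_unlock(&t->lock);
}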
/*
 * __wt_block_read_off --
 *     Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t offset, uint32_t size, uint32_t checksum)
{
    WT_BLOCK_HEADER *blk, swap;
    size_t bufsize;
    uint32_t page_checksum;

    __wt_verbose(session, WT_VERB_READ,
        "off %" PRIuMAX ", size %" PRIu32 ", checksum %#" PRIx32,
        (uintmax_t)offset, size, checksum);

    WT_STAT_CONN_INCR(session, block_read);
    WT_STAT_CONN_INCRV(session, block_byte_read, size);

    /*
     * Grow the buffer as necessary and read the block. Buffers should be
     * aligned for reading, but there are lots of buffers (for example,
     * file cursors have two buffers each, key and value), and it's
     * difficult to be sure we've found all of them. If the buffer isn't
     * aligned, it's an easy fix: set the flag and guarantee we reallocate
     * it. (Most of the time on reads, the buffer memory has not yet been
     * allocated, so we're not adding any additional processing time.)
     */
    if (F_ISSET(buf, WT_ITEM_ALIGNED))
        bufsize = size;
    else {
        F_SET(buf, WT_ITEM_ALIGNED);
        bufsize = WT_MAX(size, buf->memsize + 10);
    }
    WT_RET(__wt_buf_init(session, buf, bufsize));
    WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
    buf->size = size;

    /*
     * We incrementally read through the structure before doing a
     * checksum, do little- to big-endian handling early on, and then
     * select from the original or swapped structure as needed.
     */
    blk = WT_BLOCK_HEADER_REF(buf->mem);
    __wt_block_header_byteswap_copy(blk, &swap);
    if (swap.checksum == checksum) {
        blk->checksum = 0;
        page_checksum = __wt_checksum(buf->mem,
            F_ISSET(&swap, WT_BLOCK_DATA_CKSUM) ?
            size : WT_BLOCK_COMPRESS_SKIP);
        if (page_checksum == checksum) {
            /*
             * Swap the page-header as needed; this doesn't belong
             * here, but it's the best place to catch all callers.
             */
            __wt_page_header_byteswap(buf->mem);
            return (0);
        }

        if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
            __wt_errx(session,
                "%s: read checksum error for %" PRIu32 "B block at "
                "offset %" PRIuMAX ": calculated block checksum "
                "of %#" PRIx32 " doesn't match expected checksum "
                "of %#" PRIx32,
                block->name, size, (uintmax_t)offset,
                page_checksum, checksum);
    } else if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
        __wt_errx(session,
            "%s: read checksum error for %" PRIu32 "B block at "
            "offset %" PRIuMAX ": block header checksum "
            "of %#" PRIx32 " doesn't match expected checksum "
            "of %#" PRIx32,
            block->name, size, (uintmax_t)offset,
            swap.checksum, checksum);

    if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
        WT_IGNORE_RET(
            __wt_bm_corrupt_dump(session, buf, offset, size, checksum));

    /* Panic if a checksum fails during an ordinary read. */
    F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
    if (block->verify || F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
        return (WT_ERROR);
    WT_PANIC_RET(session, WT_ERROR, "%s: fatal read error", block->name);
}
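/*
 * Verifying a checksum that is stored inside the block it covers requires
 * zeroing the stored field before recomputing, exactly as blk->checksum
 * is cleared above. A minimal sketch of that shape using a simple
 * additive checksum (toy_checksum and toy_header are hypothetical;
 * WiredTiger's actual checksum is CRC32C):
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct toy_header {
    uint32_t checksum;              /* covers the whole block, field zeroed */
};

static uint32_t
toy_checksum(const void *p, size_t len)
{
    const uint8_t *b = p;
    uint32_t sum = 0;
    size_t i;

    for (i = 0; i < len; ++i)       /* stand-in for CRC32C */
        sum = sum * 31 + b[i];
    return (sum);
}

static bool
toy_block_verify(void *blockp, size_t size, uint32_t expected)
{
    struct toy_header *hdr = blockp;

    if (hdr->checksum != expected)  /* quick header comparison first */
        return (false);
    hdr->checksum = 0;              /* zero the field the checksum covers */
    return (toy_checksum(blockp, size) == expected);
}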
/*
 * __wt_meta_track_off --
 *     Turn off metadata operation tracking, unrolling on error.
 */
int
__wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll)
{
    WT_DECL_RET;
    WT_META_TRACK *trk, *trk_orig;
    WT_SESSION_IMPL *ckpt_session;
    int saved_ret;
    bool did_drop;

    saved_ret = 0;

    WT_ASSERT(session,
        WT_META_TRACKING(session) && session->meta_track_nest > 0);

    trk_orig = session->meta_track;
    trk = session->meta_track_next;

    /* If it was a nested transaction, there is nothing to do. */
    if (--session->meta_track_nest != 0)
        return (0);

    /* Turn off tracking for unroll. */
    session->meta_track_next = session->meta_track_sub = NULL;

    /*
     * If there were no operations logged, skip unnecessary metadata
     * checkpoints. For example, this happens if attempting to create a
     * data source that already exists (or drop one that doesn't).
     */
    if (trk == trk_orig)
        goto err;

    /* Unrolling doesn't require syncing the metadata. */
    if (unroll)
        goto err;

    if (F_ISSET(session, WT_SESSION_SCHEMA_TXN)) {
        F_CLR(session, WT_SESSION_SCHEMA_TXN);
#ifdef WT_ENABLE_SCHEMA_TXN
        WT_ERR(__wt_txn_commit(session, NULL));
        __wt_errx(session, "TRACK: Commit internal schema txn");
#endif
    }

    /*
     * If we don't have the metadata cursor (e.g., we're in the process
     * of creating the metadata), we can't sync it.
     */
    if (!need_sync || session->meta_cursor == NULL ||
        F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
        goto err;

    /* If we're logging, make sure the metadata update was flushed. */
    if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED))
        WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session),
            ret = __wt_txn_checkpoint_log(
                session, false, WT_TXN_LOG_CKPT_SYNC, NULL));
    else {
        WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
        ckpt_session = S2C(session)->meta_ckpt_session;
        /*
         * If this operation is part of a running transaction, that
         * should be included in the checkpoint.
         */
        ckpt_session->txn.id = session->txn.id;
        WT_ASSERT(session,
            !F_ISSET(session, WT_SESSION_LOCKED_METADATA));
        WT_WITH_DHANDLE(ckpt_session, WT_SESSION_META_DHANDLE(session),
            WT_WITH_METADATA_LOCK(ckpt_session,
                ret = __wt_checkpoint(ckpt_session, NULL)));
        ckpt_session->txn.id = WT_TXN_NONE;
        if (ret == 0)
            WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session),
                ret = __wt_checkpoint_sync(session, NULL));
    }

err:
    /*
     * Undo any tracked operations on failure.
     * Apply any tracked operations post-commit.
     */
    did_drop = false;
    if (unroll || ret != 0) {
        saved_ret = ret;
        ret = 0;
        while (--trk >= trk_orig) {
            did_drop = did_drop || trk->op == WT_ST_DROP_COMMIT;
            WT_TRET(__meta_track_unroll(session, trk));
        }
    } else
        for (; trk_orig < trk; trk_orig++) {
            did_drop = did_drop || trk_orig->op == WT_ST_DROP_COMMIT;
            WT_TRET(__meta_track_apply(session, trk_orig));
        }

    if (F_ISSET(session, WT_SESSION_SCHEMA_TXN)) {
        F_CLR(session, WT_SESSION_SCHEMA_TXN);
        /*
         * We should have committed above unless we're unrolling, there
         * was an error or the operation was a noop.
         */
        WT_ASSERT(session, unroll || saved_ret != 0 ||
            session->txn.mod_count == 0);
#ifdef WT_ENABLE_SCHEMA_TXN
        __wt_err(session, saved_ret,
            "TRACK: Abort internal schema txn");
        WT_TRET(__wt_txn_rollback(session, NULL));
#endif
    }

    /*
     * Wake up the sweep thread: particularly for the in-memory storage
     * engine, we want to reclaim space immediately.
     */
    if (did_drop && S2C(session)->sweep_cond != NULL)
        __wt_cond_signal(session, S2C(session)->sweep_cond);

    if (ret != 0)
        WT_PANIC_RET(session, ret,
            "failed to apply or unroll all tracked operations");
    return (saved_ret == 0 ? 0 : saved_ret);
}
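/*
 * The err block above applies tracked operations in order on success and
 * unrolls them in reverse on failure. A minimal sketch of that undo-log
 * shape with hypothetical toy types (not WiredTiger's):
 */
#include <stdbool.h>
#include <stddef.h>

struct toy_op {
    int (*apply)(struct toy_op *);  /* make the operation permanent */
    int (*unroll)(struct toy_op *); /* undo the operation */
};

static int
toy_track_off(struct toy_op *ops, size_t nops, bool failed)
{
    int ret = 0;
    size_t i;

    if (failed) {
        /* Unroll in reverse: later ops may depend on earlier ones. */
        for (i = nops; i > 0; --i)
            if (ops[i - 1].unroll(&ops[i - 1]) != 0)
                ret = -1;           /* keep going; report any failure */
    } else
        for (i = 0; i < nops; ++i)
            if (ops[i].apply(&ops[i]) != 0)
                ret = -1;
    return (ret);
}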
/*
 * __thread_group_resize --
 *     Resize an array of utility threads already holding the lock.
 */
static int
__thread_group_resize(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group,
    uint32_t new_min, uint32_t new_max, uint32_t flags)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_THREAD *thread;
    size_t alloc;
    uint32_t i, session_flags;

    conn = S2C(session);
    session_flags = 0;

    WT_ASSERT(session, group->current_threads <= group->alloc &&
        __wt_rwlock_islocked(session, group->lock));

    if (new_min == group->min && new_max == group->max)
        return (0);

    /*
     * Call shrink to reduce the number of thread structures and running
     * threads if required by the change in group size.
     */
    WT_RET(__thread_group_shrink(session, group, new_max));

    /*
     * Only reallocate the thread array if it is the largest ever, since
     * our realloc doesn't support shrinking the allocated size.
     */
    if (group->alloc < new_max) {
        alloc = group->alloc * sizeof(*group->threads);
        WT_RET(__wt_realloc(session, &alloc,
            new_max * sizeof(*group->threads), &group->threads));
        group->alloc = new_max;
    }

    /*
     * Initialize the structures based on the previous group size, not
     * the previous allocated size.
     */
    for (i = group->max; i < new_max; i++) {
        WT_ERR(__wt_calloc_one(session, &thread));
        /*
         * Threads get their own session and lookaside table cursor
         * if the lookaside table is open. Note that threads are
         * started during recovery, before the lookaside table is
         * created.
         */
        if (LF_ISSET(WT_THREAD_CAN_WAIT))
            session_flags = WT_SESSION_CAN_WAIT;
        if (F_ISSET(conn, WT_CONN_LAS_OPEN))
            FLD_SET(session_flags, WT_SESSION_LOOKASIDE_CURSOR);
        WT_ERR(__wt_open_internal_session(conn, group->name,
            false, session_flags, &thread->session));
        if (LF_ISSET(WT_THREAD_PANIC_FAIL))
            F_SET(thread, WT_THREAD_PANIC_FAIL);
        thread->id = i;
        thread->run_func = group->run_func;
        WT_ASSERT(session, group->threads[i] == NULL);
        group->threads[i] = thread;
    }

    if (group->current_threads < new_min)
        WT_ERR(__thread_group_grow(session, group, new_min));

err:
    /*
     * Update the thread group information even on failure to improve
     * our chances of cleaning up properly.
     */
    group->max = new_max;
    group->min = new_min;

    /*
     * An error resizing a thread array is fatal, it should only happen
     * in an out of memory situation.
     */
    if (ret != 0) {
        WT_TRET(__wt_thread_group_destroy(session, group));
        WT_PANIC_RET(session, ret, "Error while resizing thread group");
    }
    return (ret);
}
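/*
 * The "largest ever" test above implements a grow-only array: the
 * allocation never shrinks, so realloc is skipped whenever the array has
 * already been this large. A minimal sketch of the same pattern with
 * standard realloc and hypothetical toy types (not WiredTiger's
 * allocator):
 */
#include <stdlib.h>
#include <string.h>

struct toy_slot;                    /* opaque per-thread structure */

struct toy_group {
    struct toy_slot **slots;
    size_t alloc;                   /* slots ever allocated, never shrinks */
};

static int
toy_group_reserve(struct toy_group *g, size_t new_max)
{
    struct toy_slot **p;

    if (g->alloc >= new_max)        /* already the largest ever: no-op */
        return (0);
    if ((p = realloc(g->slots, new_max * sizeof(*p))) == NULL)
        return (-1);
    /* Zero the newly exposed tail so callers can test for NULL. */
    memset(p + g->alloc, 0, (new_max - g->alloc) * sizeof(*p));
    g->slots = p;
    g->alloc = new_max;
    return (0);
}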