/*
 * __wt_async_op_enqueue --
 *     Enqueue an operation onto the work queue.
 */
int
__wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op)
{
    WT_ASYNC *async;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    uint64_t cur_head, cur_tail, my_alloc, my_slot;
#ifdef HAVE_DIAGNOSTIC
    WT_ASYNC_OP_IMPL *my_op;
#endif

    conn = S2C(session);
    async = conn->async;

    /*
     * If an application re-uses a WT_ASYNC_OP, we end up here with an
     * invalid object.
     */
    if (op->state != WT_ASYNCOP_READY)
        WT_RET_MSG(session, EINVAL,
            "application error: WT_ASYNC_OP already in use");

    /*
     * Enqueue op at the tail of the work queue.
     * We get our slot in the ring buffer to use.
     */
    my_alloc = WT_ATOMIC_ADD8(async->alloc_head, 1);
    my_slot = my_alloc % async->async_qsize;

    /*
     * Make sure we haven't wrapped around the queue.
     * If so, wait for the tail to advance off this slot.
     */
    WT_ORDERED_READ(cur_tail, async->tail_slot);
    while (cur_tail == my_slot) {
        __wt_yield();
        WT_ORDERED_READ(cur_tail, async->tail_slot);
    }

#ifdef HAVE_DIAGNOSTIC
    WT_ORDERED_READ(my_op, async->async_queue[my_slot]);
    if (my_op != NULL)
        return (__wt_panic(session));
#endif
    WT_PUBLISH(async->async_queue[my_slot], op);
    op->state = WT_ASYNCOP_ENQUEUED;
    if (WT_ATOMIC_ADD4(async->cur_queue, 1) > async->max_queue)
        WT_PUBLISH(async->max_queue, async->cur_queue);

    /*
     * Multiple threads may be adding ops to the queue. We need to wait
     * our turn to make our slot visible to workers.
     */
    WT_ORDERED_READ(cur_head, async->head);
    while (cur_head != (my_alloc - 1)) {
        __wt_yield();
        WT_ORDERED_READ(cur_head, async->head);
    }
    WT_PUBLISH(async->head, my_alloc);
    return (ret);
}
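For reference, the WT_PUBLISH and WT_ORDERED_READ macros used throughout these functions pair a plain assignment with a memory barrier. The sketch below shows the pattern they implement; it is an editorial illustration rather than verbatim WiredTiger source, and the underlying WT_WRITE_BARRIER/WT_READ_BARRIER macros are platform-specific.

/*
 * Editor's sketch: publish a value only after prior stores are flushed,
 * and order a read before any dependent loads that follow it.
 */
#define WT_PUBLISH(v, val) do {                                         \
    WT_WRITE_BARRIER();     /* flush earlier stores first */           \
    (v) = (val);            /* then make the value visible */          \
} while (0)

#define WT_ORDERED_READ(v, val) do {                                    \
    (v) = (val);            /* read the value */                        \
    WT_READ_BARRIER();      /* then order the loads that follow */      \
} while (0)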
/*
 * __rec_page_dirty_update --
 *     Update a dirty page's reference on eviction.
 */
static int
__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_ADDR *addr;
    WT_PAGE_MODIFY *mod;
    WT_REF *parent_ref;

    mod = page->modify;
    parent_ref = page->ref;

    switch (F_ISSET(mod, WT_PM_REC_MASK)) {
    case WT_PM_REC_REPLACE:                     /* 1-for-1 page swap */
        if (parent_ref->addr != NULL &&
            __wt_off_page(page->parent, parent_ref->addr)) {
            __wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
            __wt_free(session, parent_ref->addr);
        }

        /*
         * Update the parent to reference the replacement page.
         *
         * Publish: a barrier to ensure the structure fields are set
         * before the state change makes the page available to readers.
         */
        WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
        *addr = mod->u.replace;
        mod->u.replace.addr = NULL;
        mod->u.replace.size = 0;

        parent_ref->page = NULL;
        parent_ref->addr = addr;
        WT_PUBLISH(parent_ref->state, WT_REF_DISK);
        break;
    case WT_PM_REC_SPLIT:                       /* Page split */
        /*
         * Update the parent to reference new internal page(s).
         *
         * Publish: a barrier to ensure the structure fields are set
         * before the state change makes the page available to readers.
         */
        parent_ref->page = mod->u.split;
        WT_PUBLISH(parent_ref->state, WT_REF_MEM);

        /* Clear the reference else discarding the page will free it. */
        mod->u.split = NULL;
        F_CLR(mod, WT_PM_REC_SPLIT);
        break;
    case WT_PM_REC_EMPTY:                       /* Page is empty */
        /* We checked if the page was empty when we reviewed it. */
        /* FALLTHROUGH */
    WT_ILLEGAL_VALUE(session);
    }

    return (0);
}
/*
 * __spin_lock_next_id --
 *     Return the next spinlock caller ID.
 */
static int
__spin_lock_next_id(WT_SESSION_IMPL *session, int *idp)
{
    static int lock_id = 0, next_id = 0;
    WT_DECL_RET;

    /* If we've ever registered this location, we already have an ID. */
    if (*idp != WT_SPINLOCK_REGISTER)
        return (0);

    /*
     * We can't use the global spinlock to lock the ID allocation (duh!),
     * use a CAS instruction to serialize access to a local variable.
     * This work only gets done once per library instantiation, so there
     * isn't a performance concern.
     */
    while (!WT_ATOMIC_CAS(lock_id, 0, 1))
        __wt_yield();

    /* Allocate a blocking ID for this location. */
    if (*idp == WT_SPINLOCK_REGISTER) {
        if (next_id < WT_SPINLOCK_MAX_LOCATION_ID)
            *idp = next_id++;
        else
            WT_ERR_MSG(session, ENOMEM,
                "spinlock caller location registry failed, "
                "increase the connection's blocking matrix size");
    }

err:    WT_PUBLISH(lock_id, 0);
    return (ret);
}
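The WT_ATOMIC_CAS spin above is a tiny test-and-set lock guarding a one-time registration step. A minimal, self-contained sketch of the same pattern using standard GCC/Clang builtins follows; the names are illustrative, not WiredTiger's.

/* Editor's sketch: serialize a rarely executed registration step. */
static int registry_lock = 0;           /* 0 = unlocked, 1 = locked */

static void
registry_enter(void)
{
    /* Spin until we atomically change the lock word from 0 to 1. */
    while (!__sync_bool_compare_and_swap(&registry_lock, 0, 1))
        ;                               /* or yield the processor */
}

static void
registry_leave(void)
{
    /* A full barrier so the protected work is visible, then unlock. */
    __sync_synchronize();
    registry_lock = 0;
}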
/*
 * __wt_delete_page_skip --
 *     If iterating a cursor, skip deleted pages that are visible to us.
 */
bool
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
{
    bool skip;

    /*
     * Deleted pages come from two sources: either it's a fast-delete as
     * described above, or the page has been emptied by other operations
     * and eviction deleted it.
     *
     * In both cases, the WT_REF state will be WT_REF_DELETED. In the case
     * of a fast-delete page, there will be a WT_PAGE_DELETED structure with
     * the transaction ID of the transaction that deleted the page, and the
     * page is visible if that transaction ID is visible. In the case of an
     * empty page, there will be no WT_PAGE_DELETED structure and the delete
     * is by definition visible, eviction could not have deleted the page if
     * there were changes on it that were not globally visible.
     *
     * We're here because we found a WT_REF state set to WT_REF_DELETED. It
     * is possible the page is being read into memory right now, though, and
     * the page could switch to an in-memory state at any time. Lock down
     * the structure, just to be safe.
     */
    if (ref->page_del == NULL)
        return (true);

    if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
        return (false);

    skip = (ref->page_del == NULL ||
        __wt_txn_visible(session, ref->page_del->txnid));

    WT_PUBLISH(ref->state, WT_REF_DELETED);
    return (skip);
}
/*
 * __wt_txn_release --
 *     Release the resources associated with the current transaction.
 */
void
__wt_txn_release(WT_SESSION_IMPL *session)
{
    WT_TXN *txn;
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_STATE *txn_state;

    txn = &session->txn;
    txn->mod_count = 0;
    txn->notify = NULL;

    txn_global = &S2C(session)->txn_global;
    txn_state = &txn_global->states[session->id];

    /* Clear the transaction's ID from the global table. */
    WT_ASSERT(session,
        txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE);
    WT_PUBLISH(txn_state->id, WT_TXN_NONE);
    txn->id = WT_TXN_NONE;

    /*
     * Reset the transaction state to not running.
     *
     * Auto-commit transactions (identified by having active cursors)
     * handle this at a higher level.
     */
    if (session->ncursors == 0)
        __wt_txn_release_snapshot(session);
    txn->isolation = session->isolation;
    F_CLR(txn, TXN_ERROR | TXN_OLDEST | TXN_RUNNING);
}
/*
 * __wt_log_slot_activate --
 *     Initialize a slot to become active.
 */
void
__wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;

    conn = S2C(session);
    log = conn->log;

    /*
     * !!! slot_release_lsn must be set outside this function because
     * this function may be called after a log file switch and the
     * slot_release_lsn must refer to the end of the previous log.
     * !!! We cannot initialize flags here because it may already be
     * set for closing the file handle on a log file switch. The flags
     * are reset when the slot is freed. See log_slot_free.
     */
    slot->slot_unbuffered = 0;
    slot->slot_start_lsn = slot->slot_end_lsn = log->alloc_lsn;
    slot->slot_start_offset = log->alloc_lsn.l.offset;
    slot->slot_last_offset = log->alloc_lsn.l.offset;
    slot->slot_fh = log->log_fh;
    slot->slot_error = 0;
    WT_DIAGNOSTIC_YIELD;

    /*
     * Set the slot state last. Other threads may have a stale pointer
     * to this slot and could try to alter the state and other fields once
     * they see the state cleared.
     */
    WT_PUBLISH(slot->slot_state, 0);
}
/* * __wt_posix_file_fallocate -- * POSIX fallocate. */ int __wt_posix_file_fallocate(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t offset, wt_off_t len) { /* * The first fallocate call: figure out what fallocate call this system * supports, if any. * * The function is configured as a locking fallocate call, so we know * we're single-threaded through here. Set the nolock function first, * then publish the NULL replacement to ensure the handle functions are * always correct. * * We've seen Linux systems where posix_fallocate has corrupted * existing file data (even though that is explicitly disallowed * by POSIX). FreeBSD and Solaris support posix_fallocate, and * so far we've seen no problems leaving it unlocked. Check for * fallocate (and the system call version of fallocate) first to * avoid locking on Linux if at all possible. */ if (__posix_std_fallocate(file_handle, wt_session, offset, len) == 0) { file_handle->fh_allocate_nolock = __posix_std_fallocate; WT_PUBLISH(file_handle->fh_allocate, NULL); return (0); } if (__posix_sys_fallocate(file_handle, wt_session, offset, len) == 0) { file_handle->fh_allocate_nolock = __posix_sys_fallocate; WT_PUBLISH(file_handle->fh_allocate, NULL); return (0); } if (__posix_posix_fallocate( file_handle, wt_session, offset, len) == 0) { #if defined(__linux__) file_handle->fh_allocate = __posix_posix_fallocate; WT_WRITE_BARRIER(); #else file_handle->fh_allocate_nolock = __posix_posix_fallocate; WT_PUBLISH(file_handle->fh_allocate, NULL); #endif return (0); } file_handle->fh_allocate = NULL; WT_WRITE_BARRIER(); return (ENOTSUP); }
/*
 * __rec_page_clean_update --
 *     Update a clean page's reference on eviction.
 */
static void
__rec_page_clean_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    /* Update the relevant WT_REF structure. */
    page->ref->page = NULL;
    WT_PUBLISH(page->ref->state, WT_REF_DISK);

    WT_UNUSED(session);
}
/* * __wt_txn_release -- * Release the resources associated with the current transaction. */ void __wt_txn_release(WT_SESSION_IMPL *session) { WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; txn = &session->txn; txn_global = &S2C(session)->txn_global; txn_state = WT_SESSION_TXN_STATE(session); WT_ASSERT(session, txn->mod_count == 0); txn->notify = NULL; /* Clear the transaction's ID from the global table. */ if (WT_SESSION_IS_CHECKPOINT(session)) { WT_ASSERT(session, txn_state->id == WT_TXN_NONE); txn->id = txn_global->checkpoint_state.id = txn_global->checkpoint_state.pinned_id = WT_TXN_NONE; /* * Be extra careful to cleanup everything for checkpoints: once * the global checkpoint ID is cleared, we can no longer tell * if this session is doing a checkpoint. */ txn_global->checkpoint_id = 0; } else if (F_ISSET(txn, WT_TXN_HAS_ID)) { WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running)); WT_ASSERT(session, txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE); WT_PUBLISH(txn_state->id, WT_TXN_NONE); txn->id = WT_TXN_NONE; } __wt_txn_clear_commit_timestamp(session); __wt_txn_clear_read_timestamp(session); /* Free the scratch buffer allocated for logging. */ __wt_logrec_free(session, &txn->logrec); /* Discard any memory from the session's stash that we can. */ WT_ASSERT(session, __wt_session_gen(session, WT_GEN_SPLIT) == 0); __wt_stash_discard(session); /* * Reset the transaction state to not running and release the snapshot. */ __wt_txn_release_snapshot(session); txn->isolation = session->isolation; /* Ensure the transaction flags are cleared on exit */ txn->flags = 0; }
/* * __wt_bt_cache_op -- * Cache operations: compaction, discard, sync/checkpoint. */ int __wt_bt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op) { WT_DECL_RET; WT_BTREE *btree; btree = session->btree; /* * Compaction and sync/checkpoint reconcile dirty pages from the cache * to the backing block manager. Reconciliation is just another reader * of the page, so with some care, it can be done in the current thread, * leaving the eviction thread to keep freeing spaces if the cache is * full. Sync and eviction cannot operate on the same page at the same * time, and there are different modes inside __wt_tree_walk to make * sure they don't trip over each other. * * The current thread cannot evict pages from the cache, so discard is * done by calling the eviction server for service. * * XXX * Set the checkpoint reference for reconciliation -- this is ugly, but * there's no data structure path from here to reconciliation. * * Publish: there must be a barrier to ensure the structure fields are * set before the eviction thread can see the request. */ WT_PUBLISH(btree->ckpt, ckptbase); switch (op) { case WT_SYNC_CHECKPOINT: case WT_SYNC_COMPACT: case WT_SYNC_WRITE_LEAVES: WT_ERR(__wt_sync_file(session, op)); break; case WT_SYNC_DISCARD: case WT_SYNC_DISCARD_NOWRITE: /* * Schedule and wake the eviction server, then wait for the * eviction server to wake us. */ WT_ERR(__wt_sync_file_serial(session, op)); WT_ERR(__wt_evict_server_wake(session)); WT_ERR(__wt_cond_wait(session, session->cond, 0)); ret = session->syncop_ret; /* If discarding the tree, the root page should be gone. */ WT_ASSERT(session, ret != 0 || btree->root_page == NULL); break; WT_ILLEGAL_VALUE_ERR(session); } err: btree->ckpt = NULL; return (ret); }
/* * __wt_delete_page_skip -- * If iterating a cursor, skip deleted pages that are either visible to * us or globally visible. */ bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) { bool skip; /* * Deleted pages come from two sources: either it's a fast-delete as * described above, or the page has been emptied by other operations * and eviction deleted it. * * In both cases, the WT_REF state will be WT_REF_DELETED. In the case * of a fast-delete page, there will be a WT_PAGE_DELETED structure with * the transaction ID of the transaction that deleted the page, and the * page is visible if that transaction ID is visible. In the case of an * empty page, there will be no WT_PAGE_DELETED structure and the delete * is by definition visible, eviction could not have deleted the page if * there were changes on it that were not globally visible. * * We're here because we found a WT_REF state set to WT_REF_DELETED. It * is possible the page is being read into memory right now, though, and * the page could switch to an in-memory state at any time. Lock down * the structure, just to be safe. */ if (ref->page_del == NULL) return (true); if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED)) return (false); skip = ref->page_del == NULL || (visible_all ? __wt_txn_visible_all(session, ref->page_del->txnid, WT_TIMESTAMP_NULL(&ref->page_del->timestamp)): __wt_txn_visible(session, ref->page_del->txnid, WT_TIMESTAMP_NULL(&ref->page_del->timestamp))); /* * The page_del structure can be freed as soon as the delete is stable: * it is only read when the ref state is WT_REF_DELETED. It is worth * checking every time we come through because once this is freed, we * no longer need synchronization to check the ref. */ if (skip && ref->page_del != NULL && (visible_all || __wt_txn_visible_all(session, ref->page_del->txnid, WT_TIMESTAMP_NULL(&ref->page_del->timestamp)))) { __wt_free(session, ref->page_del->update_list); __wt_free(session, ref->page_del); } WT_PUBLISH(ref->state, WT_REF_DELETED); return (skip); }
/*
 * __merge_unlock --
 *     Unlock all pages under an internal page being merged.
 */
static void
__merge_unlock(WT_PAGE *page)
{
    WT_REF *ref;
    uint32_t i;

    WT_REF_FOREACH(page, ref, i)
        if (ref->state == WT_REF_LOCKED) {
            if (ref->page->type == WT_PAGE_ROW_INT ||
                ref->page->type == WT_PAGE_COL_INT)
                __merge_unlock(ref->page);
            WT_PUBLISH(ref->state, WT_REF_MEM);
        }
}
/*
 * __rec_page_clean_update --
 *     Update a clean page's reference on eviction.
 */
static void
__rec_page_clean_update(WT_SESSION_IMPL *session, WT_REF *parent_ref)
{
    /*
     * Update the WT_REF structure in the parent. If the page has an
     * address, it's a disk page; if it has no address, it must be a
     * deleted page that was re-instantiated (for example, by searching)
     * and never written.
     */
    parent_ref->page = NULL;
    WT_PUBLISH(parent_ref->state,
        parent_ref->addr == NULL ? WT_REF_DELETED : WT_REF_DISK);

    WT_UNUSED(session);
}
/* * __wt_txn_release -- * Release the resources associated with the current transaction. */ void __wt_txn_release(WT_SESSION_IMPL *session) { WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; txn = &session->txn; WT_ASSERT(session, txn->mod_count == 0); txn->notify = NULL; txn_global = &S2C(session)->txn_global; txn_state = WT_SESSION_TXN_STATE(session); /* Clear the transaction's ID from the global table. */ if (WT_SESSION_IS_CHECKPOINT(session)) { WT_ASSERT(session, txn_state->id == WT_TXN_NONE); txn->id = WT_TXN_NONE; /* Clear the global checkpoint transaction IDs. */ txn_global->checkpoint_id = 0; txn_global->checkpoint_pinned = WT_TXN_NONE; } else if (F_ISSET(txn, WT_TXN_HAS_ID)) { WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running)); WT_ASSERT(session, txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE); WT_PUBLISH(txn_state->id, WT_TXN_NONE); txn->id = WT_TXN_NONE; } /* Free the scratch buffer allocated for logging. */ __wt_logrec_free(session, &txn->logrec); /* Discard any memory from the session's split stash that we can. */ WT_ASSERT(session, session->split_gen == 0); if (session->split_stash_cnt > 0) __wt_split_stash_discard(session); /* * Reset the transaction state to not running and release the snapshot. */ __wt_txn_release_snapshot(session); txn->isolation = session->isolation; /* Ensure the transaction flags are cleared on exit */ txn->flags = 0; }
/*
 * __wt_hazard_clear --
 *     Clear a hazard pointer.
 */
int
__wt_hazard_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_BTREE *btree;
    WT_HAZARD *hp;

    btree = S2BT(session);

    /* If a file can never be evicted, hazard pointers aren't required. */
    if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
        return (0);

    /*
     * Clear the caller's hazard pointer.
     * The common pattern is LIFO, so do a reverse search.
     */
    for (hp = session->hazard + session->hazard_size - 1;
        hp >= session->hazard; --hp)
        if (hp->page == page) {
            /*
             * We don't publish the hazard pointer clear in the
             * general case. It's not required for correctness; it
             * would only give an eviction thread faster access to
             * the page if the page were selected for eviction, and
             * because the page's generation number was just set,
             * it's unlikely the page will be selected for eviction.
             */
            hp->page = NULL;

            /*
             * If this was the last hazard pointer in the session,
             * reset the size so that checks can skip this session.
             */
            if (--session->nhazard == 0)
                WT_PUBLISH(session->hazard_size, 0);
            return (0);
        }

    /*
     * A serious error, we should always find the hazard pointer. Panic,
     * because using a page we didn't have pinned down implies corruption.
     */
    WT_PANIC_RET(session, EINVAL,
        "session %p: clear hazard pointer: %p: not found",
        (void *)session, (void *)page);
}
/*
 * __rec_page_clean_update --
 *     Update a clean page's reference on eviction.
 */
static void
__rec_page_clean_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_REF *ref;

    ref = page->ref;

    /*
     * Update the page's WT_REF structure. If the page has an address, it's
     * a disk page; if it has no address, it must be a deleted page that was
     * re-instantiated (for example, by searching) and never written.
     */
    ref->page = NULL;
    WT_PUBLISH(ref->state,
        ref->addr == NULL ? WT_REF_DELETED : WT_REF_DISK);

    WT_UNUSED(session);
}
/*
 * __wt_update_serial_func --
 *     Server function to add a WT_UPDATE entry to the page's update array.
 */
int
__wt_update_serial_func(WT_SESSION_IMPL *session, void *args)
{
    WT_PAGE *page;
    WT_UPDATE **new_upd, *upd, **upd_entry, **upd_obsolete;
    uint32_t write_gen;

    __wt_update_unpack(
        args, &page, &write_gen, &upd_entry, &new_upd, &upd, &upd_obsolete);

    /* Check the page's write-generation. */
    WT_RET(__wt_page_write_gen_check(session, page, write_gen));

    upd->next = *upd_entry;
    /*
     * Publish: there must be a barrier to ensure the new entry's next
     * pointer is set before we update the linked list.
     */
    WT_PUBLISH(*upd_entry, upd);
    __wt_update_upd_taken(session, args, page);

    /*
     * If the page needs an update array (column-store pages and inserts on
     * row-store pages do not use the update array), our caller passed us
     * one of the correct size. Check the page still needs one (the write
     * generation test should have caught that, though).
     *
     * NOTE: it is important to do this after publishing that the update is
     * set. Code can assume that if the array is set, it is non-empty.
     */
    if (new_upd != NULL && page->u.row.upd == NULL) {
        page->u.row.upd = new_upd;
        __wt_update_new_upd_taken(session, args, page);
    }

    /* Discard obsolete WT_UPDATE structures. */
    *upd_obsolete = __wt_update_obsolete_check(session, upd->next);

    __wt_page_and_tree_modify_set(session, page);

    return (0);
}
/*
 * __wt_insert_serial_func --
 *     Server function to add a WT_INSERT entry to the page.
 */
int
__wt_insert_serial_func(WT_SESSION_IMPL *session, void *args)
{
    WT_INSERT *new_ins, ***ins_stack;
    WT_INSERT_HEAD *inshead, **insheadp, **new_inslist, *new_inshead;
    WT_PAGE *page;
    uint32_t write_gen;
    u_int i, skipdepth;

    __wt_insert_unpack(args, &page, &write_gen, &insheadp,
        &ins_stack, &new_inslist, &new_inshead, &new_ins, &skipdepth);

    /* Check the page's write-generation. */
    WT_RET(__wt_page_write_gen_check(session, page, write_gen));

    /*
     * Publish: First, point the new WT_INSERT item's skiplist references
     * to the next elements in the insert list, then flush memory. Second,
     * update the skiplist elements that reference the new WT_INSERT item,
     * this ensures the list is never inconsistent.
     */
    if ((inshead = *insheadp) == NULL)
        inshead = new_inshead;
    for (i = 0; i < skipdepth; i++)
        new_ins->next[i] = *ins_stack[i];
    WT_WRITE_BARRIER();
    for (i = 0; i < skipdepth; i++) {
        if (inshead->tail[i] == NULL ||
            ins_stack[i] == &inshead->tail[i]->next[i])
            inshead->tail[i] = new_ins;
        *ins_stack[i] = new_ins;
    }
    __wt_insert_new_ins_taken(session, args, page);

    /*
     * If the insert head does not yet have an insert list, our caller
     * passed us one.
     *
     * NOTE: it is important to do this after the item has been added to
     * the list. Code can assume that if the list is set, it is non-empty.
     */
    if (*insheadp == NULL) {
        WT_PUBLISH(*insheadp, new_inshead);
        __wt_insert_new_inshead_taken(session, args, page);
    }

    /*
     * If the page does not yet have an insert array, our caller passed
     * us one.
     *
     * NOTE: it is important to do this after publishing the list entry.
     * Code can assume that if the array is set, it is non-empty.
     */
    if (page->type == WT_PAGE_ROW_LEAF) {
        if (page->u.row.ins == NULL) {
            page->u.row.ins = new_inslist;
            __wt_insert_new_inslist_taken(session, args, page);
        }
    } else if (page->modify->update == NULL) {
        page->modify->update = new_inslist;
        __wt_insert_new_inslist_taken(session, args, page);
    }

    __wt_page_and_tree_modify_set(session, page);

    return (0);
}
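The comment above describes a two-step publication: fill in the new item's forward pointers, flush memory, then link the item in. Below is a minimal single-level sketch of the same idea, assuming a single writer serialized by the server function (as in the code above); the struct and names are invented for illustration.

/* Editor's sketch: two-step publication of a new list element. */
struct elem {
    struct elem *next;
    int value;
};

static void
elem_insert_after(struct elem *prev, struct elem *new_elem)
{
    /* Step 1: the new element points at its successor... */
    new_elem->next = prev->next;

    /* ...and that store is flushed before the element becomes reachable. */
    WT_WRITE_BARRIER();

    /* Step 2: link it in; concurrent readers always see a consistent list. */
    prev->next = new_elem;
}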
/* * __wt_lsm_checkpoint_chunk -- * Flush a single LSM chunk to disk. */ int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) { WT_DECL_RET; WT_TXN_ISOLATION saved_isolation; bool flush_set; flush_set = false; /* * If the chunk is already checkpointed, make sure it is also evicted. * Either way, there is no point trying to checkpoint it again. */ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) { WT_WITH_HANDLE_LIST_LOCK(session, ret = __lsm_discard_handle(session, chunk->uri, NULL)); if (ret == 0) chunk->evicted = 1; else if (ret == EBUSY) ret = 0; else WT_RET_MSG(session, ret, "discard handle"); } if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker %s already on disk", chunk->uri)); return (0); } /* Stop if a running transaction needs the chunk. */ __wt_txn_update_oldest(session, true); if (chunk->switch_txn == WT_TXN_NONE || !__wt_txn_visible_all(session, chunk->switch_txn)) { WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker %s: running transaction, return", chunk->uri)); return (0); } if (!__wt_atomic_cas8(&chunk->flushing, 0, 1)) return (0); flush_set = true; WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s", chunk->uri)); /* * Flush the file before checkpointing: this is the expensive part in * terms of I/O. * * !!! * We can wait here for checkpoints and fsyncs to complete, which can * take a long time. */ if ((ret = __wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)) == 0) { /* * Set read-uncommitted: we have already checked that all of the * updates in this chunk are globally visible, use the cheapest * possible check in reconciliation. */ saved_isolation = session->txn.isolation; session->txn.isolation = WT_ISO_READ_UNCOMMITTED; ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; WT_TRET(__wt_session_release_btree(session)); } WT_ERR(ret); WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s", chunk->uri)); /* * Turn on metadata tracking to ensure the checkpoint gets the * necessary handle locks. * * Ensure that we don't race with a running checkpoint: the checkpoint * lock protects against us racing with an application checkpoint in * this chunk. Don't wait for it, though: checkpoints can take a long * time, and our checkpoint operation should be very quick. */ WT_ERR(__wt_meta_track_on(session)); WT_WITH_CHECKPOINT_LOCK(session, ret, WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_schema_worker( session, chunk->uri, __wt_checkpoint, NULL, NULL, 0))); WT_TRET(__wt_meta_track_off(session, false, ret != 0)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM checkpoint"); /* Now the file is written, get the chunk size. */ WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); /* Update the flush timestamp to help track ongoing progress. */ WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts)); ++lsm_tree->chunks_flushed; /* Lock the tree, mark the chunk as on disk and update the metadata. */ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); F_SET(chunk, WT_LSM_CHUNK_ONDISK); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; /* Update the throttle time. 
*/ __wt_lsm_tree_throttle(session, lsm_tree, true); WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM metadata write"); WT_PUBLISH(chunk->flushing, 0); flush_set = false; /* * Clear the no-eviction flag so the primary can be evicted and * eventually closed. Only do this once the checkpoint has succeeded: * otherwise, accessing the leaf page during the checkpoint can trigger * forced eviction. */ WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0)); __wt_btree_evictable(session, true); WT_ERR(__wt_session_release_btree(session)); /* Make sure we aren't pinning a transaction ID. */ __wt_txn_release_snapshot(session); WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s", chunk->uri)); /* Schedule a bloom filter create for our newly flushed chunk. */ if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF)) WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_BLOOM, 0, lsm_tree)); else WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_MERGE, 0, lsm_tree)); err: if (flush_set) WT_PUBLISH(chunk->flushing, 0); return (ret); }
/* * __wt_delete_page -- * If deleting a range, try to delete the page without instantiating it. */ int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { WT_DECL_RET; WT_PAGE *parent; *skipp = false; /* If we have a clean page in memory, attempt to evict it. */ if (ref->state == WT_REF_MEM && __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) { if (__wt_page_is_modified(ref->page)) { WT_PUBLISH(ref->state, WT_REF_MEM); return (0); } (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1); ret = __wt_evict_page(session, ref); (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1); WT_RET_BUSY_OK(ret); } /* * Atomically switch the page's state to lock it. If the page is not * on-disk, other threads may be using it, no fast delete. * * Possible optimization: if the page is already deleted and the delete * is visible to us (the delete has been committed), we could skip the * page instead of instantiating it and figuring out there are no rows * in the page. While that's a huge amount of work to no purpose, it's * unclear optimizing for overlapping range deletes is worth the effort. */ if (ref->state != WT_REF_DISK || !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED)) return (0); /* * We cannot fast-delete pages that have overflow key/value items as * the overflow blocks have to be discarded. The way we figure that * out is to check the on-page cell type for the page, cells for leaf * pages that have no overflow items are special. * * In some cases, the reference address may not reference an on-page * cell (for example, some combination of page splits), in which case * we can't check the original cell value and we fail. * * To look at an on-page cell, we need to look at the parent page, and * that's dangerous, our parent page could change without warning if * the parent page were to split, deepening the tree. It's safe: the * page's reference will always point to some valid page, and if we find * any problems we simply fail the fast-delete optimization. * * !!! * I doubt it's worth the effort, but we could copy the cell's type into * the reference structure, and then we wouldn't need an on-page cell. */ parent = ref->home; if (__wt_off_page(parent, ref->addr) || __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO) goto err; /* * This action dirties the parent page: mark it dirty now, there's no * future reconciliation of the child leaf page that will dirty it as * we write the tree. */ WT_ERR(__wt_page_parent_modify_set(session, ref, false)); /* * Record the change in the transaction structure and set the change's * transaction ID. */ WT_ERR(__wt_calloc_one(session, &ref->page_del)); ref->page_del->txnid = session->txn.id; WT_ERR(__wt_txn_modify_ref(session, ref)); *skipp = true; WT_PUBLISH(ref->state, WT_REF_DELETED); return (0); err: __wt_free(session, ref->page_del); /* * Restore the page to on-disk status, we'll have to instantiate it. */ WT_PUBLISH(ref->state, WT_REF_DISK); return (ret); }
/* * __wt_cache_read -- * Read a page from the file. */ int __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) { WT_DECL_RET; WT_ITEM tmp; WT_PAGE *page; WT_PAGE_STATE previous_state; size_t addr_size; const uint8_t *addr; page = NULL; /* * Don't pass an allocated buffer to the underlying block read function, * force allocation of new memory of the appropriate size. */ WT_CLEAR(tmp); /* * Attempt to set the state to WT_REF_READING for normal reads, or * WT_REF_LOCKED, for deleted pages. If successful, we've won the * race, read the page. */ if (WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_READING)) previous_state = WT_REF_DISK; else if (WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED)) previous_state = WT_REF_DELETED; else return (0); /* * Get the address: if there is no address, the page was deleted, but a * subsequent search or insert is forcing re-creation of the name space. * Otherwise, there's an address, read the backing disk page and build * an in-memory version of the page. */ WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); if (addr == NULL) { WT_ASSERT(session, previous_state == WT_REF_DELETED); WT_ERR(__wt_btree_new_leaf_page(session, &page)); ref->page = page; } else { /* Read the backing disk page. */ WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size)); /* Build the in-memory version of the page. */ WT_ERR(__wt_page_inmem(session, ref, tmp.data, WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); /* If the page was deleted, instantiate that information. */ if (previous_state == WT_REF_DELETED) WT_ERR(__wt_delete_page_instantiate(session, ref)); } WT_ERR(__wt_verbose(session, WT_VERB_READ, "page %p: %s", page, __wt_page_type_string(page->type))); WT_PUBLISH(ref->state, WT_REF_MEM); return (0); err: /* * If the function building an in-memory version of the page failed, * it discarded the page, but not the disk image. Discard the page * and separately discard the disk image in all cases. */ if (ref->page != NULL) __wt_ref_out(session, ref); WT_PUBLISH(ref->state, previous_state); __wt_buf_free(session, &tmp); return (ret); }
/* * __sync_file -- * Flush pages for a specific file. */ static int __sync_file(WT_SESSION_IMPL *session, int syncop) { struct timespec end, start; WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, leaf_bytes; uint64_t internal_pages, leaf_pages; uint32_t flags; bool evict_reset; btree = S2BT(session); flags = WT_READ_CACHE | WT_READ_NO_GEN; walk = NULL; txn = &session->txn; internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) WT_RET(__wt_epoch(session, &start)); switch (syncop) { case WT_SYNC_WRITE_LEAVES: /* * Write all immediately available, dirty in-cache leaf pages. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. */ if (!btree->modified) return (0); __wt_spin_lock(session, &btree->flush_lock); if (!btree->modified) { __wt_spin_unlock(session, &btree->flush_lock); return (0); } flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, NULL, flags)); if (walk == NULL) break; /* * Write dirty pages if nobody beat us to it. Don't * try to write the hottest pages: checkpoint will have * to visit them anyway. */ page = walk->page; if (__wt_page_is_modified(page) && __wt_txn_visible_all( session, page->modify->update_txn)) { if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } } break; case WT_SYNC_CHECKPOINT: /* * We cannot check the tree modified flag in the case of a * checkpoint, the checkpoint code has already cleared it. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. We're holding the schema lock, but need the * lower-level lock as well. */ __wt_spin_lock(session, &btree->flush_lock); /* * When internal pages are being reconciled by checkpoint their * child pages cannot disappear from underneath them or be split * into them, nor can underlying blocks be freed until the block * lists for the checkpoint are stable. Set the checkpointing * flag to block eviction of dirty pages until the checkpoint's * internal page pass is complete, then wait for any existing * eviction to complete. */ btree->checkpointing = 1; WT_FULL_BARRIER(); WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); if (evict_reset) __wt_evict_file_exclusive_off(session); /* Write all dirty in-cache pages. */ flags |= WT_READ_NO_EVICT; for (walk = NULL;;) { /* * If we have a page, and it was ever modified, track * the highest transaction ID in the tree. We do this * here because we want the value after reconciling * dirty pages. */ if (walk != NULL && walk->page != NULL && (mod = walk->page->modify) != NULL && WT_TXNID_LT(btree->rec_max_txn, mod->rec_max_txn)) btree->rec_max_txn = mod->rec_max_txn; WT_ERR(__wt_tree_walk(session, &walk, NULL, flags)); if (walk == NULL) break; page = walk->page; mod = page->modify; /* Skip clean pages. */ if (!__wt_page_is_modified(page)) continue; /* * Write dirty pages, unless we can be sure they only * became dirty after the checkpoint started. 
* * We can skip dirty pages if: * (1) they are leaf pages; * (2) there is a snapshot transaction active (which * is the case in ordinary application checkpoints * but not all internal cases); and * (3) the first dirty update on the page is * sufficiently recent that the checkpoint * transaction would skip them. * * Mark the tree dirty: the checkpoint marked it clean * and we can't skip future checkpoints until this page * is written. */ if (!WT_PAGE_IS_INTERNAL(page) && F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn) && mod->rec_result != WT_PM_REC_REWRITE) { __wt_page_modify_set(session, page); continue; } if (WT_PAGE_IS_INTERNAL(page)) { internal_bytes += page->memory_footprint; ++internal_pages; } else { leaf_bytes += page->memory_footprint; ++leaf_pages; } WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } break; } if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { WT_ERR(__wt_epoch(session, &end)); WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT, "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64 " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64 " bytes, %" PRIu64 " pages of internal\n\t" "Took: %" PRIu64 "ms", syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_bytes, leaf_pages, internal_bytes, internal_pages, WT_TIMEDIFF(end, start) / WT_MILLION)); } err: /* On error, clear any left-over tree walk. */ if (walk != NULL) WT_TRET(__wt_page_release(session, walk, flags)); if (txn->isolation == WT_ISO_READ_COMMITTED && session->ncursors == 0) __wt_txn_release_snapshot(session); if (btree->checkpointing) { /* * Update the checkpoint generation for this handle so visible * updates newer than the checkpoint can be evicted. * * This has to be published before eviction is enabled again, * so that eviction knows that the checkpoint has completed. */ WT_PUBLISH(btree->checkpoint_gen, S2C(session)->txn_global.checkpoint_gen); WT_STAT_FAST_DATA_SET(session, btree_checkpoint_generation, btree->checkpoint_gen); /* * Clear the checkpoint flag and push the change; not required, * but publishing the change means stalled eviction gets moving * as soon as possible. */ btree->checkpointing = 0; WT_FULL_BARRIER(); /* * If this tree was being skipped by the eviction server during * the checkpoint, clear the wait. */ btree->evict_walk_period = 0; /* * Wake the eviction server, in case application threads have * stalled while the eviction server decided it couldn't make * progress. Without this, application threads will be stalled * until the eviction server next wakes. */ WT_TRET(__wt_evict_server_wake(session)); } __wt_spin_unlock(session, &btree->flush_lock); /* * Leaves are written before a checkpoint (or as part of a file close, * before checkpointing the file). Start a flush to stable storage, * but don't wait for it. */ if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES) WT_RET(btree->bm->sync(btree->bm, session, true)); return (ret); }
/* * __wt_configure_method -- * WT_CONNECTION.configure_method. */ int __wt_configure_method(WT_SESSION_IMPL *session, const char *method, const char *uri, const char *config, const char *type, const char *check) { const WT_CONFIG_CHECK *cp; WT_CONFIG_CHECK *checks, *newcheck; const WT_CONFIG_ENTRY **epp; WT_CONFIG_ENTRY *entry; WT_CONNECTION_IMPL *conn; WT_DECL_RET; size_t cnt; char *newcheck_name, *p; /* * !!! * We ignore the specified uri, that is, all new configuration options * will be valid for all data sources. That shouldn't be too bad * as the worst that can happen is an application might specify some * configuration option and not get an error -- the option should be * ignored by the underlying implementation since it's unexpected, so * there shouldn't be any real problems. Eventually I expect we will * get the whole data-source thing sorted, at which time there may be * configuration arrays for each data source, and that's when the uri * will matter. */ WT_UNUSED(uri); conn = S2C(session); checks = newcheck = NULL; entry = NULL; newcheck_name = NULL; /* Argument checking; we only support a limited number of types. */ if (config == NULL) WT_RET_MSG(session, EINVAL, "no configuration specified"); if (type == NULL) WT_RET_MSG(session, EINVAL, "no configuration type specified"); if (strcmp(type, "boolean") != 0 && strcmp(type, "int") != 0 && strcmp(type, "list") != 0 && strcmp(type, "string") != 0) WT_RET_MSG(session, EINVAL, "type must be one of \"boolean\", \"int\", \"list\" or " "\"string\""); /* Find a match for the method name. */ for (epp = conn->config_entries; (*epp)->method != NULL; ++epp) if (strcmp((*epp)->method, method) == 0) break; if ((*epp)->method == NULL) WT_RET_MSG(session, WT_NOTFOUND, "no method matching %s found", method); /* * Technically possible for threads to race, lock the connection while * adding the new configuration information. We're holding the lock * for an extended period of time, but configuration changes should be * rare and only happen during startup. */ __wt_spin_lock(session, &conn->api_lock); /* * Allocate new configuration entry and fill it in. * * The new base value is the previous base value, a separator and the * new configuration string. */ WT_ERR(__wt_calloc_one(session, &entry)); entry->method = (*epp)->method; WT_ERR(__wt_calloc_def(session, strlen((*epp)->base) + strlen(",") + strlen(config) + 1, &p)); (void)strcpy(p, (*epp)->base); (void)strcat(p, ","); (void)strcat(p, config); entry->base = p; /* * There may be a default value in the config argument passed in (for * example, (kvs_parallelism=64"). The default value isn't part of the * name, build a new one. */ WT_ERR(__wt_strdup(session, config, &newcheck_name)); if ((p = strchr(newcheck_name, '=')) != NULL) *p = '\0'; /* * The new configuration name may replace an existing check with new * information, in that case skip the old version. */ cnt = 0; if ((*epp)->checks != NULL) for (cp = (*epp)->checks; cp->name != NULL; ++cp) ++cnt; WT_ERR(__wt_calloc_def(session, cnt + 2, &checks)); cnt = 0; if ((*epp)->checks != NULL) for (cp = (*epp)->checks; cp->name != NULL; ++cp) if (strcmp(newcheck_name, cp->name) != 0) checks[cnt++] = *cp; newcheck = &checks[cnt]; newcheck->name = newcheck_name; WT_ERR(__wt_strdup(session, type, &newcheck->type)); if (check != NULL) WT_ERR(__wt_strdup(session, check, &newcheck->checks)); entry->checks = checks; /* * Confirm the configuration string passes the new set of * checks. 
*/ WT_ERR(config_check(session, entry->checks, config, 0)); /* * The next time this configuration is updated, we don't want to figure * out which of these pieces of memory were allocated and will need to * be free'd on close (this isn't a heavily used API and it's too much * work); add them all to the free-on-close list now. We don't check * for errors deliberately, we'd have to figure out which elements have * already been added to the free-on-close array and which have not in * order to avoid freeing chunks of memory twice. Again, this isn't a * commonly used API and it shouldn't ever happen, just leak it. */ (void)__conn_foc_add(session, entry->base); (void)__conn_foc_add(session, entry); (void)__conn_foc_add(session, checks); (void)__conn_foc_add(session, newcheck->type); (void)__conn_foc_add(session, newcheck->checks); (void)__conn_foc_add(session, newcheck_name); /* * Instead of using locks to protect configuration information, assume * we can atomically update a pointer to a chunk of memory, and because * a pointer is never partially written, readers will correctly see the * original or new versions of the memory. Readers might be using the * old version as it's being updated, though, which means we cannot free * the old chunk of memory until all possible readers have finished. * Currently, that's on connection close: in other words, we can use * this because it's small amounts of memory, and we really, really do * not want to acquire locks every time we access configuration strings, * since that's done on every API call. */ WT_PUBLISH(*epp, entry); if (0) { err: if (entry != NULL) { __wt_free(session, entry->base); __wt_free(session, entry); } __wt_free(session, checks); if (newcheck != NULL) { __wt_free(session, newcheck->type); __wt_free(session, newcheck->checks); } __wt_free(session, newcheck_name); } __wt_spin_unlock(session, &conn->api_lock); return (ret); }
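The closing comment above describes a copy-then-publish scheme: build a complete replacement entry, swap in a single pointer, and defer freeing the old entry until no reader can still hold it. A hedged sketch of that general pattern follows; the reader and writer helpers are invented for illustration and are not WiredTiger source.

/*
 * Editor's sketch: readers follow a single pointer to an immutable entry;
 * the writer publishes a fully built replacement with one pointer store.
 * The old entry is not freed until no reader can still hold it (here,
 * deferred to connection close).
 */
static const WT_CONFIG_ENTRY *
config_entry_get(const WT_CONFIG_ENTRY **slot)
{
    return (*slot);                     /* a single, atomic pointer load */
}

static void
config_entry_replace(const WT_CONFIG_ENTRY **slot, WT_CONFIG_ENTRY *new_entry)
{
    /* new_entry is completely initialized before this call. */
    WT_PUBLISH(*slot, new_entry);       /* write barrier, then the store */
}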
/* * __sync_file -- * Flush pages for a specific file. */ static int __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) { struct timespec end, start; WT_BTREE *btree; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; uint64_t oldest_id, saved_snap_min; uint32_t flags; conn = S2C(session); btree = S2BT(session); walk = NULL; txn = &session->txn; saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min; flags = WT_READ_CACHE | WT_READ_NO_GEN; internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) WT_RET(__wt_epoch(session, &start)); switch (syncop) { case WT_SYNC_WRITE_LEAVES: /* * Write all immediately available, dirty in-cache leaf pages. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. */ if (!btree->modified) return (0); __wt_spin_lock(session, &btree->flush_lock); if (!btree->modified) { __wt_spin_unlock(session, &btree->flush_lock); return (0); } /* * Save the oldest transaction ID we need to keep around. * Otherwise, in a busy system, we could be updating pages so * fast that write leaves never catches up. We deliberately * have no transaction running at this point that would keep * the oldest ID from moving forwards as we walk the tree. */ oldest_id = __wt_txn_oldest_id(session); flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* * Write dirty pages if nobody beat us to it. Don't * try to write hot pages (defined as pages that have * been updated since the write phase leaves started): * checkpoint will have to visit them anyway. */ page = walk->page; if (__wt_page_is_modified(page) && WT_TXNID_LT(page->modify->update_txn, oldest_id)) { if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } } break; case WT_SYNC_CHECKPOINT: /* * If we are flushing a file at read-committed isolation, which * is of particular interest for flushing the metadata to make * schema-changing operation durable, get a transactional * snapshot now. * * All changes committed up to this point should be included. * We don't update the snapshot in between pages because (a) * the metadata shouldn't be that big, and (b) if we do ever */ if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); /* * We cannot check the tree modified flag in the case of a * checkpoint, the checkpoint code has already cleared it. * * Writing the leaf pages is done without acquiring a high-level * lock, serialize so multiple threads don't walk the tree at * the same time. We're holding the schema lock, but need the * lower-level lock as well. */ __wt_spin_lock(session, &btree->flush_lock); /* * In the final checkpoint pass, child pages cannot be evicted * from underneath internal pages nor can underlying blocks be * freed until the checkpoint's block lists are stable. Also, * we cannot split child pages into parents unless we know the * final pass will write a consistent view of that namespace. * Set the checkpointing flag to block such actions and wait for * any problematic eviction or page splits to complete. 
*/ WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE); WT_ERR(__wt_evict_file_exclusive_on(session)); __wt_evict_file_exclusive_off(session); WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING); /* Write all dirty in-cache pages. */ flags |= WT_READ_NO_EVICT; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; /* Skip clean pages. */ if (!__wt_page_is_modified(walk->page)) continue; /* * Take a local reference to the page modify structure * now that we know the page is dirty. It needs to be * done in this order otherwise the page modify * structure could have been created between taking the * reference and checking modified. */ page = walk->page; mod = page->modify; /* * Write dirty pages, unless we can be sure they only * became dirty after the checkpoint started. * * We can skip dirty pages if: * (1) they are leaf pages; * (2) there is a snapshot transaction active (which * is the case in ordinary application checkpoints * but not all internal cases); and * (3) the first dirty update on the page is * sufficiently recent that the checkpoint * transaction would skip them. * * Mark the tree dirty: the checkpoint marked it clean * and we can't skip future checkpoints until this page * is written. */ if (!WT_PAGE_IS_INTERNAL(page) && F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) { __wt_page_modify_set(session, page); continue; } if (WT_PAGE_IS_INTERNAL(page)) { internal_bytes += page->memory_footprint; ++internal_pages; } else { leaf_bytes += page->memory_footprint; ++leaf_pages; } WT_ERR(__wt_reconcile(session, walk, NULL, 0)); } break; case WT_SYNC_CLOSE: case WT_SYNC_DISCARD: WT_ILLEGAL_VALUE_ERR(session); } if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { WT_ERR(__wt_epoch(session, &end)); WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT, "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64 " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64 " bytes, %" PRIu64 " pages of internal\n\t" "Took: %" PRIu64 "ms", syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_bytes, leaf_pages, internal_bytes, internal_pages, WT_TIMEDIFF_MS(end, start))); } err: /* On error, clear any left-over tree walk. */ if (walk != NULL) WT_TRET(__wt_page_release(session, walk, flags)); /* * If we got a snapshot in order to write pages, and there was no * snapshot active when we started, release it. */ if (txn->isolation == WT_ISO_READ_COMMITTED && saved_snap_min == WT_TXN_NONE) __wt_txn_release_snapshot(session); if (btree->checkpointing != WT_CKPT_OFF) { /* * Update the checkpoint generation for this handle so visible * updates newer than the checkpoint can be evicted. * * This has to be published before eviction is enabled again, * so that eviction knows that the checkpoint has completed. */ WT_PUBLISH(btree->checkpoint_gen, conn->txn_global.checkpoint_gen); WT_STAT_FAST_DATA_SET(session, btree_checkpoint_generation, btree->checkpoint_gen); /* * Clear the checkpoint flag and push the change; not required, * but publishing the change means stalled eviction gets moving * as soon as possible. */ btree->checkpointing = WT_CKPT_OFF; WT_FULL_BARRIER(); /* * If this tree was being skipped by the eviction server during * the checkpoint, clear the wait. */ btree->evict_walk_period = 0; /* * Wake the eviction server, in case application threads have * stalled while the eviction server decided it couldn't make * progress. Without this, application threads will be stalled * until the eviction server next wakes. 
*/ WT_TRET(__wt_evict_server_wake(session)); } __wt_spin_unlock(session, &btree->flush_lock); /* * Leaves are written before a checkpoint (or as part of a file close, * before checkpointing the file). Start a flush to stable storage, * but don't wait for it. */ if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC)) WT_RET(btree->bm->sync(btree->bm, session, true)); return (ret); }
/* * __page_read -- * Read a page from the file. */ static int __page_read(WT_SESSION_IMPL *session, WT_REF *ref) { const WT_PAGE_HEADER *dsk; WT_BTREE *btree; WT_DECL_RET; WT_ITEM tmp; WT_PAGE *page; size_t addr_size; uint32_t previous_state; const uint8_t *addr; btree = S2BT(session); page = NULL; /* * Don't pass an allocated buffer to the underlying block read function, * force allocation of new memory of the appropriate size. */ WT_CLEAR(tmp); /* * Attempt to set the state to WT_REF_READING for normal reads, or * WT_REF_LOCKED, for deleted pages. If successful, we've won the * race, read the page. */ if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING)) previous_state = WT_REF_DISK; else if (__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED)) previous_state = WT_REF_DELETED; else return (0); /* * Get the address: if there is no address, the page was deleted, but a * subsequent search or insert is forcing re-creation of the name space. */ WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); if (addr == NULL) { WT_ASSERT(session, previous_state == WT_REF_DELETED); WT_ERR(__wt_btree_new_leaf_page(session, &page)); ref->page = page; goto done; } /* * There's an address, read or map the backing disk page and build an * in-memory version of the page. */ WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size)); WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize, WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); /* * Clear the local reference to an allocated copy of the disk image on * return; the page steals it, errors in this code should not free it. */ tmp.mem = NULL; /* * If reading for a checkpoint, there's no additional work to do, the * page on disk is correct as written. */ if (session->dhandle->checkpoint != NULL) goto done; /* If the page was deleted, instantiate that information. */ if (previous_state == WT_REF_DELETED) WT_ERR(__wt_delete_page_instantiate(session, ref)); /* * Instantiate updates from the database's lookaside table. The page * flag was set when the page was written, potentially a long time ago. * We only care if the lookaside table is currently active, check that * before doing any work. */ dsk = tmp.data; if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) { WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside); WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside); WT_ERR(__las_page_instantiate( session, ref, btree->id, addr, addr_size)); } done: WT_PUBLISH(ref->state, WT_REF_MEM); return (0); err: /* * If the function building an in-memory version of the page failed, * it discarded the page, but not the disk image. Discard the page * and separately discard the disk image in all cases. */ if (ref->page != NULL) __wt_ref_out(session, ref); WT_PUBLISH(ref->state, previous_state); __wt_buf_free(session, &tmp); return (ret); }
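To make the locking protocol easier to follow, here is a brief editorial summary of the reference-state transitions the read path performs; it is not WiredTiger source.

/*
 * Editor's summary of the WT_REF state transitions performed by the page
 * read above (and by __wt_cache_read earlier in this section):
 *
 *     WT_REF_DISK    --cas--> WT_REF_READING --publish--> WT_REF_MEM
 *     WT_REF_DELETED --cas--> WT_REF_LOCKED  --publish--> WT_REF_MEM
 *
 * On error the previous state is published back, so a failed read leaves
 * the reference exactly as it was found.
 */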
/* * __wt_txn_release -- * Release the resources associated with the current transaction. */ void __wt_txn_release(WT_SESSION_IMPL *session) { WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; int was_oldest; txn = &session->txn; WT_ASSERT(session, txn->mod_count == 0); txn->notify = NULL; txn_global = &S2C(session)->txn_global; txn_state = WT_SESSION_TXN_STATE(session); was_oldest = 0; /* Clear the transaction's ID from the global table. */ if (WT_SESSION_IS_CHECKPOINT(session)) { WT_ASSERT(session, txn_state->id == WT_TXN_NONE); txn->id = WT_TXN_NONE; /* Clear the global checkpoint transaction IDs. */ txn_global->checkpoint_id = 0; txn_global->checkpoint_pinned = WT_TXN_NONE; } else if (F_ISSET(txn, WT_TXN_HAS_ID)) { WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running)); WT_ASSERT(session, txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE); WT_PUBLISH(txn_state->id, WT_TXN_NONE); /* Quick check for the oldest transaction. */ was_oldest = (txn->id == txn_global->last_running); txn->id = WT_TXN_NONE; } /* Free the scratch buffer allocated for logging. */ __wt_logrec_free(session, &txn->logrec); /* Discard any memory from the session's split stash that we can. */ WT_ASSERT(session, session->split_gen == 0); if (session->split_stash_cnt > 0) __wt_split_stash_discard(session); /* * Reset the transaction state to not running and release the snapshot. */ __wt_txn_release_snapshot(session); txn->isolation = session->isolation; /* Ensure the transaction flags are cleared on exit */ txn->flags = 0; /* * When the oldest transaction in the system completes, bump the oldest * ID. This is racy and so not guaranteed, but in practice it keeps * the oldest ID from falling too far behind. */ if (was_oldest) __wt_txn_update_oldest(session, 1); }
/* * __wt_hazard_set -- * Set a hazard pointer. */ int __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_BTREE *btree; WT_CONNECTION_IMPL *conn; WT_HAZARD *hp; int restarts = 0; btree = S2BT(session); conn = S2C(session); *busyp = false; /* If a file can never be evicted, hazard pointers aren't required. */ if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) return (0); /* * Do the dance: * * The memory location which makes a page "real" is the WT_REF's state * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the * page eviction server. * * Add the WT_REF reference to the session's hazard list and flush the * write, then see if the page's state is still valid. If so, we can * use the page because the page eviction server will see our hazard * pointer before it discards the page (the eviction server sets the * state to WT_REF_LOCKED, then flushes memory and checks the hazard * pointers). * * For sessions with many active hazard pointers, skip most of the * active slots: there may be a free slot in there, but checking is * expensive. Most hazard pointers are released quickly: optimize * for that case. */ for (hp = session->hazard + session->nhazard;; ++hp) { /* * If we get to the end of the array, either: * 1. If we know there are free slots somewhere, and this is * the first time through, continue the search from the * start. Don't actually continue the loop because that * will skip the first slot. * 2. If we have searched all the way through and we have * allocated the maximum number of slots, give up. * 3. Allocate another increment of slots, up to the maximum. * The slot we are on should now be available. */ if (hp >= session->hazard + session->hazard_size) { if (session->nhazard < session->hazard_size && restarts++ == 0) hp = session->hazard; else if (session->hazard_size >= conn->hazard_max) break; else WT_PUBLISH(session->hazard_size, WT_MIN( session->hazard_size + WT_HAZARD_INCR, conn->hazard_max)); } if (hp->page != NULL) continue; hp->page = ref->page; #ifdef HAVE_DIAGNOSTIC hp->file = file; hp->line = line; #endif /* Publish the hazard pointer before reading page's state. */ WT_FULL_BARRIER(); /* * Check if the page state is still valid, where valid means a * state of WT_REF_MEM and the pointer is unchanged. (The * pointer can change, it means the page was evicted between * the time we set our hazard pointer and the publication. It * would theoretically be possible for the page to be evicted * and a different page read into the same memory, so the * pointer hasn't changed but the contents have. That's OK, we * found this page using the tree's key space, whatever page we * find here is the page for us to use.) */ if (ref->page == hp->page && ref->state == WT_REF_MEM) { ++session->nhazard; return (0); } /* * The page isn't available, it's being considered for eviction * (or being evicted, for all we know). If the eviction server * sees our hazard pointer before evicting the page, it will * return the page to use, no harm done, if it doesn't, it will * go ahead and complete the eviction. * * We don't bother publishing this update: the worst case is we * prevent some random page from being evicted. */ hp->page = NULL; *busyp = true; return (0); } __wt_errx(session, "session %p: hazard pointer table full", (void *)session); #ifdef HAVE_DIAGNOSTIC __hazard_dump(session); #endif return (ENOMEM); }
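The comment in __wt_hazard_set describes the other half of the dance: eviction publishes WT_REF_LOCKED, flushes memory, then scans every session's hazard array before discarding the page. A hedged sketch of that eviction-side check follows; the field names track the code above, but the function itself is invented for illustration and is not WiredTiger source.

/*
 * Editor's sketch: the eviction-side half of the hazard-pointer protocol.
 * The page's state has already been switched to WT_REF_LOCKED and memory
 * flushed before this scan runs.
 */
static bool
hazard_check(WT_CONNECTION_IMPL *conn, WT_PAGE *page)
{
    WT_HAZARD *hp;
    WT_SESSION_IMPL *s;
    uint32_t i;

    for (i = 0; i < conn->session_cnt; ++i) {
        s = &conn->sessions[i];
        for (hp = s->hazard; hp < s->hazard + s->hazard_size; ++hp)
            if (hp->page == page)
                return (true);          /* still pinned by a reader */
    }
    return (false);                     /* safe to evict */
}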
/* * __wt_delete_page -- * If deleting a range, try to delete the page without instantiating it. */ int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { WT_DECL_RET; WT_PAGE *parent; *skipp = false; /* If we have a clean page in memory, attempt to evict it. */ if (ref->state == WT_REF_MEM && __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) { if (__wt_page_is_modified(ref->page)) { WT_PUBLISH(ref->state, WT_REF_MEM); return (0); } (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1); ret = __wt_evict(session, ref, false); (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1); WT_RET_BUSY_OK(ret); } /* * Atomically switch the page's state to lock it. If the page is not * on-disk, other threads may be using it, no fast delete. */ if (ref->state != WT_REF_DISK || !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED)) return (0); /* * We cannot fast-delete pages that have overflow key/value items as * the overflow blocks have to be discarded. The way we figure that * out is to check the page's cell type, cells for leaf pages without * overflow items are special. * * To look at an on-page cell, we need to look at the parent page, and * that's dangerous, our parent page could change without warning if * the parent page were to split, deepening the tree. It's safe: the * page's reference will always point to some valid page, and if we find * any problems we simply fail the fast-delete optimization. */ parent = ref->home; if (__wt_off_page(parent, ref->addr) ? ((WT_ADDR *)ref->addr)->type != WT_ADDR_LEAF_NO : __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO) goto err; /* * This action dirties the parent page: mark it dirty now, there's no * future reconciliation of the child leaf page that will dirty it as * we write the tree. */ WT_ERR(__wt_page_parent_modify_set(session, ref, false)); /* * Record the change in the transaction structure and set the change's * transaction ID. */ WT_ERR(__wt_calloc_one(session, &ref->page_del)); ref->page_del->txnid = session->txn.id; WT_ERR(__wt_txn_modify_ref(session, ref)); *skipp = true; WT_STAT_CONN_INCR(session, rec_page_delete_fast); WT_STAT_DATA_INCR(session, rec_page_delete_fast); WT_PUBLISH(ref->state, WT_REF_DELETED); return (0); err: __wt_free(session, ref->page_del); /* * Restore the page to on-disk status, we'll have to instantiate it. */ WT_PUBLISH(ref->state, WT_REF_DISK); return (ret); }
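/*
 * Editor's note: the sketch below is not WiredTiger code.  It is a minimal
 * C11 illustration of the lock-by-CAS / publish-by-release pattern that
 * __wt_delete_page applies to ref->state: swing the state word from an
 * expected value to LOCKED, fill in fields while holding the "lock", then
 * store the final state with release semantics so those fields are visible
 * first.  The fdel_* names are hypothetical.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

enum fdel_state { FDEL_DISK, FDEL_LOCKED, FDEL_DELETED };

typedef struct {
	_Atomic uint32_t state;
	uint64_t del_txnid;		/* Written only while FDEL_LOCKED. */
} fdel_ref_t;

/* Try to fast-delete an on-disk ref; false means another thread owns it. */
static bool
fdel_fast_delete(fdel_ref_t *ref, uint64_t txnid)
{
	uint32_t expected = FDEL_DISK;

	/* The moral equivalent of __wt_atomic_casv32(&ref->state, ...). */
	if (!atomic_compare_exchange_strong(&ref->state, &expected, FDEL_LOCKED))
		return (false);

	ref->del_txnid = txnid;		/* Safe: the ref is locked. */

	/* Publish: the field above must be visible before the state change. */
	atomic_store_explicit(&ref->state, FDEL_DELETED, memory_order_release);
	return (true);
}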
/* * __wt_col_append_serial_func -- * Server function to append an WT_INSERT entry to the tree. */ int __wt_col_append_serial_func(WT_SESSION_IMPL *session, void *args) { WT_BTREE *btree; WT_INSERT *ins, *new_ins, ***ins_stack, **next_stack; WT_INSERT_HEAD *inshead, **insheadp, **new_inslist, *new_inshead; WT_PAGE *page; uint64_t recno; uint32_t write_gen; u_int i, skipdepth; btree = S2BT(session); __wt_col_append_unpack(args, &page, &write_gen, &insheadp, &ins_stack, &next_stack, &new_inslist, &new_inshead, &new_ins, &skipdepth); /* Check the page's write-generation. */ WT_RET(__wt_page_write_gen_check(session, page, write_gen)); if ((inshead = *insheadp) == NULL) inshead = new_inshead; /* * If the application specified a record number, there's a race: the * application may have searched for the record, not found it, then * called into the append code, and another thread might have added * the record. Fortunately, we're in the right place because if the * record didn't exist at some point, it can only have been created * on this list. Search for the record, if specified. */ if ((recno = WT_INSERT_RECNO(new_ins)) == 0) recno = WT_INSERT_RECNO(new_ins) = ++btree->last_recno; ins = __col_insert_search(inshead, ins_stack, next_stack, recno); /* If we find the record number, there's been a race. */ if (ins != NULL && WT_INSERT_RECNO(ins) == recno) WT_RET(WT_RESTART); /* * Publish: First, point the new WT_INSERT item's skiplist references * to the next elements in the insert list, then flush memory. Second, * update the skiplist elements that reference the new WT_INSERT item, * this ensures the list is never inconsistent. */ for (i = 0; i < skipdepth; i++) new_ins->next[i] = *ins_stack[i]; WT_WRITE_BARRIER(); for (i = 0; i < skipdepth; i++) { if (inshead->tail[i] == NULL || ins_stack[i] == &inshead->tail[i]->next[i]) inshead->tail[i] = new_ins; *ins_stack[i] = new_ins; } __wt_col_append_new_ins_taken(args); /* * If the insert head does not yet have an insert list, our caller * passed us one. * * NOTE: it is important to do this after the item has been added to * the list. Code can assume that if the list is set, it is non-empty. */ if (*insheadp == NULL) { WT_PUBLISH(*insheadp, new_inshead); __wt_col_append_new_inshead_taken(args); } /* * If the page does not yet have an insert array, our caller passed * us one. * * NOTE: it is important to do this after publishing the list entry. * Code can assume that if the array is set, it is non-empty. */ if (page->modify->append == NULL) { page->modify->append = new_inslist; __wt_col_append_new_inslist_taken(args); } /* * If we don't find the record, check to see if we extended the file, * and update the last record number. */ if (recno > btree->last_recno) btree->last_recno = recno; __wt_page_and_tree_modify_set(session, page); return (0); }
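/*
 * Editor's note: the sketch below is not WiredTiger code.  It is a minimal
 * C11 illustration of the two-step skiplist splice performed above: fill in
 * the new node's forward pointers, issue a release fence (the role
 * WT_WRITE_BARRIER plays), then update the predecessors, so a reader walking
 * the list never reaches a node whose own links are still uninitialized.
 * Writers are assumed to be serialized, as they are in the serial function;
 * the skip_* names and SKIP_MAXDEPTH are hypothetical.
 */
#include <stdatomic.h>

#define SKIP_MAXDEPTH	10

typedef struct skip_node {
	_Atomic(struct skip_node *) next[SKIP_MAXDEPTH];
	/* Key and value storage would follow in a real node. */
} skip_node_t;

/*
 * skip_splice --
 *	ins_stack[i] points at the predecessor's next[i] slot for each level
 * the new node occupies, mirroring the ins_stack above.
 */
static void
skip_splice(skip_node_t *new_node,
    _Atomic(skip_node_t *) **ins_stack, unsigned depth)
{
	unsigned i;

	/* First: point the new node at its successors. */
	for (i = 0; i < depth; i++)
		new_node->next[i] =
		    atomic_load_explicit(ins_stack[i], memory_order_relaxed);

	/* Flush before making the node reachable from the list. */
	atomic_thread_fence(memory_order_release);

	/* Second: link the new node in at every level. */
	for (i = 0; i < depth; i++)
		atomic_store_explicit(ins_stack[i], new_node,
		    memory_order_relaxed);
}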
/* * __rec_page_dirty_update -- * Update a dirty page's reference on eviction. */ static int __rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_ADDR *addr; WT_PAGE_MODIFY *mod; WT_REF *parent_ref; mod = page->modify; parent_ref = page->ref; switch (F_ISSET(mod, WT_PM_REC_MASK)) { case WT_PM_REC_EMPTY: /* Page is empty */ if (parent_ref->addr != NULL && __wt_off_page(page->parent, parent_ref->addr)) { __wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr); __wt_free(session, parent_ref->addr); } /* * Update the parent to reference an empty page. * * Set the transaction ID to WT_TXN_NONE because the fact that * reconciliation left the page "empty" means there's no older * transaction in the system that might need to see an earlier * version of the page. It isn't necessary (WT_TXN_NONE is 0), * but it's the right thing to do. * * Publish: a barrier to ensure the structure fields are set * before the state change makes the page available to readers. */ parent_ref->page = NULL; parent_ref->addr = NULL; parent_ref->txnid = WT_TXN_NONE; WT_PUBLISH(parent_ref->state, WT_REF_DELETED); break; case WT_PM_REC_REPLACE: /* 1-for-1 page swap */ if (parent_ref->addr != NULL && __wt_off_page(page->parent, parent_ref->addr)) { __wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr); __wt_free(session, parent_ref->addr); } /* * Update the parent to reference the replacement page. * * Publish: a barrier to ensure the structure fields are set * before the state change makes the page available to readers. */ WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr)); *addr = mod->u.replace; mod->u.replace.addr = NULL; mod->u.replace.size = 0; parent_ref->page = NULL; parent_ref->addr = addr; WT_PUBLISH(parent_ref->state, WT_REF_DISK); break; case WT_PM_REC_SPLIT: /* Page split */ /* * Update the parent to reference new internal page(s). * * Publish: a barrier to ensure the structure fields are set * before the state change makes the page available to readers. */ parent_ref->page = mod->u.split; WT_PUBLISH(parent_ref->state, WT_REF_MEM); /* Clear the reference else discarding the page will free it. */ mod->u.split = NULL; F_CLR(mod, WT_PM_REC_SPLIT); break; WT_ILLEGAL_VALUE(session); } return (0); }
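/*
 * Editor's note: the sketch below is not WiredTiger code.  It is a minimal
 * C11 illustration of the reader side that the WT_PUBLISH calls above are
 * protecting: the writer fills in the reference's fields and then stores the
 * state with release semantics, so a reader that observes the state with an
 * acquire load also sees those fields.  The pub_* names are hypothetical.
 */
#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>

enum pub_state { PUB_EMPTY, PUB_DISK, PUB_MEM };

typedef struct {
	void *page;			/* In-memory page, valid when PUB_MEM. */
	void *addr;			/* On-disk address, valid when PUB_DISK. */
	_Atomic uint32_t state;
} pub_ref_t;

/* Writer: mirrors the WT_PM_REC_REPLACE arm above. */
static void
pub_ref_replace(pub_ref_t *ref, void *addr)
{
	ref->page = NULL;
	ref->addr = addr;
	/* Release store: the fields above become visible first. */
	atomic_store_explicit(&ref->state, PUB_DISK, memory_order_release);
}

/* Reader: returns the on-disk address only if the ref is in that state. */
static void *
pub_ref_read_addr(pub_ref_t *ref)
{
	if (atomic_load_explicit(&ref->state, memory_order_acquire) != PUB_DISK)
		return (NULL);
	return (ref->addr);		/* Ordered after the state load. */
}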