/* * __rec_review -- * Get exclusive access to the page and review the page and its subtree * for conditions that would block its eviction. * * The ref and page arguments may appear to be redundant, because usually * ref->page == page and page->ref == ref. However, we need both because * (a) there are cases where ref == NULL (e.g., for root page or during * salvage), and (b) we can't safely look at page->ref until we have a * hazard pointer. */ static int __rec_review(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page, int exclusive, int merge, int top) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE_MODIFY *mod; WT_PAGE *t; uint32_t i; btree = session->btree; /* * Get exclusive access to the page if our caller doesn't have the tree * locked down. */ if (!exclusive) WT_RET(__hazard_exclusive(session, ref, top)); /* * Recurse through the page's subtree: this happens first because we * have to write pages in depth-first order, otherwise we'll dirty * pages after we've written them. */ if (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT) WT_REF_FOREACH(page, ref, i) switch (ref->state) { case WT_REF_DISK: /* On-disk */ case WT_REF_DELETED: /* On-disk, deleted */ break; case WT_REF_MEM: /* In-memory */ WT_RET(__rec_review(session, ref, ref->page, exclusive, merge, 0)); break; case WT_REF_EVICT_WALK: /* Walk point */ case WT_REF_EVICT_FORCE: /* Forced evict */ case WT_REF_LOCKED: /* Being evicted */ case WT_REF_READING: /* Being read */ return (EBUSY); } /* * If the file is being checkpointed, we cannot evict dirty pages, * because that may free a page that appears on an internal page in the * checkpoint. Don't rely on new updates being skipped by the * transaction used for transaction reads: (1) there are paths that * dirty pages for artificial reasons; (2) internal pages aren't * transactional; and (3) if an update was skipped during the * checkpoint (leaving the page dirty), then rolled back, we could * still successfully overwrite a page and corrupt the checkpoint. * * Further, even for clean pages, the checkpoint's reconciliation of an * internal page might race with us as we evict a child in the page's * subtree. * * One half of that test is in the reconciliation code: the checkpoint * thread waits for eviction-locked pages to settle before determining * their status. The other half of the test is here: after acquiring * the exclusive eviction lock on a page, confirm no page in the page's * stack of pages from the root is being reconciled in a checkpoint. * This ensures we either see the checkpoint-walk state here, or the * reconciliation of the internal page sees our exclusive lock on the * child page and waits until we're finished evicting the child page * (or give up if eviction isn't possible). * * We must check the full stack (we might be attempting to evict a leaf * page multiple levels beneath the internal page being reconciled as * part of the checkpoint, and all of the intermediate nodes are being * merged into the internal page). * * There's no simple test for knowing if a page in our page stack is * involved in a checkpoint. The internal page's checkpoint-walk flag * is the best test, but it's not set anywhere for the root page, it's * not a complete test. * * Quit for any page that's not a simple, in-memory page. (Almost the * same as checking for the checkpoint-walk flag. 
 * I don't think there are code paths that change the page's status from
 * checkpoint-walk, but these races are hard enough I'm not going to proceed
 * if there's anything other than a vanilla, in-memory tree stack.)  Climb
 * until we find a page which can't be merged into its parent, and fail if
 * we never find such a page.
 */
    if (btree->checkpointing && !merge && __wt_page_is_modified(page)) {
ckpt:       WT_CSTAT_INCR(session, cache_eviction_checkpoint);
        WT_DSTAT_INCR(session, cache_eviction_checkpoint);
        return (EBUSY);
    }
    if (btree->checkpointing && top)
        for (t = page->parent;; t = t->parent) {
            if (t == NULL || t->ref == NULL)    /* root */
                goto ckpt;
            if (t->ref->state != WT_REF_MEM)    /* scary */
                goto ckpt;
            if (t->modify == NULL ||        /* not merged */
                !F_ISSET(t->modify, WT_PM_REC_EMPTY |
                WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))
                break;
        }

    /*
     * If we are merging internal pages, we just need exclusive access, we
     * don't need to write everything.
     */
    if (merge)
        return (0);

    /*
     * Fail if any page in the top-level page's subtree won't be merged into
     * its parent: the page that cannot be merged must be evicted first.
     * The test is necessary but should not fire much: the eviction code is
     * biased for leaf pages, an internal page shouldn't be selected for
     * eviction until its children have been evicted.
     *
     * We have to write dirty pages to know their final state, a page marked
     * empty may have had records added since reconciliation, a page marked
     * split may have had records deleted and no longer need to split.
     * Split-merge pages are the exception: they can never be changed into
     * anything other than a split-merge page and are merged regardless of
     * being clean or dirty.
     *
     * Writing the page is expensive, do a cheap test first: if it doesn't
     * appear a subtree page can be merged, quit.  It's possible the page
     * has been emptied since it was last reconciled, and writing it before
     * testing might be worthwhile, but it's more probable we're attempting
     * to evict an internal page with live children, and that's a waste of
     * time.
     */
    mod = page->modify;
    if (!top && (mod == NULL || !F_ISSET(mod,
        WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
        return (EBUSY);

    /*
     * If the page is dirty and can possibly change state, write it so we
     * know the final state.
     */
    if (__wt_page_is_modified(page) &&
        !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) {
        ret = __wt_rec_write(session, page, NULL,
            WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_QUIT);

        /*
         * Update the page's modification reference, reconciliation
         * might have changed it.
         */
        mod = page->modify;

        /* If there are unwritten changes on the page, give up. */
        if (ret == EBUSY) {
            WT_VERBOSE_RET(session, evict,
                "eviction failed, reconciled page not clean");

            /*
             * We may be able to discard any "update" memory the
             * page no longer needs.
             */
            switch (page->type) {
            case WT_PAGE_COL_FIX:
            case WT_PAGE_COL_VAR:
                __wt_col_leaf_obsolete(session, page);
                break;
            case WT_PAGE_ROW_LEAF:
                __wt_row_leaf_obsolete(session, page);
                break;
            }
        }
        WT_RET(ret);

        WT_ASSERT(session, __wt_page_is_modified(page) == 0);
    }

    /*
     * Repeat the test: fail if any page in the top-level page's subtree
     * won't be merged into its parent.
     */
    if (!top && (mod == NULL || !F_ISSET(mod,
        WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
        return (EBUSY);

    return (0);
}
/* * __wt_compact -- * Compact a file. */ int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_BTREE *btree; WT_DECL_RET; WT_REF *ref; bool block_manager_begin, skip; WT_UNUSED(cfg); btree = S2BT(session); bm = btree->bm; ref = NULL; block_manager_begin = false; WT_STAT_FAST_DATA_INCR(session, session_compact); /* * Check if compaction might be useful -- the API layer will quit trying * to compact the data source if we make no progress, set a flag if the * block layer thinks compaction is possible. */ WT_RET(bm->compact_skip(bm, session, &skip)); if (skip) return (0); /* * Reviewing in-memory pages requires looking at page reconciliation * results, because we care about where the page is stored now, not * where the page was stored when we first read it into the cache. * We need to ensure we don't race with page reconciliation as it's * writing the page modify information. * * There are three ways we call reconciliation: checkpoints, threads * writing leaf pages (usually in preparation for a checkpoint or if * closing a file), and eviction. * * We're holding the schema lock which serializes with checkpoints. */ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); /* * Get the tree handle's flush lock which blocks threads writing leaf * pages. */ __wt_spin_lock(session, &btree->flush_lock); /* Start compaction. */ WT_ERR(bm->compact_start(bm, session)); block_manager_begin = true; /* Walk the tree reviewing pages to see if they should be re-written. */ for (;;) { /* * Pages read for compaction aren't "useful"; don't update the * read generation of pages already in memory, and if a page is * read, set its generation to a low value so it is evicted * quickly. */ WT_ERR(__wt_tree_walk(session, &ref, WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED)); if (ref == NULL) break; WT_ERR(__compact_rewrite(session, ref, &skip)); if (skip) continue; session->compact_state = WT_COMPACT_SUCCESS; /* Rewrite the page: mark the page and tree dirty. */ WT_ERR(__wt_page_modify_init(session, ref->page)); __wt_page_modify_set(session, ref->page); WT_STAT_FAST_DATA_INCR(session, btree_compact_rewrite); } err: if (ref != NULL) WT_TRET(__wt_page_release(session, ref, 0)); if (block_manager_begin) WT_TRET(bm->compact_end(bm, session)); /* Unblock threads writing leaf pages. */ __wt_spin_unlock(session, &btree->flush_lock); return (ret); }
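/*
 * A minimal caller-side sketch (not WiredTiger code) of how this internal
 * routine is reached: the public WT_SESSION::compact method drives the
 * schema layer down into __wt_compact.  The URI and configuration string
 * below are illustrative only.
 */
#include <wiredtiger.h>

static int
compact_example(WT_CONNECTION *conn)
{
    WT_SESSION *session;
    int ret;

    if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
        return (ret);
    ret = session->compact(session, "file:example.wt", "timeout=120");
    (void)session->close(session, NULL);
    return (ret);
}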
/* * __wt_tree_walk -- * Move to the next/previous page in the tree. */ int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; int prev, skip; uint32_t slot; btree = S2BT(session); /* * Tree walks are special: they look inside page structures that splits * may want to free. Publish that the tree is active during this * window. */ WT_ENTER_PAGE_INDEX(session); /* * !!! * Fast-truncate currently only works on row-store trees. */ if (btree->type != BTREE_ROW) LF_CLR(WT_READ_TRUNCATE); prev = LF_ISSET(WT_READ_PREV) ? 1 : 0; /* * There are multiple reasons and approaches to walking the in-memory * tree: * * (1) finding pages to evict (the eviction server); * (2) writing just dirty leaves or internal nodes (checkpoint); * (3) discarding pages (close); * (4) truncating pages in a range (fast truncate); * (5) skipping pages based on outside information (compaction); * (6) cursor scans (applications). * * Except for cursor scans and compaction, the walk is limited to the * cache, no pages are read. In all cases, hazard pointers protect the * walked pages from eviction. * * Walks use hazard-pointer coupling through the tree and that's OK * (hazard pointers can't deadlock, so there's none of the usual * problems found when logically locking up a btree). If the eviction * thread tries to evict the active page, it fails because of our * hazard pointer. If eviction tries to evict our parent, that fails * because the parent has a child page that can't be discarded. We do * play one game: don't couple up to our parent and then back down to a * new leaf, couple to the next page to which we're descending, it * saves a hazard-pointer swap for each cursor page movement. * * !!! * NOTE: we depend on the fact it's OK to release a page we don't hold, * that is, it's OK to release couple when couple is set to NULL. * * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * * We may be passed a pointer to btree->evict_page that we are clearing * here. We check when discarding pages that we're not discarding that * page, so this clear must be done before the page is released. */ couple = couple_orig = ref = *refp; *refp = NULL; /* If no page is active, begin a walk from the start of the tree. */ if (ref == NULL) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } ascend: /* * If the active page was the root, we've reached the walk's end. * Release any hazard-pointer we're holding. */ if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release(session, couple, flags)); goto done; } /* Figure out the current slot in the WT_REF array. */ __wt_page_refp(session, ref, &pindex, &slot); for (;;) { /* * If we're at the last/first slot on the page, return this page * in post-order traversal. Otherwise we move to the next/prev * slot and left/right-most element in its subtree. */ if ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { ref = ref->home->pg_intl_parent_ref; /* Optionally skip internal pages. */ if (LF_ISSET(WT_READ_SKIP_INTL)) goto ascend; /* * We've ascended the tree and are returning an internal * page. If it's the root, discard our hazard pointer, * otherwise, swap our hazard pointer for the page we'll * return. 
*/ if (__wt_ref_is_root(ref)) WT_ERR(__wt_page_release( session, couple, flags)); else { /* * Locate the reference to our parent page then * swap our child hazard pointer for the parent. * We don't handle restart or not-found returns. * It would require additional complexity and is * not a possible return: we're moving to the * parent of the current child page, our parent * reference can't have split or been evicted. */ __wt_page_refp(session, ref, &pindex, &slot); if ((ret = __wt_page_swap( session, couple, ref, flags)) != 0) { WT_TRET(__wt_page_release( session, couple, flags)); WT_ERR(ret); } } *refp = ref; goto done; } if (prev) --slot; else ++slot; if (walkcntp != NULL) ++*walkcntp; for (;;) { ref = pindex->index[slot]; if (LF_ISSET(WT_READ_CACHE)) { /* * Only look at unlocked pages in memory: * fast-path some common cases. */ if (LF_ISSET(WT_READ_NO_WAIT) && ref->state != WT_REF_MEM) break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { /* * Avoid pulling a deleted page back in to try * to delete it again. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref)) break; /* * If deleting a range, try to delete the page * without instantiating it. */ WT_ERR(__wt_delete_page(session, ref, &skip)); if (skip) break; } else if (LF_ISSET(WT_READ_COMPACT)) { /* * Skip deleted pages, rewriting them doesn't * seem useful. */ if (ref->state == WT_REF_DELETED) break; /* * If the page is in-memory, we want to look at * it (it may have been modified and written, * and the current location is the interesting * one in terms of compaction, not the original * location). If the page isn't in-memory, test * if the page will help with compaction, don't * read it if we don't have to. */ if (ref->state == WT_REF_DISK) { WT_ERR(__wt_compact_page_skip( session, ref, &skip)); if (skip) break; } } else { /* * Try to skip deleted pages visible to us. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref)) break; } ret = __wt_page_swap(session, couple, ref, flags); /* * Not-found is an expected return when only walking * in-cache pages. */ if (ret == WT_NOTFOUND) { ret = 0; break; } /* * The page we're moving to might have split, in which * case move to the last position we held. */ if (ret == WT_RESTART) { ret = 0; /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. */ if (couple == &btree->root) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } /* * If restarting from some original position, * repeat the increment or decrement we made at * that time. Otherwise, couple is an internal * page we've acquired after moving from that * starting position and we can treat it as a * new page. This works because we never acquire * a hazard pointer on a leaf page we're not * going to return to our caller, this will quit * work if that ever changes. */ WT_ASSERT(session, couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; __wt_page_refp(session, ref, &pindex, &slot); if (couple == couple_orig) break; } WT_ERR(ret); /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. */ descend: couple = ref; page = ref->page; if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) { pindex = WT_INTL_INDEX_COPY(page); slot = prev ? pindex->entries - 1 : 0; } else { *refp = ref; goto done; } } } done: err: WT_LEAVE_PAGE_INDEX(session); return (ret); }
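/*
 * A sketch of the usual caller pattern for __wt_tree_walk, modeled on the
 * loop in __wt_compact above: pass the previously returned WT_REF back in on
 * each call and stop when it comes back NULL.  walk_cache_example is a
 * hypothetical helper and the flag combination is illustrative.
 */
static int
walk_cache_example(WT_SESSION_IMPL *session)
{
    WT_REF *ref;

    ref = NULL;
    for (;;) {
        WT_RET(__wt_tree_walk(
            session, &ref, NULL, WT_READ_CACHE | WT_READ_NO_WAIT));
        if (ref == NULL)
            break;
        /* Examine ref->page here while the hazard pointer is held. */
    }
    return (0);
}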
/* * __curjoin_entry_in_range -- * Check if a key is in the range specified by the entry, returning * WT_NOTFOUND if not. */ static int __curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, WT_ITEM *curkey, WT_CURSOR_JOIN_ITER *iter) { WT_COLLATOR *collator; WT_CURSOR_JOIN_ENDPOINT *end, *endmax; u_int pos; int cmp; bool disjunction, passed; collator = (entry->index != NULL) ? entry->index->collator : NULL; endmax = &entry->ends[entry->ends_next]; disjunction = F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION); /* * The iterator may have already satisfied some endpoint conditions. * If so and we're a disjunction, we're done. If so and we're a * conjunction, we can start past the satisfied conditions. */ if (iter == NULL) pos = 0; else { if (disjunction && iter->end_skip) return (0); pos = iter->end_pos + iter->end_skip; } for (end = &entry->ends[pos]; end < endmax; end++) { WT_RET(__wt_compare(session, collator, curkey, &end->key, &cmp)); switch (WT_CURJOIN_END_RANGE(end)) { case WT_CURJOIN_END_EQ: passed = (cmp == 0); break; case WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ: passed = (cmp >= 0); WT_ASSERT(session, iter == NULL); break; case WT_CURJOIN_END_GT: passed = (cmp > 0); if (passed && iter != NULL && pos == 0) iter->end_skip = 1; break; case WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ: passed = (cmp <= 0); break; case WT_CURJOIN_END_LT: passed = (cmp < 0); break; WT_ILLEGAL_VALUE(session, WT_CURJOIN_END_RANGE(end)); } if (!passed) { if (iter != NULL && (iter->is_equal || F_ISSET(end, WT_CURJOIN_END_LT))) { WT_RET(__curjoin_iter_bump(iter)); return (WT_NOTFOUND); } if (!disjunction) return (WT_NOTFOUND); iter = NULL; } else if (disjunction) break; } if (disjunction && end == endmax) return (WT_NOTFOUND); return (0); }
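/*
 * Hedged sketch (not WiredTiger code) of how range endpoints like the ones
 * evaluated above are established from the public API: WT_SESSION::join
 * attaches a positioned index cursor to a join cursor.  The table, index and
 * key value are hypothetical.
 */
static int
join_example(WT_SESSION *session)
{
    WT_CURSOR *join_cursor, *age_cursor;
    int ret;

    if ((ret = session->open_cursor(session,
        "join:table:people", NULL, NULL, &join_cursor)) != 0)
        return (ret);
    if ((ret = session->open_cursor(session,
        "index:people:age", NULL, NULL, &age_cursor)) != 0)
        return (ret);

    /* Position the endpoint cursor, then join on "age >= 21". */
    age_cursor->set_key(age_cursor, 21);
    if ((ret = age_cursor->search(age_cursor)) != 0)
        return (ret);
    if ((ret = session->join(session,
        join_cursor, age_cursor, "compare=ge")) != 0)
        return (ret);

    while ((ret = join_cursor->next(join_cursor)) == 0)
        ;   /* Retrieve joined keys and values here. */
    return (ret == WT_NOTFOUND ? 0 : ret);
}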
/* * __curjoin_init_bloom -- * Populate Bloom filters */ static int __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_CURSOR_JOIN_ENTRY *entry, WT_BLOOM *bloom) { WT_COLLATOR *collator; WT_CURSOR *c; WT_CURSOR_JOIN_ENDPOINT *end, *endmax; WT_DECL_ITEM(uribuf); WT_DECL_RET; WT_ITEM curkey, curvalue; size_t size; u_int skip; int cmp; const char *uri; const char *raw_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), "raw", NULL }; c = NULL; skip = 0; if (entry->index != NULL) /* * Open the raw index. We're avoiding any references * to the main table, they may be expensive. */ uri = entry->index->source; else { /* * For joins on the main table, we just need the primary * key for comparison, we don't need any values. */ size = strlen(cjoin->table->iface.name) + 3; WT_ERR(__wt_scr_alloc(session, size, &uribuf)); WT_ERR(__wt_buf_fmt(session, uribuf, "%s()", cjoin->table->iface.name)); uri = uribuf->data; } WT_ERR(__wt_open_cursor(session, uri, &cjoin->iface, raw_cfg, &c)); /* Initially position the cursor if necessary. */ endmax = &entry->ends[entry->ends_next]; if ((end = &entry->ends[0]) < endmax) { if (F_ISSET(end, WT_CURJOIN_END_GT) || WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ) { WT_ERR(__wt_cursor_dup_position(end->cursor, c)); if (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_GE) skip = 1; } else if (F_ISSET(end, WT_CURJOIN_END_LT)) { if ((ret = c->next(c)) == WT_NOTFOUND) goto done; WT_ERR(ret); } else WT_PANIC_ERR(session, EINVAL, "fatal error in join cursor position state"); } collator = (entry->index == NULL) ? NULL : entry->index->collator; while (ret == 0) { WT_ERR(c->get_key(c, &curkey)); entry->stats.iterated++; if (entry->index != NULL) { /* * Repack so it's comparable to the * reference endpoints. */ WT_ERR(__wt_struct_repack(session, c->key_format, (entry->repack_format != NULL ? entry->repack_format : entry->index->idxkey_format), &c->key, &curkey)); } for (end = &entry->ends[skip]; end < endmax; end++) { WT_ERR(__wt_compare(session, collator, &curkey, &end->key, &cmp)); if (F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) { /* if condition satisfied, insert immediately */ switch (WT_CURJOIN_END_RANGE(end)) { case WT_CURJOIN_END_EQ: if (cmp == 0) goto insert; break; case WT_CURJOIN_END_GT: if (cmp > 0) { /* skip this check next time */ skip = entry->ends_next; goto insert; } break; case WT_CURJOIN_END_GE: if (cmp >= 0) goto insert; break; case WT_CURJOIN_END_LT: if (cmp < 0) goto insert; break; case WT_CURJOIN_END_LE: if (cmp <= 0) goto insert; break; } } else if (!F_ISSET(end, WT_CURJOIN_END_LT)) { if (cmp < 0 || (cmp == 0 && !F_ISSET(end, WT_CURJOIN_END_EQ))) goto advance; if (cmp > 0) { if (F_ISSET(end, WT_CURJOIN_END_GT)) skip = 1; else goto done; } } else { if (cmp > 0 || (cmp == 0 && !F_ISSET(end, WT_CURJOIN_END_EQ))) goto done; } } /* * Either it's a disjunction that hasn't satisfied any * condition, or it's a conjunction that has satisfied all * conditions. */ if (F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) goto advance; insert: if (entry->index != NULL) { curvalue.data = (unsigned char *)curkey.data + curkey.size; WT_ASSERT(session, c->key.size > curkey.size); curvalue.size = c->key.size - curkey.size; } else WT_ERR(c->get_key(c, &curvalue)); __wt_bloom_insert(bloom, &curvalue); entry->stats.bloom_insert++; advance: if ((ret = c->next(c)) == WT_NOTFOUND) break; } done: WT_ERR_NOTFOUND_OK(ret); err: if (c != NULL) WT_TRET(c->close(c)); __wt_scr_free(session, &uribuf); return (ret); }
/* * __wt_schema_project_merge -- * Given list of cursors and a projection, build a buffer containing the * column values read from the cursors. */ int __wt_schema_project_merge(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value) { WT_CURSOR *c; WT_DECL_PACK(pack); WT_DECL_PACK_VALUE(pv); WT_DECL_PACK_VALUE(vpv); WT_ITEM *buf; WT_PACK vpack; u_long arg; char *proj; const uint8_t *p, *end; uint8_t *vp; size_t len; p = end = NULL; /* -Wuninitialized */ WT_RET(__wt_buf_init(session, value, 0)); WT_RET(__pack_init(session, &vpack, vformat)); for (proj = (char *)proj_arg; *proj != '\0'; proj++) { arg = strtoul(proj, &proj, 10); switch (*proj) { case WT_PROJ_KEY: c = cp[arg]; if (WT_CURSOR_RECNO(c)) { c->key.data = &c->recno; c->key.size = sizeof(c->recno); WT_RET(__pack_init(session, &pack, "R")); } else WT_RET(__pack_init( session, &pack, c->key_format)); buf = &c->key; p = buf->data; end = p + buf->size; continue; case WT_PROJ_VALUE: c = cp[arg]; WT_RET(__pack_init(session, &pack, c->value_format)); buf = &c->value; p = buf->data; end = p + buf->size; continue; } /* * Otherwise, the argument is a count, where a missing * count means a count of 1. */ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) { switch (*proj) { case WT_PROJ_NEXT: case WT_PROJ_SKIP: case WT_PROJ_REUSE: WT_RET(__pack_next(&pack, &pv)); WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p))); /* Only copy the value out once. */ if (*proj != WT_PROJ_NEXT) break; WT_RET(__pack_next(&vpack, &vpv)); /* Make sure the types are compatible. */ WT_ASSERT(session, __wt_tolower((u_char)pv.type) == __wt_tolower((u_char)vpv.type)); vpv.u = pv.u; WT_RET(__pack_size(session, &vpv, &len)); WT_RET(__wt_buf_grow(session, value, value->size + len)); vp = (uint8_t *)value->mem + value->size; WT_RET(__pack_write(session, &vpv, &vp, len)); value->size += len; break; } } } return (0); }
/* * __wt_btcur_prev -- * Move to the previous record in the tree. */ int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating) { WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; int newpage; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_FAST_CONN_INCR(session, cursor_prev); WT_STAT_FAST_DATA_INCR(session, cursor_prev); flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* Tree walk flags. */ if (truncating) LF_SET(WT_READ_TRUNCATE); WT_RET(__cursor_func_init(cbt, 0)); /* * If we aren't already iterating in the right direction, there's * some setup to do. */ if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV)) __wt_btcur_iterate_setup(cbt, 0); /* * Walk any page we're holding until the underlying call returns not- * found. Then, move to the previous page, until we reach the start * of the file. */ page = cbt->ref == NULL ? NULL : cbt->ref->page; for (newpage = 0;; newpage = 1) { if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_append_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret == 0) break; F_CLR(cbt, WT_CBT_ITERATE_APPEND); if (ret != WT_NOTFOUND) break; newpage = 1; } if (page != NULL) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_prev(cbt, newpage); break; case WT_PAGE_ROW_LEAF: ret = __cursor_row_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret != WT_NOTFOUND) break; } WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); page = cbt->ref->page; WT_ASSERT(session, page->type != WT_PAGE_COL_INT && page->type != WT_PAGE_ROW_INT); /* * The last page in a column-store has appended entries. * We handle it separately from the usual cursor code: * it's only that one page and it's in a simple format. */ if (page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) F_SET(cbt, WT_CBT_ITERATE_APPEND); } err: if (ret != 0) WT_TRET(__cursor_error_resolve(cbt)); return (ret); }
/* * __wt_lsm_meta_write -- * Write the metadata for an LSM tree. */ int __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_DECL_ITEM(buf); WT_DECL_RET; WT_LSM_CHUNK *chunk; u_int i; int first; WT_RET(__wt_scr_alloc(session, 0, &buf)); WT_ERR(__wt_buf_fmt(session, buf, "key_format=%s,value_format=%s,bloom_config=(%s),file_config=(%s)", lsm_tree->key_format, lsm_tree->value_format, lsm_tree->bloom_config, lsm_tree->file_config)); if (lsm_tree->collator_name != NULL) WT_ERR(__wt_buf_catfmt( session, buf, ",collator=%s", lsm_tree->collator_name)); WT_ERR(__wt_buf_catfmt(session, buf, ",last=%" PRIu32 ",chunk_count_limit=%" PRIu32 ",chunk_max=%" PRIu64 ",chunk_size=%" PRIu64 ",auto_throttle=%" PRIu32 ",merge_max=%" PRIu32 ",merge_min=%" PRIu32 ",bloom=%" PRIu32 ",bloom_bit_count=%" PRIu32 ",bloom_hash_count=%" PRIu32, lsm_tree->last, lsm_tree->chunk_count_limit, lsm_tree->chunk_max, lsm_tree->chunk_size, F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) ? 1 : 0, lsm_tree->merge_max, lsm_tree->merge_min, lsm_tree->bloom, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count)); WT_ERR(__wt_buf_catfmt(session, buf, ",chunks=[")); for (i = 0; i < lsm_tree->nchunks; i++) { chunk = lsm_tree->chunk[i]; if (i > 0) WT_ERR(__wt_buf_catfmt(session, buf, ",")); WT_ERR(__wt_buf_catfmt(session, buf, "id=%" PRIu32, chunk->id)); if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) WT_ERR(__wt_buf_catfmt(session, buf, ",bloom")); if (chunk->size != 0) WT_ERR(__wt_buf_catfmt(session, buf, ",chunk_size=%" PRIu64, chunk->size)); if (chunk->count != 0) WT_ERR(__wt_buf_catfmt( session, buf, ",count=%" PRIu64, chunk->count)); WT_ERR(__wt_buf_catfmt( session, buf, ",generation=%" PRIu32, chunk->generation)); } WT_ERR(__wt_buf_catfmt(session, buf, "]")); WT_ERR(__wt_buf_catfmt(session, buf, ",old_chunks=[")); first = 1; for (i = 0; i < lsm_tree->nold_chunks; i++) { chunk = lsm_tree->old_chunks[i]; WT_ASSERT(session, chunk != NULL); if (first) first = 0; else WT_ERR(__wt_buf_catfmt(session, buf, ",")); WT_ERR(__wt_buf_catfmt(session, buf, "\"%s\"", chunk->uri)); if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) WT_ERR(__wt_buf_catfmt( session, buf, ",bloom=\"%s\"", chunk->bloom_uri)); } WT_ERR(__wt_buf_catfmt(session, buf, "]")); ret = __wt_metadata_update(session, lsm_tree->name, buf->data); WT_ERR(ret); err: __wt_scr_free(session, &buf); return (ret); }
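/*
 * Illustrative only: a metadata value assembled by __wt_lsm_meta_write might
 * look like the following (all field values here are made up, following the
 * format strings in the code above).
 *
 * key_format=S,value_format=S,bloom_config=(),file_config=(),
 *     last=3,chunk_count_limit=0,chunk_max=5368709120,chunk_size=10485760,
 *     auto_throttle=1,merge_max=15,merge_min=4,
 *     bloom=1,bloom_bit_count=16,bloom_hash_count=8,
 *     chunks=[id=2,bloom,chunk_size=9437184,count=120000,generation=1,
 *     id=3,generation=0],
 *     old_chunks=["file:test-000001.lsm",bloom="file:test-000001.bf"]
 */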
/* * __wt_btcur_prev -- * Move to the previous record in the tree. */ int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) { WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; bool newpage; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_FAST_CONN_INCR(session, cursor_prev); WT_STAT_FAST_DATA_INCR(session, cursor_prev); flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* Tree walk flags. */ if (truncating) LF_SET(WT_READ_TRUNCATE); WT_RET(__cursor_func_init(cbt, false)); /* * If we aren't already iterating in the right direction, there's * some setup to do. */ if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV)) __wt_btcur_iterate_setup(cbt); /* * Walk any page we're holding until the underlying call returns not- * found. Then, move to the previous page, until we reach the start * of the file. */ for (newpage = false;; newpage = true) { page = cbt->ref == NULL ? NULL : cbt->ref->page; WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page)); /* * The last page in a column-store has appended entries. * We handle it separately from the usual cursor code: * it's only that one page and it's in a simple format. */ if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) F_SET(cbt, WT_CBT_ITERATE_APPEND); if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_append_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret == 0) break; F_CLR(cbt, WT_CBT_ITERATE_APPEND); if (ret != WT_NOTFOUND) break; newpage = true; } if (page != NULL) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_prev(cbt, newpage); break; case WT_PAGE_ROW_LEAF: ret = __cursor_row_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret != WT_NOTFOUND) break; } /* * If we saw a lot of deleted records on this page, or we went * all the way through a page and only saw deleted records, try * to evict the page when we release it. Otherwise repeatedly * deleting from the beginning of a tree can have quadratic * performance. Take care not to force eviction of pages that * are genuinely empty, in new trees. */ if (page != NULL && (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD || (newpage && cbt->page_deleted_count > 0))) __wt_page_evict_soon(page); cbt->page_deleted_count = 0; WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } #ifdef HAVE_DIAGNOSTIC if (ret == 0) WT_ERR(__wt_cursor_key_order_check(session, cbt, false)); #endif err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
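/*
 * Sketch (not WiredTiger code) of the public-API path that ends up in
 * __wt_btcur_prev: a plain reverse scan with WT_CURSOR::prev.  The table
 * name and string key/value formats are hypothetical.
 */
static int
reverse_scan_example(WT_SESSION *session)
{
    WT_CURSOR *cursor;
    const char *key, *value;
    int ret;

    if ((ret = session->open_cursor(
        session, "table:example", NULL, NULL, &cursor)) != 0)
        return (ret);
    while ((ret = cursor->prev(cursor)) == 0) {
        if ((ret = cursor->get_key(cursor, &key)) != 0 ||
            (ret = cursor->get_value(cursor, &value)) != 0)
            break;
    }
    (void)cursor->close(cursor);
    return (ret == WT_NOTFOUND ? 0 : ret);
}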
/* * __wt_lsm_tree_throttle -- * Calculate whether LSM updates need to be throttled. Must be called * with the LSM tree lock held. */ void __wt_lsm_tree_throttle( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only) { WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk; uint64_t cache_sz, cache_used, oldtime, record_count, timediff; uint32_t in_memory, gen0_chunks; /* Never throttle in small trees. */ if (lsm_tree->nchunks < 3) { lsm_tree->ckpt_throttle = lsm_tree->merge_throttle = 0; return; } cache_sz = S2C(session)->cache_size; /* * In the steady state, we expect that the checkpoint worker thread * will keep up with inserts. If not, throttle the insert rate to * avoid filling the cache with in-memory chunks. Threads sleep every * 100 operations, so take that into account in the calculation. * * Also throttle based on whether merge threads are keeping up. If * there are enough chunks that have never been merged we slow down * inserts so that merges have some chance of keeping up. * * Count the number of in-memory chunks, the number of unmerged chunk * on disk, and find the most recent on-disk chunk (if any). */ record_count = 1; gen0_chunks = in_memory = 0; ondisk = NULL; for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1; cp >= lsm_tree->chunk; --cp) if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) { record_count += (*cp)->count; ++in_memory; } else { /* * Assign ondisk to the last chunk that has been * flushed since the tree was last opened (i.e it's on * disk and stable is not set). */ if (ondisk == NULL && ((*cp)->generation == 0 && !F_ISSET(*cp, WT_LSM_CHUNK_STABLE))) ondisk = *cp; if ((*cp)->generation == 0 && !F_ISSET(*cp, WT_LSM_CHUNK_MERGING)) ++gen0_chunks; } last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]; /* Checkpoint throttling, based on the number of in-memory chunks. */ if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3) lsm_tree->ckpt_throttle = 0; else if (decrease_only) ; /* Nothing to do */ else if (ondisk == NULL) { /* * No checkpoint has completed this run. Keep slowing down * inserts until one does. */ lsm_tree->ckpt_throttle = WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle); } else { WT_ASSERT(session, WT_TIMECMP(last_chunk->create_ts, ondisk->create_ts) >= 0); timediff = WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts); lsm_tree->ckpt_throttle = (long)((in_memory - 2) * timediff / (20 * record_count)); /* * Get more aggressive as the number of in memory chunks * consumes a large proportion of the cache. In memory chunks * are allowed to grow up to twice as large as the configured * value when checkpoints aren't keeping up. That worst case * is when this calculation is relevant. * There is nothing particularly special about the chosen * multipliers. */ cache_used = in_memory * lsm_tree->chunk_size * 2; if (cache_used > cache_sz * 0.8) lsm_tree->ckpt_throttle *= 5; } /* * Merge throttling, based on the number of on-disk, level 0 chunks. * * Don't throttle if the tree has less than a single level's number * of chunks. */ if (lsm_tree->nchunks < lsm_tree->merge_max) lsm_tree->merge_throttle = 0; else if (gen0_chunks < WT_LSM_MERGE_THROTTLE_THRESHOLD) WT_LSM_MERGE_THROTTLE_DECREASE(lsm_tree->merge_throttle); else if (!decrease_only) WT_LSM_MERGE_THROTTLE_INCREASE(lsm_tree->merge_throttle); /* Put an upper bound of 1s on both throttle calculations. 
*/ lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle); lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle); /* * Update our estimate of how long each in-memory chunk stays active. * Filter out some noise by keeping a weighted history of the * calculated value. Wait until we have enough chunks that we can * check that the new value is sane: otherwise, after a long idle * period, we can calculate a crazy value. */ if (in_memory > 1 && ondisk != NULL) { prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2]; WT_ASSERT(session, prev_chunk->generation == 0); WT_ASSERT(session, WT_TIMECMP( last_chunk->create_ts, prev_chunk->create_ts) >= 0); timediff = WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts); WT_ASSERT(session, WT_TIMECMP(prev_chunk->create_ts, ondisk->create_ts) >= 0); oldtime = WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts); if (timediff < 10 * oldtime) lsm_tree->chunk_fill_ms = (3 * lsm_tree->chunk_fill_ms + timediff / 1000000) / 4; } }
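/*
 * Hypothetical sketch of how an update path might consume the values
 * computed above: sleep for the computed number of microseconds before
 * applying a write.  apply_throttle_example is not WiredTiger code.
 */
static void
apply_throttle_example(WT_LSM_TREE *lsm_tree)
{
    uint64_t usecs;

    /* Both throttle values are capped at one second (1000000 us) above. */
    usecs = lsm_tree->ckpt_throttle + lsm_tree->merge_throttle;
    if (usecs != 0)
        __wt_sleep(0, usecs);
}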
/* * __wt_block_write_off -- * Write a buffer into a block, returning the block's offset, size and * checksum. */ int __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump, int data_cksum, int caller_locked) { WT_BLOCK_HEADER *blk; WT_DECL_RET; WT_FH *fh; size_t align_size; wt_off_t offset; int local_locked; blk = WT_BLOCK_HEADER_REF(buf->mem); fh = block->fh; local_locked = 0; /* Buffers should be aligned for writing. */ if (!F_ISSET(buf, WT_ITEM_ALIGNED)) { WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED)); WT_RET_MSG(session, EINVAL, "direct I/O check: write buffer incorrectly allocated"); } /* * Align the size to an allocation unit. * * The buffer must be big enough for us to zero to the next allocsize * boundary, this is one of the reasons the btree layer must find out * from the block-manager layer the maximum size of the eventual write. */ align_size = WT_ALIGN(buf->size, block->allocsize); if (align_size > buf->memsize) { WT_ASSERT(session, align_size <= buf->memsize); WT_RET_MSG(session, EINVAL, "buffer size check: write buffer incorrectly allocated"); } if (align_size > UINT32_MAX) { WT_ASSERT(session, align_size <= UINT32_MAX); WT_RET_MSG(session, EINVAL, "buffer size check: write buffer too large to write"); } /* Zero out any unused bytes at the end of the buffer. */ memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size); /* * Set the disk size so we don't have to incrementally read blocks * during salvage. */ blk->disk_size = WT_STORE_SIZE(align_size); /* * Update the block's checksum: if our caller specifies, checksum the * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP * bytes. The assumption is applications with good compression support * turn off checksums and assume corrupted blocks won't decompress * correctly. However, if compression failed to shrink the block, the * block wasn't compressed, in which case our caller will tell us to * checksum the data to detect corruption. If compression succeeded, * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes * because they're not compressed, both to give salvage a quick test * of whether a block is useful and to give us a test so we don't lose * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing. */ blk->flags = 0; if (data_cksum) F_SET(blk, WT_BLOCK_DATA_CKSUM); blk->cksum = 0; blk->cksum = __wt_cksum( buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP); if (!caller_locked) { WT_RET(__wt_block_ext_prealloc(session, 5)); __wt_spin_lock(session, &block->live_lock); local_locked = 1; } ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size); /* * Extend the file in chunks. We want to limit the number of threads * extending the file at the same time, so choose the one thread that's * crossing the extended boundary. We don't extend newly created files, * and it's theoretically possible we might wait so long our extension * of the file is passed by another thread writing single blocks, that's * why there's a check in case the extended file size becomes too small: * if the file size catches up, every thread tries to extend it. * * File extension may require locking: some variants of the system call * used to extend the file initialize the extended space. If a writing * thread races with the extending thread, the extending thread might * overwrite already written data, and that would be very, very bad. 
* * Some variants of the system call to extend the file fail at run-time * based on the filesystem type, fall back to ftruncate in that case, * and remember that ftruncate requires locking. */ if (ret == 0 && fh->extend_len != 0 && (fh->extend_size <= fh->size || (offset + fh->extend_len <= fh->extend_size && offset + fh->extend_len + (wt_off_t)align_size >= fh->extend_size))) { fh->extend_size = offset + fh->extend_len * 2; if (fh->fallocate_available != WT_FALLOCATE_NOT_AVAILABLE) { /* * Release any locally acquired lock if it's not needed * to extend the file, extending the file might require * updating file metadata, which can be slow. (It may be * a bad idea to configure for file extension on systems * that require locking over the extend call.) */ if (!fh->fallocate_requires_locking && local_locked) { __wt_spin_unlock(session, &block->live_lock); local_locked = 0; } /* Extend the file. */ if ((ret = __wt_fallocate(session, fh, offset, fh->extend_len * 2)) == ENOTSUP) { ret = 0; goto extend_truncate; } } else { extend_truncate: /* * We may have a caller lock or a locally acquired lock, * but we need a lock to call ftruncate. */ if (!caller_locked && local_locked == 0) { __wt_spin_lock(session, &block->live_lock); local_locked = 1; } /* * The truncate might fail if there's a file mapping * (if there's an open checkpoint on the file), that's * OK. */ if ((ret = __wt_ftruncate( session, fh, offset + fh->extend_len * 2)) == EBUSY) ret = 0; } } /* Release any locally acquired lock. */ if (local_locked) { __wt_spin_unlock(session, &block->live_lock); local_locked = 0; } WT_RET(ret); /* Write the block. */ if ((ret = __wt_write(session, fh, offset, align_size, buf->mem)) != 0) { if (!caller_locked) __wt_spin_lock(session, &block->live_lock); WT_TRET(__wt_block_off_free( session, block, offset, (wt_off_t)align_size)); if (!caller_locked) __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); } #ifdef HAVE_SYNC_FILE_RANGE /* * Optionally schedule writes for dirty pages in the system buffer * cache, but only if the current session can wait. */ if (block->os_cache_dirty_max != 0 && (block->os_cache_dirty += align_size) > block->os_cache_dirty_max && __wt_session_can_wait(session)) { block->os_cache_dirty = 0; WT_RET(__wt_fsync_async(session, fh)); } #endif #ifdef HAVE_POSIX_FADVISE /* Optionally discard blocks from the system buffer cache. */ if (block->os_cache_max != 0 && (block->os_cache += align_size) > block->os_cache_max) { block->os_cache = 0; if ((ret = posix_fadvise(fh->fd, (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0) WT_RET_MSG( session, ret, "%s: posix_fadvise", block->name); } #endif WT_STAT_FAST_CONN_INCR(session, block_write); WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size); WT_RET(__wt_verbose(session, WT_VERB_WRITE, "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32, (uintmax_t)offset, (uintmax_t)align_size, blk->cksum)); *offsetp = offset; *sizep = WT_STORE_SIZE(align_size); *cksump = blk->cksum; return (ret); }
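/*
 * Illustrative only: how the allocation-size alignment above rounds a write
 * up.  With a 4KB allocation size, a 6000-byte page image is padded to 8192
 * bytes and the trailing 2192 bytes are zeroed before the block is written.
 */
static size_t
align_example(void)
{
    return (WT_ALIGN(6000, 4096));  /* Returns 8192. */
}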
/*
 * __lsm_tree_open --
 *	Open an LSM tree structure.
 */
static int
__lsm_tree_open(
    WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LSM_TREE *lsm_tree;

    conn = S2C(session);
    lsm_tree = NULL;

    WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));

    /* Start the LSM manager thread if it isn't running. */
    if (WT_ATOMIC_CAS4(conn->lsm_manager.lsm_workers, 0, 1))
        WT_RET(__wt_lsm_manager_start(session));

    /* Make sure no one beat us to it. */
    TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
        if (strcmp(uri, lsm_tree->name) == 0) {
            *treep = lsm_tree;
            return (0);
        }

    /* Try to open the tree. */
    WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
    WT_ERR(__wt_rwlock_alloc(session, &lsm_tree->rwlock, "lsm tree"));

    WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));

    WT_ERR(__wt_lsm_meta_read(session, lsm_tree));

    /*
     * Sanity check the configuration.  Do it now since this is the first
     * time we have the LSM tree configuration.
     */
    WT_ERR(__lsm_tree_open_check(session, lsm_tree));

    if (lsm_tree->nchunks == 0) {
        F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
        WT_ERR(__wt_lsm_tree_switch(session, lsm_tree));
    }

    /* Set the generation number so cursors are opened on first usage. */
    lsm_tree->dsk_gen = 1;

    /*
     * Set up reference counting.  Use separate reference counts for tree
     * handles and queue entries, so that queue entries don't interfere
     * with getting handles exclusive.
     */
    lsm_tree->refcnt = 1;
    lsm_tree->queue_ref = 0;

    /* Set a flush timestamp as a baseline. */
    WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts));

    /* Now the tree is set up, make it visible to others. */
    TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q);
    F_SET(lsm_tree, WT_LSM_TREE_ACTIVE | WT_LSM_TREE_OPEN);

    *treep = lsm_tree;

    if (0) {
err:        WT_TRET(__lsm_tree_discard(session, lsm_tree));
    }
    return (ret);
}
/*
 * __wt_lsm_compact --
 *	Compact an LSM tree called via __wt_schema_worker.
 */
int
__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    time_t begin, end;
    uint64_t progress;
    int i, compacting, flushing, locked, ref;

    compacting = flushing = locked = ref = 0;
    chunk = NULL;
    /*
     * This function is applied to all matching sources: ignore anything
     * that is not an LSM tree.
     */
    if (!WT_PREFIX_MATCH(name, "lsm:"))
        return (0);

    /* Tell __wt_schema_worker not to look inside the LSM tree. */
    *skip = 1;

    WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));

    if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
        WT_ERR_MSG(session, EINVAL,
            "LSM compaction requires active merge threads");

    WT_ERR(__wt_seconds(session, &begin));

    /*
     * Compacting has two distinct phases.
     * 1.  All in-memory chunks up to and including the current chunk must
     * be flushed.  Normally, the flush code does not flush the last,
     * in-use chunk, so we set a force flag to include that last chunk.
     * We monitor the state of the last chunk and periodically push another
     * forced flush work unit until it is complete.
     * 2.  After all flushing is done, we move on to the merging phase for
     * compaction.  Again, we monitor the state and continue to push merge
     * work units until all merging is done.
     */

    /* Lock the tree: single-thread compaction. */
    WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
    locked = 1;

    /* Clear any merge throttle: compact throws out that calculation. */
    lsm_tree->merge_throttle = 0;
    lsm_tree->merge_aggressiveness = 0;
    progress = lsm_tree->merge_progressing;

    /* If another thread started a compact on this tree, we're done. */
    if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
        goto err;

    /*
     * Set the switch transaction on the current chunk, if it hasn't been
     * set before.  This prevents further writes, so it can be flushed by
     * the checkpoint worker.
     */
    if (lsm_tree->nchunks > 0 &&
        (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL) {
        if (chunk->switch_txn == WT_TXN_NONE)
            chunk->switch_txn = __wt_txn_new_id(session);
        /*
         * If we have a chunk, we want to look for it to be on-disk.
         * So we need to add a reference to keep it available.
         */
        (void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
        ref = 1;
    }

    locked = 0;
    WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));

    if (chunk != NULL) {
        WT_ERR(__wt_verbose(session, WT_VERB_LSM,
            "Compact force flush %s flags 0x%" PRIx32
            " chunk %u flags 0x%" PRIx32,
            name, lsm_tree->flags, chunk->id, chunk->flags));
        flushing = 1;
        /*
         * Make sure the in-memory chunk gets flushed, but do not push a
         * switch, because we don't want to create a new in-memory chunk
         * if the tree is being used read-only now.
         */
        WT_ERR(__wt_lsm_manager_push_entry(session,
            WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
    } else {
        /*
         * If there is no chunk to flush, go straight to the
         * compacting state.
         */
        compacting = 1;
        progress = lsm_tree->merge_progressing;
        F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
        WT_ERR(__wt_verbose(session, WT_VERB_LSM,
            "COMPACT: Start compacting %s", lsm_tree->name));
    }

    /* Wait for the work unit queues to drain. */
    while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
        /*
         * The flush flag is cleared when the chunk has been flushed.
         * Continue to push forced flushes until the chunk is on disk.
         * Once it is on disk, move to the compacting phase.
         */
        if (flushing) {
            WT_ASSERT(session, chunk != NULL);
            if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
                WT_ERR(__wt_verbose(session, WT_VERB_LSM,
                    "Compact flush done %s chunk %u. 
" "Start compacting progress %" PRIu64, name, chunk->id, lsm_tree->merge_progressing)); (void)WT_ATOMIC_SUB4(chunk->refcnt, 1); flushing = ref = 0; compacting = 1; F_SET(lsm_tree, WT_LSM_TREE_COMPACTING); progress = lsm_tree->merge_progressing; } else { WT_ERR(__wt_verbose(session, WT_VERB_LSM, "Compact flush retry %s chunk %u", name, chunk->id)); WT_ERR(__wt_lsm_manager_push_entry(session, WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree)); } } /* * The compacting flag is cleared when no merges can be done. * Ensure that we push through some aggressive merges before * stopping otherwise we might not do merges that would * span chunks with different generations. */ if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) { if (lsm_tree->merge_aggressiveness < 10 || (progress < lsm_tree->merge_progressing) || lsm_tree->merge_syncing) { progress = lsm_tree->merge_progressing; F_SET(lsm_tree, WT_LSM_TREE_COMPACTING); lsm_tree->merge_aggressiveness = 10; } else break; } __wt_sleep(1, 0); WT_ERR(__wt_seconds(session, &end)); if (session->compact->max_time > 0 && session->compact->max_time < (uint64_t)(end - begin)) { WT_ERR(ETIMEDOUT); } /* * Push merge operations while they are still getting work * done. If we are pushing merges, make sure they are * aggressive, to avoid duplicating effort. */ if (compacting) #define COMPACT_PARALLEL_MERGES 5 for (i = lsm_tree->queue_ref; i < COMPACT_PARALLEL_MERGES; i++) { lsm_tree->merge_aggressiveness = 10; WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_MERGE, 0, lsm_tree)); } } err: /* Ensure anything we set is cleared. */ if (ref) (void)WT_ATOMIC_SUB4(chunk->refcnt, 1); if (compacting) { F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING); lsm_tree->merge_aggressiveness = 0; } if (locked) WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); WT_TRET(__wt_verbose(session, WT_VERB_LSM, "Compact %s complete, return %d", name, ret)); __wt_lsm_tree_release(session, lsm_tree); return (ret); }
/* * __verify_dsk_row -- * Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it. */ static int __verify_dsk_row( WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk) { WT_BM *bm; WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_DECL_ITEM(current); WT_DECL_ITEM(last_ovfl); WT_DECL_ITEM(last_pfx); WT_DECL_RET; WT_ITEM *last; enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type; void *huffman; uint32_t cell_num, cell_type, i, key_cnt, prefix; uint8_t *end; int cmp; btree = S2BT(session); bm = btree->bm; unpack = &_unpack; huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key; WT_ERR(__wt_scr_alloc(session, 0, ¤t)); WT_ERR(__wt_scr_alloc(session, 0, &last_pfx)); WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl)); last = last_ovfl; end = (uint8_t *)dsk + dsk->mem_size; last_cell_type = FIRST; cell_num = 0; key_cnt = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { ++cell_num; /* Carefully unpack the cell. */ if (__wt_cell_unpack_safe(cell, unpack, end) != 0) { ret = __err_cell_corrupted(session, cell_num, addr); goto err; } /* Check the raw and collapsed cell types. */ WT_ERR(__err_cell_type( session, cell_num, addr, unpack->raw, dsk->type)); WT_ERR(__err_cell_type( session, cell_num, addr, unpack->type, dsk->type)); cell_type = unpack->type; /* * Check ordering relationships between the WT_CELL entries. * For row-store internal pages, check for: * two values in a row, * two keys in a row, * a value as the first cell on a page. * For row-store leaf pages, check for: * two values in a row, * a value as the first cell on a page. */ switch (cell_type) { case WT_CELL_KEY: case WT_CELL_KEY_OVFL: ++key_cnt; switch (last_cell_type) { case FIRST: case WAS_VALUE: break; case WAS_KEY: if (dsk->type == WT_PAGE_ROW_LEAF) break; WT_ERR_VRFY(session, "cell %" PRIu32 " on page at %s is the " "first of two adjacent keys", cell_num - 1, addr); } last_cell_type = WAS_KEY; break; case WT_CELL_ADDR_DEL: case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: case WT_CELL_VALUE: case WT_CELL_VALUE_OVFL: switch (last_cell_type) { case FIRST: WT_ERR_VRFY(session, "page at %s begins with a value", addr); case WAS_KEY: break; case WAS_VALUE: WT_ERR_VRFY(session, "cell %" PRIu32 " on page at %s is the " "first of two adjacent values", cell_num - 1, addr); } last_cell_type = WAS_VALUE; break; } /* Check if any referenced item has a valid address. */ switch (cell_type) { case WT_CELL_ADDR_DEL: case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: case WT_CELL_KEY_OVFL: case WT_CELL_VALUE_OVFL: if (!bm->addr_valid(bm, session, unpack->data, unpack->size)) goto eof; break; } /* * Remaining checks are for key order and prefix compression. * If this cell isn't a key, we're done, move to the next cell. * If this cell is an overflow item, instantiate the key and * compare it with the last key. Otherwise, we have to deal with * prefix compression. */ switch (cell_type) { case WT_CELL_KEY: break; case WT_CELL_KEY_OVFL: WT_ERR(__wt_dsk_cell_data_ref( session, dsk->type, unpack, current)); goto key_compare; default: /* Not a key -- continue with the next cell. */ continue; } /* * Prefix compression checks. * * Confirm the first non-overflow key on a page has a zero * prefix compression count. 
*/ prefix = unpack->prefix; if (last_pfx->size == 0 && prefix != 0) WT_ERR_VRFY(session, "the %" PRIu32 " key on page at %s is the first " "non-overflow key on the page and has a non-zero " "prefix compression value", cell_num, addr); /* Confirm the prefix compression count is possible. */ if (cell_num > 1 && prefix > last->size) WT_ERR_VRFY(session, "key %" PRIu32 " on page at %s has a prefix " "compression count of %" PRIu32 ", larger than " "the length of the previous key, %" WT_SIZET_FMT, cell_num, addr, prefix, last->size); /* * If Huffman decoding required, unpack the cell to build the * key, then resolve the prefix. Else, we can do it faster * internally because we don't have to shuffle memory around as * much. */ if (huffman != NULL) { WT_ERR(__wt_dsk_cell_data_ref( session, dsk->type, unpack, current)); /* * If there's a prefix, make sure there's enough buffer * space, then shift the decoded data past the prefix * and copy the prefix into place. Take care with the * pointers: current->data may be pointing inside the * buffer. */ if (prefix != 0) { WT_ERR(__wt_buf_grow( session, current, prefix + current->size)); memmove((uint8_t *)current->mem + prefix, current->data, current->size); memcpy(current->mem, last->data, prefix); current->data = current->mem; current->size += prefix; } } else { /* * Get the cell's data/length and make sure we have * enough buffer space. */ WT_ERR(__wt_buf_init( session, current, prefix + unpack->size)); /* Copy the prefix then the data into place. */ if (prefix != 0) memcpy(current->mem, last->data, prefix); memcpy((uint8_t *)current->mem + prefix, unpack->data, unpack->size); current->size = prefix + unpack->size; } key_compare: /* * Compare the current key against the last key. * * Be careful about the 0th key on internal pages: we only store * the first byte and custom collators may not be able to handle * truncated keys. */ if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) || (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) { WT_ERR(__wt_compare( session, btree->collator, last, current, &cmp)); if (cmp >= 0) WT_ERR_VRFY(session, "the %" PRIu32 " and %" PRIu32 " keys on " "page at %s are incorrectly sorted", cell_num - 2, cell_num, addr); } /* * Swap the buffers: last always references the last key entry, * last_pfx and last_ovfl reference the last prefix-compressed * and last overflow key entries. Current gets pointed to the * buffer we're not using this time around, which is where the * next key goes. */ last = current; if (cell_type == WT_CELL_KEY) { current = last_pfx; last_pfx = last; } else { current = last_ovfl; last_ovfl = last; } WT_ASSERT(session, last != current); }
/* * __wt_schema_project_in -- * Given list of cursors and a projection, read columns from the * application into the dependent cursors. */ int __wt_schema_project_in(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, va_list ap) { WT_CURSOR *c; WT_DECL_ITEM(buf); WT_DECL_PACK(pack); WT_DECL_PACK_VALUE(pv); WT_PACK_VALUE old_pv; size_t len, offset, old_len; u_long arg; char *proj; uint8_t *p, *end; const uint8_t *next; p = end = NULL; /* -Wuninitialized */ /* Reset any of the buffers we will be setting. */ for (proj = (char *)proj_arg; *proj != '\0'; proj++) { arg = strtoul(proj, &proj, 10); if (*proj == WT_PROJ_KEY) { c = cp[arg]; WT_RET(__wt_buf_init(session, &c->key, 0)); } else if (*proj == WT_PROJ_VALUE) { c = cp[arg]; WT_RET(__wt_buf_init(session, &c->value, 0)); } } for (proj = (char *)proj_arg; *proj != '\0'; proj++) { arg = strtoul(proj, &proj, 10); switch (*proj) { case WT_PROJ_KEY: c = cp[arg]; if (WT_CURSOR_RECNO(c)) { c->key.data = &c->recno; c->key.size = sizeof(c->recno); WT_RET(__pack_init(session, &pack, "R")); } else WT_RET(__pack_init( session, &pack, c->key_format)); buf = &c->key; p = (uint8_t *)buf->data; end = p + buf->size; continue; case WT_PROJ_VALUE: c = cp[arg]; WT_RET(__pack_init(session, &pack, c->value_format)); buf = &c->value; p = (uint8_t *)buf->data; end = p + buf->size; continue; } /* We have to get a key or value before any operations. */ WT_ASSERT(session, buf != NULL); /* * Otherwise, the argument is a count, where a missing * count means a count of 1. */ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) { switch (*proj) { case WT_PROJ_SKIP: WT_RET(__pack_next(&pack, &pv)); /* * A nasty case: if we are inserting * out-of-order, we may reach the end of the * data. That's okay: we want to append in * that case, and we're positioned to do that. */ if (p == end) { /* Set up an empty value. */ WT_CLEAR(pv.u); if (pv.type == 'S' || pv.type == 's') pv.u.s = ""; WT_RET(__pack_size(session, &pv, &len)); WT_RET(__wt_buf_grow(session, buf, buf->size + len)); p = (uint8_t *)buf->mem + buf->size; WT_RET(__pack_write( session, &pv, &p, len)); buf->size += len; end = (uint8_t *)buf->mem + buf->size; } else if (*proj == WT_PROJ_SKIP) WT_RET(__unpack_read(session, &pv, (const uint8_t **)&p, (size_t)(end - p))); break; case WT_PROJ_NEXT: WT_RET(__pack_next(&pack, &pv)); WT_PACK_GET(session, pv, ap); /* FALLTHROUGH */ case WT_PROJ_REUSE: /* Read the item we're about to overwrite. */ next = p; if (p < end) { old_pv = pv; WT_RET(__unpack_read(session, &old_pv, &next, (size_t)(end - p))); } old_len = (size_t)(next - p); WT_RET(__pack_size(session, &pv, &len)); offset = WT_PTRDIFF(p, buf->mem); WT_RET(__wt_buf_grow(session, buf, buf->size + len)); p = (uint8_t *)buf->mem + offset; end = (uint8_t *)buf->mem + buf->size + len; /* Make room if we're inserting out-of-order. */ if (offset + old_len < buf->size) memmove(p + len, p + old_len, buf->size - (offset + old_len)); WT_RET(__pack_write(session, &pv, &p, len)); buf->size += len; break; default: WT_RET_MSG(session, EINVAL, "unexpected projection plan: %c", (int)*proj); } } } return (0); }
/* * __wt_txn_recover -- * Run recovery. */ int __wt_txn_recover(WT_CONNECTION_IMPL *conn) { WT_CURSOR *metac; WT_DECL_RET; WT_RECOVERY r; WT_SESSION_IMPL *session; struct WT_RECOVERY_FILE *metafile; char *config; int was_backup; WT_CLEAR(r); INIT_LSN(&r.ckpt_lsn); was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0; /* We need a real session for recovery. */ WT_RET(__wt_open_session(conn, NULL, NULL, &session)); F_SET(session, WT_SESSION_NO_LOGGING); r.session = session; WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); WT_ERR(__wt_metadata_cursor(session, NULL, &metac)); metafile = &r.files[WT_METAFILE_ID]; metafile->c = metac; /* * First, do a pass through the log to recover the metadata, and * establish the last checkpoint LSN. Skip this when opening a hot * backup: we already have the correct metadata in that case. */ if (!was_backup) { r.metadata_only = 1; if (IS_INIT_LSN(&metafile->ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r)); else WT_ERR(__wt_log_scan(session, &metafile->ckpt_lsn, 0, __txn_log_recover, &r)); WT_ASSERT(session, LOG_CMP(&r.ckpt_lsn, &conn->log->first_lsn) >= 0); } /* Scan the metadata to find the live files and their IDs. */ WT_ERR(__recovery_file_scan(&r)); /* * We no longer need the metadata cursor: close it to avoid pinning any * resources that could block eviction during recovery. */ r.files[0].c = NULL; WT_ERR(metac->close(metac)); /* * Now, recover all the files apart from the metadata. * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. */ r.metadata_only = 0; WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY, "Main recovery loop: starting at %u/%" PRIuMAX, r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset)); if (IS_INIT_LSN(&r.ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); else WT_ERR(__wt_log_scan(session, &r.ckpt_lsn, WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); conn->next_file_id = r.max_fileid; /* * If recovery ran successfully, forcibly log a checkpoint so the next * open is fast, and keep the metadata up to date with the checkpoint * LSN and archiving. */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); WT_TRET(session->iface.close(&session->iface, NULL)); return (ret); }
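/*
 * Recovery decides where to start scanning the log by comparing LSNs,
 * ordered by log file number and then by byte offset within the file.  A
 * standalone sketch of that comparison follows; the lsn struct, the
 * "initial" value and lsn_cmp() are stand-ins invented for this example,
 * not the WT_LSN, INIT_LSN and LOG_CMP definitions used above.
 */
#include <stdint.h>
#include <stdio.h>

struct lsn {
	uint32_t file;		/* Log file number. */
	uint64_t offset;	/* Byte offset within the file. */
};

static int
lsn_cmp(const struct lsn *a, const struct lsn *b)
{
	if (a->file != b->file)
		return (a->file < b->file ? -1 : 1);
	if (a->offset != b->offset)
		return (a->offset < b->offset ? -1 : 1);
	return (0);
}

int
main(void)
{
	struct lsn initial = { 1, 0 };		/* Start of the log. */
	struct lsn ckpt = { 3, 4096 };		/* Last checkpoint. */

	/* Scan from the checkpoint if there is one, else from the start. */
	const struct lsn *start =
	    lsn_cmp(&ckpt, &initial) == 0 ? &initial : &ckpt;
	printf("recover from %u/%llu\n",
	    start->file, (unsigned long long)start->offset);
	return (0);
}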
/* * __wt_schema_project_slice -- * Given list of cursors and a projection, read columns from the * a raw buffer. */ int __wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp, const char *proj_arg, bool key_only, const char *vformat, WT_ITEM *value) { WT_CURSOR *c; WT_DECL_ITEM(buf); WT_DECL_PACK(pack); WT_DECL_PACK_VALUE(pv); WT_DECL_PACK_VALUE(vpv); WT_PACK vpack; u_long arg; char *proj; uint8_t *end, *p; const uint8_t *next, *vp, *vend; size_t len, offset, old_len; bool skip; p = end = NULL; /* -Wuninitialized */ WT_RET(__pack_init(session, &vpack, vformat)); vp = value->data; vend = vp + value->size; /* Reset any of the buffers we will be setting. */ for (proj = (char *)proj_arg; *proj != '\0'; proj++) { arg = strtoul(proj, &proj, 10); if (*proj == WT_PROJ_KEY) { c = cp[arg]; WT_RET(__wt_buf_init(session, &c->key, 0)); } else if (*proj == WT_PROJ_VALUE && !key_only) { c = cp[arg]; WT_RET(__wt_buf_init(session, &c->value, 0)); } } skip = key_only; for (proj = (char *)proj_arg; *proj != '\0'; proj++) { arg = strtoul(proj, &proj, 10); switch (*proj) { case WT_PROJ_KEY: skip = false; c = cp[arg]; if (WT_CURSOR_RECNO(c)) { c->key.data = &c->recno; c->key.size = sizeof(c->recno); WT_RET(__pack_init(session, &pack, "R")); } else WT_RET(__pack_init( session, &pack, c->key_format)); buf = &c->key; p = (uint8_t *)buf->data; end = p + buf->size; continue; case WT_PROJ_VALUE: skip = key_only; if (skip) continue; c = cp[arg]; WT_RET(__pack_init(session, &pack, c->value_format)); buf = &c->value; p = (uint8_t *)buf->data; end = p + buf->size; continue; } /* We have to get a key or value before any operations. */ WT_ASSERT(session, skip || buf != NULL); /* * Otherwise, the argument is a count, where a missing * count means a count of 1. */ for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) { switch (*proj) { case WT_PROJ_SKIP: if (skip) break; WT_RET(__pack_next(&pack, &pv)); /* * A nasty case: if we are inserting * out-of-order, append a zero value to keep * the buffer in the correct format. */ if (p == end) { /* Set up an empty value. */ WT_CLEAR(pv.u); if (pv.type == 'S' || pv.type == 's') pv.u.s = ""; WT_RET(__pack_size(session, &pv, &len)); WT_RET(__wt_buf_grow(session, buf, buf->size + len)); p = (uint8_t *)buf->data + buf->size; WT_RET(__pack_write( session, &pv, &p, len)); end = p; buf->size += len; } else WT_RET(__unpack_read(session, &pv, (const uint8_t **)&p, (size_t)(end - p))); break; case WT_PROJ_NEXT: WT_RET(__pack_next(&vpack, &vpv)); WT_RET(__unpack_read(session, &vpv, &vp, (size_t)(vend - vp))); /* FALLTHROUGH */ case WT_PROJ_REUSE: if (skip) break; /* * Read the item we're about to overwrite. * * There is subtlety here: the value format * may not exactly match the cursor's format. * In particular, we need lengths with raw * columns in the middle of a packed struct, * but not if they are at the end of a struct. */ WT_RET(__pack_next(&pack, &pv)); next = p; if (p < end) WT_RET(__unpack_read(session, &pv, &next, (size_t)(end - p))); old_len = (size_t)(next - p); /* Make sure the types are compatible. */ WT_ASSERT(session, __wt_tolower((u_char)pv.type) == __wt_tolower((u_char)vpv.type)); pv.u = vpv.u; WT_RET(__pack_size(session, &pv, &len)); offset = WT_PTRDIFF(p, buf->data); /* * Avoid growing the buffer if the value fits. * This is not just a performance issue: it * covers the case of record number keys, which * have to be written to cursor->recno. 
*/ if (len > old_len) WT_RET(__wt_buf_grow(session, buf, buf->size + len - old_len)); p = (uint8_t *)buf->data + offset; /* Make room if we're inserting out-of-order. */ if (offset + old_len < buf->size) memmove(p + len, p + old_len, buf->size - (offset + old_len)); WT_RET(__pack_write(session, &pv, &p, len)); buf->size += len - old_len; end = (uint8_t *)buf->data + buf->size; break; default: WT_RET_MSG(session, EINVAL, "unexpected projection plan: %c", (int)*proj); } } } return (0); }
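/*
 * The WT_PROJ_NEXT/WT_PROJ_REUSE case above replaces an old_len-byte item
 * with a len-byte item in the middle of a packed buffer, growing the buffer
 * and shifting the tail with memmove() when the sizes differ.  The
 * buf_splice() helper below is a hypothetical standalone version of that
 * splice on a plain malloc'd buffer, invented for this example.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Replace old_len bytes at "offset" with the new_len bytes at "item". */
static int
buf_splice(uint8_t **bufp, size_t *sizep, size_t offset, size_t old_len,
    const void *item, size_t new_len)
{
	uint8_t *buf = *bufp, *tmp;
	size_t size = *sizep;

	if (new_len > old_len) {
		if ((tmp = realloc(buf, size + new_len - old_len)) == NULL)
			return (-1);
		*bufp = buf = tmp;
	}
	/* Shift the tail of the buffer to its new position. */
	memmove(buf + offset + new_len,
	    buf + offset + old_len, size - (offset + old_len));
	memcpy(buf + offset, item, new_len);
	*sizep = size + new_len - old_len;
	return (0);
}

int
main(void)
{
	uint8_t *buf;
	size_t size = 3;

	if ((buf = malloc(size)) == NULL)
		return (1);
	memcpy(buf, "ace", 3);

	/* Replace the one-byte item 'c' at offset 1 with the bytes "ppl". */
	if (buf_splice(&buf, &size, 1, 1, "ppl", 3) != 0)
		return (1);
	printf("%.*s\n", (int)size, (char *)buf);	/* "apple" */
	free(buf);
	return (0);
}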
/* * __wt_struct_plan -- * Given a table cursor containing a complete table, build the "projection * plan" to distribute the columns to dependent stores. A string * representing the plan will be appended to the plan buffer. */ int __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns, size_t len, int value_only, WT_ITEM *plan) { WT_BTREE *saved_btree; WT_CONFIG conf; WT_CONFIG_ITEM k, v; WT_DECL_RET; int cg, col, current_cg, current_col, start_cg, start_col; int i, have_it; char coltype, current_coltype; saved_btree = session->btree; start_cg = start_col = -1; /* -Wuninitialized */ /* Work through the value columns by skipping over the key columns. */ WT_ERR(__wt_config_initn(session, &conf, columns, len)); if (value_only) for (i = 0; i < table->nkey_columns; i++) WT_ERR(__wt_config_next(&conf, &k, &v)); current_cg = cg = 0; current_col = col = INT_MAX; current_coltype = coltype = WT_PROJ_KEY; /* Keep lint quiet. */ while (__wt_config_next(&conf, &k, &v) == 0) { have_it = 0; while (__find_next_col(session, table, &k, &cg, &col, &coltype) == 0 && (!have_it || cg != start_cg || col != start_col)) { /* * First we move to the column. If that is in a * different column group to the last column we * accessed, or before the last column in the same * column group, or moving from the key to the value, * we need to switch column groups or rewind. */ if (current_cg != cg || current_col > col || current_coltype != coltype) { WT_ASSERT(session, !value_only || coltype == WT_PROJ_VALUE); WT_ERR(__wt_buf_catfmt( session, plan, "%d%c", cg, coltype)); /* * Set the current column group and column * within the table. */ current_cg = cg; current_col = 0; current_coltype = coltype; } /* Now move to the column we want. */ if (current_col < col) { if (col - current_col > 1) WT_ERR(__wt_buf_catfmt(session, plan, "%d", col - current_col)); WT_ERR(__wt_buf_catfmt(session, plan, "%c", WT_PROJ_SKIP)); } /* * Now copy the value in / out. In the common case, * where each value is used in one column, we do a * "next" operation. If the value is used again, we do * a "reuse" operation to avoid making another copy. */ if (!have_it) { WT_ERR(__wt_buf_catfmt(session, plan, "%c", WT_PROJ_NEXT)); start_cg = cg; start_col = col; have_it = 1; } else WT_ERR(__wt_buf_catfmt(session, plan, "%c", WT_PROJ_REUSE)); current_col = col + 1; } } err: session->btree = saved_btree; return (ret); }
/* * __wt_col_modify -- * Column-store delete, insert, and update. */ int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove) { WT_BTREE *btree; WT_DECL_RET; WT_INSERT *ins; WT_INSERT_HEAD *ins_head, **ins_headp; WT_ITEM _value; WT_PAGE *page; WT_UPDATE *old_upd; size_t ins_size, upd_size; u_int i, skipdepth; int append, logged; btree = cbt->btree; ins = NULL; page = cbt->ref->page; append = logged = 0; /* This code expects a remove to have a NULL value. */ if (is_remove) { if (btree->type == BTREE_COL_FIX) { value = &_value; value->data = ""; value->size = 1; } else value = NULL; } else { /* * There's some chance the application specified a record past * the last record on the page. If that's the case, and we're * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the * append list, not the update list. In addition, a recno of 0 * implies an append operation, we're allocating a new row. */ if (recno == 0 || recno > (btree->type == BTREE_COL_VAR ? __col_var_last_recno(page) : __col_fix_last_recno(page))) append = 1; } /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); /* * Delete, insert or update a column-store entry. * * If modifying a previously modified record, create a new WT_UPDATE * entry and have a serialized function link it into an existing * WT_INSERT entry's WT_UPDATE list. * * Else, allocate an insert array as necessary, build a WT_INSERT and * WT_UPDATE structure pair, and call a serialized function to insert * the WT_INSERT structure. */ if (cbt->compare == 0 && cbt->ins != NULL) { /* * If we are restoring updates that couldn't be evicted, the * key must not exist on the new page. */ WT_ASSERT(session, upd == NULL); /* Make sure the update can proceed. */ WT_ERR(__wt_txn_update_check( session, old_upd = cbt->ins->upd)); /* Allocate a WT_UPDATE structure and transaction ID. */ WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, upd)); logged = 1; /* Avoid a data copy in WT_CURSOR.update. */ cbt->modify_update = upd; /* * Point the new WT_UPDATE item to the next element in the list. * If we get it right, the serialization function lock acts as * our memory barrier to flush this write. */ upd->next = old_upd; /* Serialize the update. */ WT_ERR(__wt_update_serial( session, page, &cbt->ins->upd, &upd, upd_size)); } else { /* Allocate the append/update list reference as necessary. */ if (append) { WT_PAGE_ALLOC_AND_SWAP(session, page, page->modify->mod_append, ins_headp, 1); ins_headp = &page->modify->mod_append[0]; } else if (page->type == WT_PAGE_COL_FIX) { WT_PAGE_ALLOC_AND_SWAP(session, page, page->modify->mod_update, ins_headp, 1); ins_headp = &page->modify->mod_update[0]; } else { WT_PAGE_ALLOC_AND_SWAP(session, page, page->modify->mod_update, ins_headp, page->pg_var_entries); ins_headp = &page->modify->mod_update[cbt->slot]; } /* Allocate the WT_INSERT_HEAD structure as necessary. */ WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1); ins_head = *ins_headp; /* Choose a skiplist depth for this insert. */ skipdepth = __wt_skip_choose_depth(session); /* * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and * update the cursor to reference it (the WT_INSERT_HEAD might * be allocated, the WT_INSERT was allocated). 
*/ WT_ERR(__col_insert_alloc( session, recno, skipdepth, &ins, &ins_size)); cbt->ins_head = ins_head; cbt->ins = ins; if (upd == NULL) { WT_ERR( __wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, upd)); logged = 1; /* Avoid a data copy in WT_CURSOR.update. */ cbt->modify_update = upd; } else upd_size = __wt_update_list_memsize(upd); ins->upd = upd; ins_size += upd_size; /* * If there was no insert list during the search, or there was * no search because the record number has not been allocated * yet, the cursor's information cannot be correct, search * couldn't have initialized it. * * Otherwise, point the new WT_INSERT item's skiplist to the * next elements in the insert list (which we will check are * still valid inside the serialization function). * * The serial mutex acts as our memory barrier to flush these * writes before inserting them into the list. */ if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0) for (i = 0; i < skipdepth; i++) { cbt->ins_stack[i] = &ins_head->head[i]; ins->next[i] = cbt->next_stack[i] = NULL; } else for (i = 0; i < skipdepth; i++) ins->next[i] = cbt->next_stack[i]; /* Append or insert the WT_INSERT structure. */ if (append) WT_ERR(__wt_col_append_serial( session, page, cbt->ins_head, cbt->ins_stack, &ins, ins_size, &cbt->recno, skipdepth)); else WT_ERR(__wt_insert_serial( session, page, cbt->ins_head, cbt->ins_stack, &ins, ins_size, skipdepth)); } /* If the update was successful, add it to the in-memory log. */ if (logged) WT_ERR(__wt_txn_log_op(session, cbt)); if (0) { err: /* * Remove the update from the current transaction, so we don't * try to modify it on rollback. */ if (logged) __wt_txn_unmodify(session); __wt_free(session, ins); __wt_free(session, upd); } return (ret); }
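/*
 * __wt_skip_choose_depth picks a random depth for the new skiplist entry.
 * The usual approach is a geometric distribution: keep adding levels while a
 * coin flip succeeds, up to some maximum.  The sketch below uses an invented
 * maximum depth of 10 and a 1-in-4 keep probability; the constants and
 * random source the real function uses may differ.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define	SKETCH_SKIP_MAXDEPTH	10

static unsigned
skip_choose_depth(void)
{
	unsigned depth;

	for (depth = 1;
	    depth < SKETCH_SKIP_MAXDEPTH && (rand() & 3) == 0; depth++)
		;
	return (depth);
}

int
main(void)
{
	unsigned count[SKETCH_SKIP_MAXDEPTH + 1] = { 0 };
	int i;

	srand((unsigned)time(NULL));
	for (i = 0; i < 100000; i++)
		count[skip_choose_depth()]++;
	/* Roughly 3/4 of the entries land at depth 1, 3/16 at depth 2, ... */
	for (i = 1; i <= SKETCH_SKIP_MAXDEPTH; i++)
		printf("depth %2d: %u\n", i, count[i]);
	return (0);
}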
/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_DECL_RET; WT_PAGE *page; int busy, force_attempts, oldgen; for (force_attempts = oldgen = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_DELETED: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, attempt to read it. * Make sure there is space in the cache. */ WT_RET(__wt_cache_full_check(session)); WT_RET(__wt_cache_read(session, ref)); oldgen = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* FALLTHROUGH */ case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* The page is busy -- wait. */ break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory: get a hazard pointer, update * the page's LRU and return. The expected reason we * can't get a hazard pointer is because the page is * being evicted; yield and try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) break; page = ref->page; WT_ASSERT(session, page != NULL); /* Forcibly evict pages that are too big. */ if (!LF_ISSET(WT_READ_NO_EVICT) && force_attempts < 10 && __evict_force_check(session, page)) { ++force_attempts; WT_RET(__wt_page_release(session, ref, flags)); break; } /* Check if we need an autocommit transaction. */ if ((ret = __wt_txn_autocommit_check(session)) != 0) { WT_TRET(__wt_hazard_clear(session, page)); return (ret); } /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. * * Otherwise, update the page's read generation. */ if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_set(session); return (0); WT_ILLEGAL_VALUE(session); } /* We failed to get the page -- yield before retrying. */ __wt_yield(); } }
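/*
 * A simplified sketch of the read-generation decision above: a page read by
 * a caller that says it won't need the page again is marked for early
 * eviction, otherwise the page is bumped to the current read generation so
 * eviction treats it as recently used.  The sketch_page type and the
 * constants are invented for this example; they are not the WiredTiger
 * definitions.
 */
#include <stdint.h>
#include <stdio.h>

#define	SKETCH_READGEN_NOTSET	0
#define	SKETCH_READGEN_OLDEST	1

struct sketch_page {
	uint64_t read_gen;
};

static void
page_touch(struct sketch_page *page, uint64_t *cache_gen, int wont_need)
{
	if (wont_need && page->read_gen == SKETCH_READGEN_NOTSET)
		page->read_gen = SKETCH_READGEN_OLDEST;	/* Evict soon. */
	else if (page->read_gen < *cache_gen)
		page->read_gen = ++*cache_gen;		/* Recently used. */
}

int
main(void)
{
	struct sketch_page scan_page = { SKETCH_READGEN_NOTSET };
	struct sketch_page hot_page = { SKETCH_READGEN_NOTSET };
	uint64_t cache_gen = 100;

	page_touch(&scan_page, &cache_gen, 1);	/* Bulk scan: evict soon. */
	page_touch(&hot_page, &cache_gen, 0);	/* Normal read: keep cached. */
	printf("scan page gen %llu, hot page gen %llu\n",
	    (unsigned long long)scan_page.read_gen,
	    (unsigned long long)hot_page.read_gen);
	return (0);
}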
/* * __curjoin_iter_set_entry -- * Set the current entry for an iterator. */ static int __curjoin_iter_set_entry(WT_CURSOR_JOIN_ITER *iter, u_int entry_pos) { WT_CURSOR *c, *to_dup; WT_CURSOR_JOIN *cjoin, *topjoin; WT_CURSOR_JOIN_ENTRY *entry; WT_DECL_RET; WT_SESSION_IMPL *session; size_t size; const char *raw_cfg[] = { WT_CONFIG_BASE( iter->session, WT_SESSION_open_cursor), "raw", NULL }; const char *def_cfg[] = { WT_CONFIG_BASE( iter->session, WT_SESSION_open_cursor), NULL }; const char **config; char *uri; session = iter->session; cjoin = iter->cjoin; uri = NULL; entry = iter->entry = &cjoin->entries[entry_pos]; iter->positioned = false; iter->entry_pos = entry_pos; iter->end_pos = 0; iter->is_equal = (entry->ends_next == 1 && WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_EQ); iter->end_skip = (entry->ends_next > 0 && WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_GE) ? 1 : 0; iter->end_count = WT_MIN(1, entry->ends_next); if (F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION)) { iter->entry_count = cjoin->entries_next; if (iter->is_equal) iter->end_count = entry->ends_next; } else iter->entry_count = 1; WT_ASSERT(iter->session, iter->entry_pos < iter->entry_count); entry->stats.iterated = 0; if (entry->subjoin == NULL) { for (topjoin = iter->cjoin; topjoin->parent != NULL; topjoin = topjoin->parent) ; to_dup = entry->ends[0].cursor; if (F_ISSET((WT_CURSOR *)topjoin, WT_CURSTD_RAW)) config = &raw_cfg[0]; else config = &def_cfg[0]; size = strlen(to_dup->internal_uri) + 3; WT_ERR(__wt_calloc(session, size, 1, &uri)); WT_ERR(__wt_snprintf(uri, size, "%s()", to_dup->internal_uri)); if ((c = iter->cursor) == NULL || strcmp(c->uri, uri) != 0) { iter->cursor = NULL; if (c != NULL) WT_ERR(c->close(c)); WT_ERR(__wt_open_cursor(session, uri, (WT_CURSOR *)topjoin, config, &iter->cursor)); } WT_ERR(__wt_cursor_dup_position(to_dup, iter->cursor)); } else if (iter->cursor != NULL) { WT_ERR(iter->cursor->close(iter->cursor)); iter->cursor = NULL; } err: __wt_free(session, uri); return (ret); }
/* * __wt_btcur_prev -- * Move to the previous record in the tree. */ int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int discard) { WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; int newpage; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_DSTAT_INCR(session, cursor_prev); flags = WT_TREE_SKIP_INTL | WT_TREE_PREV; /* Tree walk flags. */ if (discard) LF_SET(WT_TREE_DISCARD); retry: WT_RET(__cursor_func_init(cbt, 0)); __cursor_position_clear(cbt); /* * If we aren't already iterating in the right direction, there's * some setup to do. */ if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV)) __wt_btcur_iterate_setup(cbt, 0); /* * If this is a modification, we're about to read information from the * page, save the write generation. */ page = cbt->page; if (discard && page != NULL) { WT_ERR(__wt_page_modify_init(session, page)); WT_ORDERED_READ(cbt->write_gen, page->modify->write_gen); } /* * Walk any page we're holding until the underlying call returns not- * found. Then, move to the previous page, until we reach the start * of the file. */ for (newpage = 0;; newpage = 1) { if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_append_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret == 0) break; F_CLR(cbt, WT_CBT_ITERATE_APPEND); if (ret != WT_NOTFOUND) break; newpage = 1; } if (page != NULL) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_prev(cbt, newpage); break; case WT_PAGE_ROW_LEAF: ret = __cursor_row_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret != WT_NOTFOUND) break; } cbt->page = NULL; WT_ERR(__wt_tree_walk(session, &page, flags)); WT_ERR_TEST(page == NULL, WT_NOTFOUND); WT_ASSERT(session, page->type != WT_PAGE_COL_INT && page->type != WT_PAGE_ROW_INT); cbt->page = page; /* Initialize the page's modification information */ if (discard) { WT_ERR(__wt_page_modify_init(session, page)); WT_ORDERED_READ( cbt->write_gen, page->modify->write_gen); } /* * The last page in a column-store has appended entries. * We handle it separately from the usual cursor code: * it's only that one page and it's in a simple format. */ if (page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) F_SET(cbt, WT_CBT_ITERATE_APPEND); } err: if (ret == WT_RESTART) goto retry; WT_TRET(__cursor_func_resolve(cbt, ret)); return (ret); }
/* * __curjoin_entry_member -- * Do a membership check for a particular index that was joined, * if not a member, returns WT_NOTFOUND. */ static int __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, WT_ITEM *key, WT_CURSOR_JOIN_ITER *iter) { WT_CURJOIN_EXTRACTOR extract_cursor; WT_CURSOR *c; WT_CURSOR_STATIC_INIT(iface, __wt_cursor_get_key, /* get-key */ __wt_cursor_get_value, /* get-value */ __wt_cursor_set_key, /* set-key */ __wt_cursor_set_value, /* set-value */ __wt_cursor_compare_notsup, /* compare */ __wt_cursor_equals_notsup, /* equals */ __wt_cursor_notsup, /* next */ __wt_cursor_notsup, /* prev */ __wt_cursor_notsup, /* reset */ __wt_cursor_notsup, /* search */ __wt_cursor_search_near_notsup, /* search-near */ __curjoin_extract_insert, /* insert */ __wt_cursor_modify_notsup, /* modify */ __wt_cursor_notsup, /* update */ __wt_cursor_notsup, /* remove */ __wt_cursor_notsup, /* reserve */ __wt_cursor_reconfigure_notsup, /* reconfigure */ __wt_cursor_notsup, /* cache */ __wt_cursor_reopen_notsup, /* reopen */ __wt_cursor_notsup); /* close */ WT_DECL_RET; WT_INDEX *idx; WT_ITEM v; bool bloom_found; if (entry->subjoin == NULL && iter != NULL && (iter->end_pos + iter->end_skip >= entry->ends_next || (iter->end_skip > 0 && F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)))) return (0); /* no checks to make */ entry->stats.membership_check++; bloom_found = false; if (entry->bloom != NULL) { /* * If the item is not in the Bloom filter, we return * immediately, otherwise, we still may need to check the * long way, since it may be a false positive. * * If we don't own the Bloom filter, we must be sharing one * in a previous entry. So the shared filter has already * been checked and passed, we don't need to check it again. * We'll still need to check the long way. */ if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM)) WT_ERR(__wt_bloom_inmem_get(entry->bloom, key)); if (F_ISSET(entry, WT_CURJOIN_ENTRY_FALSE_POSITIVES)) return (0); bloom_found = true; } if (entry->subjoin != NULL) { WT_ASSERT(session, iter == NULL || entry->subjoin == iter->child->cjoin); ret = __curjoin_entries_in_range(session, entry->subjoin, key, iter == NULL ? NULL : iter->child); if (iter != NULL && WT_CURJOIN_ITER_CONSUMED(iter->child)) { WT_ERR(__curjoin_iter_bump(iter)); ret = WT_NOTFOUND; } return (ret); } if (entry->index != NULL) { /* * If this entry is used by the iterator, then we already * have the index key, and we won't have to do any * extraction either. */ if (iter != NULL && entry == iter->entry) WT_ITEM_SET(v, iter->idxkey); else { memset(&v, 0, sizeof(v)); /* Keep lint quiet. 
*/ c = entry->main; c->set_key(c, key); entry->stats.main_access++; if ((ret = c->search(c)) == 0) ret = c->get_value(c, &v); else if (ret == WT_NOTFOUND) { __wt_err(session, ret, "main table for join is missing entry"); ret = WT_ERROR; } WT_TRET(c->reset(c)); WT_ERR(ret); } } else WT_ITEM_SET(v, *key); if ((idx = entry->index) != NULL && idx->extractor != NULL && (iter == NULL || entry != iter->entry)) { WT_CLEAR(extract_cursor); extract_cursor.iface = iface; extract_cursor.iface.session = &session->iface; extract_cursor.iface.key_format = idx->exkey_format; extract_cursor.ismember = false; extract_cursor.entry = entry; WT_ERR(idx->extractor->extract(idx->extractor, &session->iface, key, &v, &extract_cursor.iface)); __wt_buf_free(session, &extract_cursor.iface.key); __wt_buf_free(session, &extract_cursor.iface.value); if (!extract_cursor.ismember) WT_ERR(WT_NOTFOUND); } else WT_ERR(__curjoin_entry_in_range(session, entry, &v, iter)); if (0) { err: if (ret == WT_NOTFOUND && bloom_found) entry->stats.bloom_false_positive++; } return (ret); }
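/*
 * The membership test above consults a Bloom filter first and only does the
 * expensive main-table lookup on a possible hit: the filter can return false
 * positives but never false negatives.  The toy filter below (a 64-bit
 * bitmap with two multiplicative hashes) is only meant to show that flow; it
 * is nothing like the real __wt_bloom implementation.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
hash(const char *key, uint64_t seed)
{
	uint64_t h = seed;

	for (; *key != '\0'; key++)
		h = h * 1099511628211ULL + (uint8_t)*key;
	return (h);
}

static void
bloom_insert(uint64_t *bits, const char *key)
{
	*bits |= 1ULL << (hash(key, 14695981039346656037ULL) % 64);
	*bits |= 1ULL << (hash(key, 0x9e3779b97f4a7c15ULL) % 64);
}

static int
bloom_maybe_contains(uint64_t bits, const char *key)
{
	return ((bits & (1ULL << (hash(key, 14695981039346656037ULL) % 64))) &&
	    (bits & (1ULL << (hash(key, 0x9e3779b97f4a7c15ULL) % 64))));
}

int
main(void)
{
	uint64_t bits = 0;

	bloom_insert(&bits, "alpha");
	bloom_insert(&bits, "beta");

	/* A miss means "definitely not a member"; a hit means "confirm". */
	printf("alpha: %s\n", bloom_maybe_contains(bits, "alpha") ?
	    "possible member, check the main table" : "not a member");
	printf("gamma: %s\n", bloom_maybe_contains(bits, "gamma") ?
	    "possible member, check the main table" : "not a member");
	return (0);
}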
/* * __wt_txn_get_snapshot -- * Allocate a snapshot. */ void __wt_txn_get_snapshot(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s, *txn_state; uint64_t current_id, id; uint64_t prev_oldest_id, snap_min; uint32_t i, n, session_cnt; int32_t count; conn = S2C(session); txn = &session->txn; txn_global = &conn->txn_global; txn_state = WT_SESSION_TXN_STATE(session); /* * We're going to scan. Increment the count of scanners to prevent the * oldest ID from moving forwards. Spin if the count is negative, * which indicates that some thread is moving the oldest ID forwards. */ do { if ((count = txn_global->scan_count) < 0) WT_PAUSE(); } while (count < 0 || !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1)); current_id = snap_min = txn_global->current; prev_oldest_id = txn_global->oldest_id; /* For pure read-only workloads, avoid scanning. */ if (prev_oldest_id == current_id) { txn_state->snap_min = current_id; __txn_sort_snapshot(session, 0, current_id); /* Check that the oldest ID has not moved in the meantime. */ if (prev_oldest_id == txn_global->oldest_id) { WT_ASSERT(session, txn_global->scan_count > 0); (void)__wt_atomic_subiv32(&txn_global->scan_count, 1); return; } } /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) { /* * Build our snapshot of any concurrent transaction IDs. * * Ignore: * - Our own ID: we always read our own updates. * - The ID if it is older than the oldest ID we saw. This * can happen if we race with a thread that is allocating * an ID -- the ID will not be used because the thread will * keep spinning until it gets a valid one. */ if (s != txn_state && (id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id)) { txn->snapshot[n++] = id; if (WT_TXNID_LT(id, snap_min)) snap_min = id; } } /* * If we got a new snapshot, update the published snap_min for this * session. */ WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, snap_min)); WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); txn_state->snap_min = snap_min; WT_ASSERT(session, txn_global->scan_count > 0); (void)__wt_atomic_subiv32(&txn_global->scan_count, 1); __txn_sort_snapshot(session, n, current_id); }
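/*
 * Building a snapshot amounts to scanning the per-session transaction slots,
 * recording every ID that was running at the time, and tracking the minimum
 * of those IDs as snap_min.  The sketch below uses a fixed array, an
 * invented TXN_NONE value and plain "<" comparisons; the real code walks the
 * global state table and uses the wrap-safe WT_TXNID_* comparisons.
 */
#include <stdint.h>
#include <stdio.h>

#define	TXN_NONE	0	/* Slot has no running transaction. */

int
main(void)
{
	uint64_t slots[] = { 17, TXN_NONE, 12, 25, TXN_NONE };
	uint64_t snapshot[5];
	uint64_t current_id = 30, snap_min = 30;
	unsigned i, n;

	for (i = n = 0; i < sizeof(slots) / sizeof(slots[0]); i++) {
		uint64_t id = slots[i];

		if (id == TXN_NONE)
			continue;
		snapshot[n++] = id;	/* Updates by this ID aren't visible. */
		if (id < snap_min)
			snap_min = id;
	}

	printf("current %llu, snap_min %llu, snapshot:",
	    (unsigned long long)current_id, (unsigned long long)snap_min);
	for (i = 0; i < n; i++)
		printf(" %llu", (unsigned long long)snapshot[i]);
	printf("\n");
	return (0);
}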
/* * __curjoin_init_next -- * Initialize the cursor join when the next function is first called. */ static int __curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, bool iterable) { WT_BLOOM *bloom; WT_CURSOR *origcur; WT_CURSOR_JOIN_ENDPOINT *end; WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2; WT_DECL_RET; size_t size; uint32_t f, k; char *mainbuf; const char *def_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), NULL }; const char *raw_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), "raw", NULL }; const char **config, *proj, *urimain; mainbuf = NULL; if (cjoin->entries_next == 0) WT_RET_MSG(session, EINVAL, "join cursor has not yet been joined with any other " "cursors"); /* Get a consistent view of our subordinate cursors if appropriate. */ __wt_txn_cursor_op(session); if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW)) config = &raw_cfg[0]; else config = &def_cfg[0]; urimain = cjoin->table->iface.name; if ((proj = cjoin->projection) != NULL) { size = strlen(urimain) + strlen(proj) + 1; WT_ERR(__wt_calloc(session, size, 1, &mainbuf)); WT_ERR(__wt_snprintf(mainbuf, size, "%s%s", urimain, proj)); urimain = mainbuf; } WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin, config, &cjoin->main)); jeend = &cjoin->entries[cjoin->entries_next]; for (je = cjoin->entries; je < jeend; je++) { if (je->subjoin != NULL) { WT_ERR(__curjoin_init_next(session, je->subjoin, iterable)); continue; } __wt_stat_join_init_single(&je->stats); /* * For a single compare=le/lt endpoint in any entry that may * be iterated, construct a companion compare=ge endpoint * that will actually be iterated. */ if (iterable && je->ends_next == 1 && F_ISSET(&je->ends[0], WT_CURJOIN_END_LT)) { origcur = je->ends[0].cursor; WT_ERR(__curjoin_insert_endpoint(session, je, 0, &end)); WT_ERR(__wt_open_cursor(session, origcur->uri, (WT_CURSOR *)cjoin, F_ISSET(origcur, WT_CURSTD_RAW) ? raw_cfg : def_cfg, &end->cursor)); end->flags = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ | WT_CURJOIN_END_OWN_CURSOR; WT_ERR(end->cursor->next(end->cursor)); F_CLR(je, WT_CURJOIN_ENTRY_DISJUNCTION); } for (end = &je->ends[0]; end < &je->ends[je->ends_next]; end++) WT_ERR(__curjoin_endpoint_init_key(session, je, end)); /* * Do any needed Bloom filter initialization. Ignore Bloom * filters for entries that will be iterated. They won't * help since these entries either don't need an inclusion * check or are doing any needed check during the iteration. */ if (!iterable && F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) { if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED) WT_ERR_MSG(session, EINVAL, "join cursors with Bloom filters cannot be " "used with read-uncommitted isolation"); if (je->bloom == NULL) { /* * Look for compatible filters to be shared, * pick compatible numbers for bit counts * and number of hashes. */ f = je->bloom_bit_count; k = je->bloom_hash_count; for (je2 = je + 1; je2 < jeend; je2++) if (F_ISSET(je2, WT_CURJOIN_ENTRY_BLOOM) && je2->count == je->count) { f = WT_MAX( je2->bloom_bit_count, f); k = WT_MAX( je2->bloom_hash_count, k); } je->bloom_bit_count = f; je->bloom_hash_count = k; WT_ERR(__wt_bloom_create(session, NULL, NULL, je->count, f, k, &je->bloom)); F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM); WT_ERR(__curjoin_init_bloom(session, cjoin, je, je->bloom)); /* * Share the Bloom filter, making all * config info consistent. 
*/ for (je2 = je + 1; je2 < jeend; je2++) if (F_ISSET(je2, WT_CURJOIN_ENTRY_BLOOM) && je2->count == je->count) { WT_ASSERT(session, je2->bloom == NULL); je2->bloom = je->bloom; je2->bloom_bit_count = f; je2->bloom_hash_count = k; } } else { /* * Create a temporary filter that we'll * merge into the shared one. The Bloom * parameters of the two filters must match. */ WT_ERR(__wt_bloom_create(session, NULL, NULL, je->count, je->bloom_bit_count, je->bloom_hash_count, &bloom)); WT_ERR(__curjoin_init_bloom(session, cjoin, je, bloom)); WT_ERR(__wt_bloom_intersection(je->bloom, bloom)); WT_ERR(__wt_bloom_close(bloom)); } } if (!F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION)) iterable = false; } F_SET(cjoin, WT_CURJOIN_INITIALIZED); err: __wt_free(session, mainbuf); return (ret); }
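/*
 * When two join entries share a Bloom filter, the later entry builds a
 * temporary filter and folds it into the shared one with an intersection;
 * for bit-array filters built with identical parameters that is a bitwise
 * AND of the bitmaps.  The toy 128-bit filter below is a stand-in for
 * __wt_bloom_intersection, invented for this example.
 */
#include <stdint.h>
#include <stdio.h>

#define	BLOOM_WORDS	2	/* 128 bits. */

static void
bloom_intersection(uint64_t *shared, const uint64_t *other)
{
	int i;

	/* Keep only the bits set in both filters. */
	for (i = 0; i < BLOOM_WORDS; i++)
		shared[i] &= other[i];
}

int
main(void)
{
	uint64_t shared[BLOOM_WORDS] = { 0xF0F0F0F0F0F0F0F0ULL, 0xFFULL };
	uint64_t temp[BLOOM_WORDS] = { 0x00F000F000F000F0ULL, 0x0FULL };

	bloom_intersection(shared, temp);
	printf("%016llx %016llx\n",
	    (unsigned long long)shared[0], (unsigned long long)shared[1]);
	return (0);
}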
/* * __wt_txn_update_oldest -- * Sweep the running transactions to update the oldest ID required. * !!! * If a data-source is calling the WT_EXTENSION_API.transaction_oldest * method (for the oldest transaction ID not yet visible to a running * transaction), and then comparing that oldest ID against committed * transactions to see if updates for a committed transaction are still * visible to running transactions, the oldest transaction ID may be * the same as the last committed transaction ID, if the transaction * state wasn't refreshed after the last transaction committed. Push * past the last committed transaction. */ void __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force) { WT_CONNECTION_IMPL *conn; WT_SESSION_IMPL *oldest_session; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s; uint64_t current_id, id, last_running, oldest_id, prev_oldest_id; uint32_t i, session_cnt; int32_t count; bool last_running_moved; conn = S2C(session); txn_global = &conn->txn_global; retry: current_id = last_running = txn_global->current; oldest_session = NULL; prev_oldest_id = txn_global->oldest_id; /* * For pure read-only workloads, or if the update isn't forced and the * oldest ID isn't too far behind, avoid scanning. */ if (prev_oldest_id == current_id || (!force && WT_TXNID_LT(current_id, prev_oldest_id + 100))) return; /* * We're going to scan. Increment the count of scanners to prevent the * oldest ID from moving forwards. Spin if the count is negative, * which indicates that some thread is moving the oldest ID forwards. */ do { if ((count = txn_global->scan_count) < 0) WT_PAUSE(); } while (count < 0 || !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1)); /* The oldest ID cannot change until the scan count goes to zero. */ prev_oldest_id = txn_global->oldest_id; current_id = oldest_id = last_running = txn_global->current; /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { /* * Update the oldest ID. * * Ignore: IDs older than the oldest ID we saw. This can happen * if we race with a thread that is allocating an ID -- the ID * will not be used because the thread will keep spinning until * it gets a valid one. */ if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) && WT_TXNID_LT(id, last_running)) last_running = id; /* * !!! * Note: Don't ignore snap_min values older than the previous * oldest ID. Read-uncommitted operations publish snap_min * values without incrementing scan_count to protect the global * table. See the comment in __wt_txn_cursor_op for * more details. */ if ((id = s->snap_min) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) { oldest_id = id; oldest_session = &conn->sessions[i]; } } if (WT_TXNID_LT(last_running, oldest_id)) oldest_id = last_running; /* The oldest ID can't move past any named snapshots. */ if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) oldest_id = id; /* Update the last running ID. */ last_running_moved = WT_TXNID_LT(txn_global->last_running, last_running); /* Update the oldest ID. */ if (WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) { /* * We know we want to update. Check if we're racing. 
*/ if (__wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) { WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LT(id, last_running)) last_running = id; if ((id = s->snap_min) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) oldest_id = id; } if (WT_TXNID_LT(last_running, oldest_id)) oldest_id = last_running; #ifdef HAVE_DIAGNOSTIC /* * Make sure the ID doesn't move past any named * snapshots. * * Don't include the read/assignment in the assert * statement. Coverity complains if there are * assignments only done in diagnostic builds, and * when the read is from a volatile. */ id = txn_global->nsnap_oldest_id; WT_ASSERT(session, id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); #endif if (WT_TXNID_LT(txn_global->last_running, last_running)) txn_global->last_running = last_running; if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) txn_global->oldest_id = oldest_id; WT_ASSERT(session, txn_global->scan_count == -1); txn_global->scan_count = 0; } else { /* * We wanted to update the oldest ID but we're racing * another thread. Retry if this is a forced update. */ WT_ASSERT(session, txn_global->scan_count > 0); (void)__wt_atomic_subiv32(&txn_global->scan_count, 1); if (force) { __wt_yield(); goto retry; } } } else { if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && current_id - oldest_id > 10000 && oldest_session != NULL) { (void)__wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 " pinned in session %d [%s]" " with snap_min %" PRIu64 "\n", oldest_id, oldest_session->id, oldest_session->lastop, oldest_session->txn.snap_min); } WT_ASSERT(session, txn_global->scan_count > 0); (void)__wt_atomic_subiv32(&txn_global->scan_count, 1); } }
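/*
 * A single-threaded demonstration of the scan_count protocol above: readers
 * hold the count above zero while scanning, and the update is only published
 * by the thread that swings the count from 1 (it is the only scanner) to -1.
 * This sketch uses C11 atomics rather than the WiredTiger atomic wrappers,
 * and compresses the retry logic.
 */
#include <stdatomic.h>
#include <stdio.h>

int
main(void)
{
	atomic_int scan_count = 0;
	int expected;

	/* Begin a scan: spin while an update is in progress, then bump. */
	do {
		if ((expected = atomic_load(&scan_count)) < 0)
			continue;
	} while (expected < 0 || !atomic_compare_exchange_strong(
	    &scan_count, &expected, expected + 1));

	/*
	 * Try to become the publisher: swinging the count from 1 to -1 only
	 * succeeds if we are the only scanner, which makes the update safe.
	 */
	expected = 1;
	if (atomic_compare_exchange_strong(&scan_count, &expected, -1)) {
		/* ... publish the new oldest ID here ... */
		atomic_store(&scan_count, 0);
		printf("published\n");
	} else {
		/* Racing other scanners: drop our reference and give up. */
		(void)atomic_fetch_sub(&scan_count, 1);
		printf("raced, not published\n");
	}
	return (0);
}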
/* * __wt_evict_file -- * Discard pages for a specific file. */ int __wt_evict_file(WT_SESSION_IMPL *session, int syncop) { WT_DECL_RET; WT_PAGE *page; WT_REF *next_ref, *ref; bool evict_reset; /* * We need exclusive access to the file -- disable ordinary eviction * and drain any blocks already queued. */ WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset)); /* Make sure the oldest transaction ID is up-to-date. */ __wt_txn_update_oldest(session, true); /* Walk the tree, discarding pages. */ next_ref = NULL; WT_ERR(__wt_tree_walk(session, &next_ref, NULL, WT_READ_CACHE | WT_READ_NO_EVICT)); while ((ref = next_ref) != NULL) { page = ref->page; /* * Eviction can fail when a page in the evicted page's subtree * switches state. For example, if we don't evict a page marked * empty, because we expect it to be merged into its parent, it * might no longer be empty after it's reconciled, in which case * eviction of its parent would fail. We can either walk the * tree multiple times (until it's finally empty), or reconcile * each page to get it to its final state before considering if * it's an eviction target or will be merged into its parent. * * Don't limit this test to any particular page type, that tends * to introduce bugs when the reconciliation of other page types * changes, and there's no advantage to doing so. * * Eviction can also fail because an update cannot be written. * If sessions have disjoint sets of files open, updates in a * no-longer-referenced file may not yet be globally visible, * and the write will fail with EBUSY. Our caller handles that * error, retrying later. */ if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page)) WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING)); /* * We can't evict the page just returned to us (it marks our * place in the tree), so move the walk to one page ahead of * the page being evicted. Note, we reconciled the returned * page first: if reconciliation of that page were to change * the shape of the tree, and we did the next walk call before * the reconciliation, the next walk call could miss a page in * the tree. */ WT_ERR(__wt_tree_walk(session, &next_ref, NULL, WT_READ_CACHE | WT_READ_NO_EVICT)); switch (syncop) { case WT_SYNC_CLOSE: /* * Evict the page. */ WT_ERR(__wt_evict(session, ref, 1)); break; case WT_SYNC_DISCARD: /* * Dead handles may reference dirty pages; clean the * page, both to keep statistics correct, and to let * the page-discard function assert no dirty page is * ever discarded. */ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) __wt_page_modify_clear(session, page); WT_ASSERT(session, F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || __wt_page_can_evict(session, ref, false, NULL)); __wt_evict_page_clean_update(session, ref, 1); break; WT_ILLEGAL_VALUE_ERR(session); } } if (0) { err: /* On error, clear any left-over tree walk. */ if (next_ref != NULL) WT_TRET(__wt_page_release( session, next_ref, WT_READ_NO_EVICT)); } if (evict_reset) __wt_evict_file_exclusive_off(session); return (ret); }
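/*
 * The eviction walk above cannot discard the page it is positioned on, so it
 * always steps to the next page before operating on the one it just left.
 * The same one-ahead pattern, shown on a plain singly linked list invented
 * for this example:
 */
#include <stdio.h>
#include <stdlib.h>

struct node {
	int value;
	struct node *next;
};

int
main(void)
{
	struct node *head, *cur, *next;
	int i;

	head = NULL;
	for (i = 0; i < 3; i++) {		/* Build the list 2 -> 1 -> 0. */
		if ((cur = malloc(sizeof(*cur))) == NULL)
			return (1);
		cur->value = i;
		cur->next = head;
		head = cur;
	}

	/* Step one node ahead before discarding, like the tree walk above. */
	for (cur = head; cur != NULL; cur = next) {
		next = cur->next;
		printf("discard %d\n", cur->value);
		free(cur);
	}
	return (0);
}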
/* * __wt_txn_commit -- * Commit the current transaction. */ int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_TXN *txn; WT_TXN_OP *op; u_int i; txn = &session->txn; conn = S2C(session); WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0); if (!F_ISSET(txn, WT_TXN_RUNNING)) WT_RET_MSG(session, EINVAL, "No transaction is active"); /* * The default sync setting is inherited from the connection, but can * be overridden by an explicit "sync" setting for this transaction. */ WT_RET(__wt_config_gets_def(session, cfg, "sync", 0, &cval)); /* * If the user chose the default setting, check whether sync is enabled * for this transaction (either inherited or via begin_transaction). * If sync is disabled, clear the field to avoid the log write being * flushed. * * Otherwise check for specific settings. We don't need to check for * "on" because that is the default inherited from the connection. If * the user set anything in begin_transaction, we only override with an * explicit setting. */ if (cval.len == 0) { if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) && !F_ISSET(txn, WT_TXN_SYNC_SET)) txn->txn_logsync = 0; } else { /* * If the caller already set sync on begin_transaction then * they should not be using sync on commit_transaction. * Flag that as an error. */ if (F_ISSET(txn, WT_TXN_SYNC_SET)) WT_RET_MSG(session, EINVAL, "Sync already set during begin_transaction."); if (WT_STRING_MATCH("background", cval.str, cval.len)) txn->txn_logsync = WT_LOG_BACKGROUND; else if (WT_STRING_MATCH("off", cval.str, cval.len)) txn->txn_logsync = 0; /* * We don't need to check for "on" here because that is the * default to inherit from the connection setting. */ } /* Commit notification. */ if (txn->notify != NULL) WT_TRET(txn->notify->notify(txn->notify, (WT_SESSION *)session, txn->id, 1)); /* If we are logging, write a commit log record. */ if (ret == 0 && txn->mod_count > 0 && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && !F_ISSET(session, WT_SESSION_NO_LOGGING)) { /* * We are about to block on I/O writing the log. * Release our snapshot in case it is keeping data pinned. * This is particularly important for checkpoints. */ __wt_txn_release_snapshot(session); ret = __wt_txn_log_commit(session, cfg); } /* * If anything went wrong, roll back. * * !!! * Nothing can fail after this point. */ if (ret != 0) { WT_TRET(__wt_txn_rollback(session, cfg)); return (ret); } /* Free memory associated with updates. */ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) __wt_txn_op_free(session, op); txn->mod_count = 0; /* * We are about to release the snapshot: copy values into any * positioned cursors so they don't point to updates that could be * freed once we don't have a transaction ID pinned. */ if (session->ncursors > 0) WT_RET(__wt_session_copy_values(session)); __wt_txn_release(session); return (0); }
/* * __wt_block_write_off -- * Write a buffer into a block, returning the block's addr/size and * checksum. */ int __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, off_t *offsetp, uint32_t *sizep, uint32_t *cksump, int data_cksum, int locked) { WT_BLOCK_HEADER *blk; WT_DECL_RET; WT_FH *fh; off_t offset; uint32_t align_size; blk = WT_BLOCK_HEADER_REF(buf->mem); fh = block->fh; /* Buffers should be aligned for writing. */ if (!F_ISSET(buf, WT_ITEM_ALIGNED)) { WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED)); WT_RET_MSG(session, EINVAL, "direct I/O check: write buffer incorrectly allocated"); } /* * Align the size to an allocation unit. * * The buffer must be big enough for us to zero to the next allocsize * boundary, this is one of the reasons the btree layer must find out * from the block-manager layer the maximum size of the eventual write. */ align_size = (uint32_t)WT_ALIGN(buf->size, block->allocsize); if (align_size > buf->memsize) { WT_ASSERT(session, align_size <= buf->memsize); WT_RET_MSG(session, EINVAL, "buffer size check: write buffer incorrectly allocated"); } /* Zero out any unused bytes at the end of the buffer. */ memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size); /* * Set the disk size so we don't have to incrementally read blocks * during salvage. */ blk->disk_size = align_size; /* * Update the block's checksum: if our caller specifies, checksum the * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP * bytes. The assumption is applications with good compression support * turn off checksums and assume corrupted blocks won't decompress * correctly. However, if compression failed to shrink the block, the * block wasn't compressed, in which case our caller will tell us to * checksum the data to detect corruption. If compression succeeded, * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes * because they're not compressed, both to give salvage a quick test * of whether a block is useful and to give us a test so we don't lose * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing. */ blk->flags = 0; if (data_cksum) F_SET(blk, WT_BLOCK_DATA_CKSUM); blk->cksum = 0; blk->cksum = __wt_cksum( buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP); if (!locked) __wt_spin_lock(session, &block->live_lock); ret = __wt_block_alloc(session, block, &offset, (off_t)align_size); if (!locked) __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); #if defined(HAVE_POSIX_FALLOCATE) || defined(HAVE_FTRUNCATE) /* * Extend the file in chunks. We aren't holding a lock and we'd prefer * to limit the number of threads extending the file at the same time, * so choose the one thread that's crossing the extended boundary. We * don't extend newly created files, and it's theoretically possible we * might wait so long our extension of the file is passed by another * thread writing single blocks, that's why there's a check in case the * extended file size becomes too small: if the file size catches up, * every thread will try to extend it. 
*/ if (fh->extend_len != 0 && (fh->extend_size <= fh->size || (offset + fh->extend_len <= fh->extend_size && offset + fh->extend_len + align_size >= fh->extend_size))) { fh->extend_size = offset + fh->extend_len * 2; #if defined(HAVE_POSIX_FALLOCATE) if ((ret = posix_fallocate(fh->fd, offset, fh->extend_len * 2)) != 0) WT_RET_MSG( session, ret, "%s: posix_fallocate", fh->name); #elif defined(HAVE_FTRUNCATE) if ((ret = ftruncate(fh->fd, fh->extend_size)) != 0) WT_RET_MSG(session, ret, "%s: ftruncate", fh->name); #endif } #endif if ((ret = __wt_write(session, fh, offset, align_size, buf->mem)) != 0) { if (!locked) __wt_spin_lock(session, &block->live_lock); WT_TRET( __wt_block_off_free(session, block, offset, align_size)); if (!locked) __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); } #ifdef HAVE_SYNC_FILE_RANGE /* * Optionally schedule writes for dirty pages in the system buffer * cache. */ if (block->os_cache_dirty_max != 0 && (block->os_cache_dirty += align_size) > block->os_cache_dirty_max) { block->os_cache_dirty = 0; if ((ret = sync_file_range(fh->fd, (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE)) != 0) WT_RET_MSG( session, ret, "%s: sync_file_range", block->name); } #endif #ifdef HAVE_POSIX_FADVISE /* Optionally discard blocks from the system buffer cache. */ if (block->os_cache_max != 0 && (block->os_cache += align_size) > block->os_cache_max) { block->os_cache = 0; if ((ret = posix_fadvise(fh->fd, (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0) WT_RET_MSG( session, ret, "%s: posix_fadvise", block->name); } #endif WT_CSTAT_INCR(session, block_write); WT_CSTAT_INCRV(session, block_byte_write, align_size); WT_VERBOSE_RET(session, write, "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32, (uintmax_t)offset, align_size, blk->cksum); *offsetp = offset; *sizep = align_size; *cksump = blk->cksum; return (ret); }
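/*
 * Two of the calculations above in standalone form: rounding the write size
 * up to the allocation unit, and deciding whether this write is the one that
 * should extend the file by another chunk.  ALIGN_UP and need_extend() are
 * simplified stand-ins invented for this example, and the constants in
 * main() are arbitrary.
 */
#include <stdint.h>
#include <stdio.h>

/* Round "size" up to a multiple of "align" (align must be a power of two). */
#define	ALIGN_UP(size, align)	(((size) + (align) - 1) & ~((align) - 1))

static int
need_extend(uint64_t extend_len, uint64_t extend_size, uint64_t file_size,
    uint64_t offset, uint64_t write_size)
{
	if (extend_len == 0)
		return (0);
	/* The file caught up with the last extension: extend again. */
	if (extend_size <= file_size)
		return (1);
	/* This write crosses the current extension boundary: extend again. */
	return (offset + extend_len <= extend_size &&
	    offset + extend_len + write_size >= extend_size);
}

int
main(void)
{
	uint32_t allocsize = 4096;
	uint32_t size = ALIGN_UP(10000, allocsize);

	printf("aligned write size %u\n", size);	/* 12288 */
	printf("extend the file now? %d\n", need_extend(
	    10u << 20, 64u << 20, 65u << 20, 63u << 20, size));
	return (0);
}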
/* * __wt_rec_evict -- * Reconciliation plus eviction. */ int __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive) { WT_DECL_RET; WT_PAGE_MODIFY *mod; int merge; WT_VERBOSE_RET(session, evict, "page %p (%s)", page, __wt_page_type_string(page->type)); WT_ASSERT(session, session->excl_next == 0); /* * If we get a split-merge page during normal eviction, try to collapse * it. During close, it will be merged into its parent. */ mod = page->modify; merge = __wt_btree_mergeable(page); if (merge && exclusive) return (EBUSY); WT_ASSERT(session, merge || mod == NULL || !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)); /* * Get exclusive access to the page and review the page and its subtree * for conditions that would block our eviction of the page. If the * check fails (for example, we find a child page that can't be merged), * we're done. We have to make this check for clean pages, too: while * unlikely eviction would choose an internal page with children, it's * not disallowed anywhere. * * Note that page->ref may be NULL in some cases (e.g., for root pages * or during salvage). That's OK if exclusive is set: we won't check * hazard pointers in that case. */ WT_ERR(__rec_review(session, page->ref, page, exclusive, merge, 1)); /* Try to merge internal pages. */ if (merge) WT_ERR(__wt_merge_tree(session, page)); /* * Update the page's modification reference, reconciliation might have * changed it. */ mod = page->modify; /* Count evictions of internal pages during normal operation. */ if (!exclusive && !merge && (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) { WT_CSTAT_INCR(session, cache_eviction_internal); WT_DSTAT_INCR(session, cache_eviction_internal); } /* * Update the parent and discard the page. */ if (mod == NULL || !F_ISSET(mod, WT_PM_REC_MASK)) { WT_ASSERT(session, exclusive || page->ref->state == WT_REF_LOCKED); if (WT_PAGE_IS_ROOT(page)) __rec_root_update(session); else __rec_page_clean_update(session, page); /* Discard the page. */ __rec_discard_page(session, page, exclusive); WT_CSTAT_INCR(session, cache_eviction_clean); WT_DSTAT_INCR(session, cache_eviction_clean); } else { if (WT_PAGE_IS_ROOT(page)) __rec_root_update(session); else WT_ERR(__rec_page_dirty_update(session, page)); /* Discard the tree rooted in this page. */ __rec_discard_tree(session, page, exclusive); WT_CSTAT_INCR(session, cache_eviction_dirty); WT_DSTAT_INCR(session, cache_eviction_dirty); } if (0) { err: /* * If unable to evict this page, release exclusive reference(s) * we've acquired. */ __rec_excl_clear(session); WT_CSTAT_INCR(session, cache_eviction_fail); WT_DSTAT_INCR(session, cache_eviction_fail); } session->excl_next = 0; return (ret); }