/*
 * __curds_insert --
 *	WT_CURSOR.insert method for the data-source cursor type.
 *
 *	Forwards the insert to the underlying data-source cursor after
 *	setting up the transaction, statistics and the key/value items.
 */
static int
__curds_insert(WT_CURSOR *cursor)
{
	WT_CURSOR *source;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;

	CURSOR_UPDATE_API_CALL(cursor, session, insert, NULL);

	WT_ERR(__curds_txn_enter(session));

	/* Count the insert and the key/value bytes written. */
	WT_STAT_FAST_CONN_INCR(session, cursor_insert);
	WT_STAT_FAST_DATA_INCR(session, cursor_insert);
	WT_STAT_FAST_DATA_INCRV(session,
	    cursor_insert_bytes, cursor->key.size + cursor->value.size);

	/* In append mode the data source allocates the key itself. */
	if (!F_ISSET(cursor, WT_CURSTD_APPEND))
		WT_ERR(__curds_key_set(cursor));
	WT_ERR(__curds_value_set(cursor));
	ret = __curds_cursor_resolve(cursor, source->insert(source));

err:	__curds_txn_leave(session);

	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}
/*
 * __curds_remove --
 *	WT_CURSOR.remove method for the data-source cursor type.
 *
 *	Forwards the remove to the underlying data-source cursor; only the
 *	key is required, there is no value for a remove.
 */
static int
__curds_remove(WT_CURSOR *cursor)
{
	WT_CURSOR *source;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;

	CURSOR_UPDATE_API_CALL(cursor, session, remove, NULL);

	/* Count the remove and the key bytes removed. */
	WT_STAT_FAST_CONN_INCR(session, cursor_remove);
	WT_STAT_FAST_DATA_INCR(session, cursor_remove);
	WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);

	WT_ERR(__curds_txn_enter(session));

	WT_ERR(__curds_key_set(cursor));
	ret = __curds_cursor_resolve(cursor, source->remove(source));

err:	__curds_txn_leave(session);

	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}
/*
 * __wt_btcur_next_random --
 *	Move to a random record in the tree.
 *
 * NOTE(review): this file contains a second, different definition of
 * __wt_btcur_next_random (below); they appear to come from different
 * versions of the code — confirm which one is intended and drop the other.
 */
int
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = cbt->btree;

	/*
	 * Only supports row-store: applications can trivially select a random
	 * value from a column-store, if there were any reason to do so.
	 */
	if (btree->type != BTREE_ROW)
		WT_RET(ENOTSUP);

	WT_STAT_FAST_CONN_INCR(session, cursor_next);
	WT_STAT_FAST_DATA_INCR(session, cursor_next);

	WT_RET(__cursor_func_init(cbt, 1));

	WT_ERR(__wt_row_random(session, cbt));
	/* If the random position isn't visible, settle for a nearby record. */
	if (__cursor_valid(cbt, &upd))
		WT_ERR(__wt_kv_return(session, cbt, upd));
	else
		WT_ERR(__wt_btcur_search_near(cbt, 0));

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __curlog_search --
 *	WT_CURSOR.search method for the log cursor type.
 *
 *	Positions the cursor on the log record at the LSN given in the key.
 */
static int
__curlog_search(WT_CURSOR *cursor)
{
	WT_CURSOR_LOG *cl;
	WT_DECL_RET;
	WT_LSN key;
	WT_SESSION_IMPL *session;
	uint32_t counter;

	cl = (WT_CURSOR_LOG *)cursor;

	CURSOR_API_CALL(cursor, session, search, NULL);

	/*
	 * !!! We are ignoring the counter and only searching based on the LSN.
	 */
	WT_ERR(__wt_cursor_get_key((WT_CURSOR *)cl,
	    &key.file, &key.offset, &counter));
	/* Scan exactly one record at that LSN and load it into the cursor. */
	WT_ERR(__wt_log_scan(session, &key, WT_LOGSCAN_ONE,
	    __curlog_logrec, cl));
	WT_ERR(__curlog_kv(session, cursor));
	WT_STAT_FAST_CONN_INCR(session, cursor_search);
	WT_STAT_FAST_DATA_INCR(session, cursor_search);

err:	API_END_RET(session, ret);
}
/*
 * __ovfl_read --
 *	Read an overflow item from the disk.
 *
 *	Reads the referenced block into "store", then adjusts the buffer's
 *	data pointer/size to reference the payload past the page header.
 *
 *	Overflow reads are synchronous.  That may bite me at some point, but
 *	WiredTiger supports large page sizes, overflow items should be rare.
 */
static int
__ovfl_read(WT_SESSION_IMPL *session,
    const uint8_t *addr, size_t addr_size, WT_ITEM *store)
{
	const WT_PAGE_HEADER *phdr;

	/* Read the raw block from the block manager. */
	WT_RET(__wt_bt_read(session, store, addr, addr_size));

	/* Step past the page header: reference the data and its length. */
	phdr = store->data;
	store->size = phdr->u.datalen;
	store->data = WT_PAGE_HEADER_BYTE(S2BT(session), phdr);

	WT_STAT_FAST_DATA_INCR(session, cache_read_overflow);

	return (0);
}
/*
 * __hazard_exclusive --
 *	Request exclusive access to a page.
 *
 *	"top" is nonzero when the caller already holds the top-level page in
 *	the locked state; returns EBUSY if the page can't be locked or some
 *	other thread holds a hazard pointer on it.
 */
static int
__hazard_exclusive(WT_SESSION_IMPL *session, WT_REF *ref, int top)
{
	/*
	 * Make sure there is space to track exclusive access so we can unlock
	 * to clean up.
	 */
	WT_RET(__wt_realloc_def(session, &session->excl_allocated,
	    session->excl_next + 1, &session->excl));

	/*
	 * Hazard pointers are acquired down the tree, which means we can't
	 * deadlock.
	 *
	 * Request exclusive access to the page.  The top-level page should
	 * already be in the locked state, lock child pages in memory.
	 * If another thread already has this page, give up.
	 */
	if (!top && !WT_ATOMIC_CAS(ref->state, WT_REF_MEM, WT_REF_LOCKED))
		return (EBUSY);	/* We couldn't change the state. */
	WT_ASSERT(session, ref->state == WT_REF_LOCKED);

	/* Record the lock so cleanup can unlock it on failure. */
	session->excl[session->excl_next++] = ref;

	/* Check for a matching hazard pointer. */
	if (__wt_page_hazard_check(session, ref->page) == NULL)
		return (0);

	/* A hazard pointer exists: count the failed eviction attempt. */
	WT_STAT_FAST_DATA_INCR(session, cache_eviction_hazard);
	WT_STAT_FAST_CONN_INCR(session, cache_eviction_hazard);

	WT_VERBOSE_RET(
	    session, evict, "page %p hazard request failed", ref->page);
	return (EBUSY);
}
/*
 * __curbulk_insert_fix_bitmap --
 *	Fixed-length column-store bulk cursor insert for bitmaps.
 *
 *	Requires a value; the key is implicit in bitmap bulk loads.
 */
static int
__curbulk_insert_fix_bitmap(WT_CURSOR *cursor)
{
	WT_BTREE *btree;
	WT_CURSOR_BULK *cbulk;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbulk = (WT_CURSOR_BULK *)cursor;
	btree = cbulk->cbt.btree;

	/*
	 * Bulk cursor inserts are updates, but don't need auto-commit
	 * transactions because they are single-threaded and not visible
	 * until the bulk cursor is closed.
	 */
	CURSOR_API_CALL(cursor, session, insert, btree);

	WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);

	WT_CURSOR_CHECKVALUE(cursor);

	/* Insert the current record. */
	ret = __wt_bulk_insert_fix_bitmap(session, cbulk);

err:	API_END_RET(session, ret);
}
/*
 * __wt_btcur_next_random --
 *	Move to a random record in the tree.
 *
 * NOTE(review): a second, different definition of __wt_btcur_next_random
 * appears earlier in this file; the two look like different code vintages
 * (this one returns WT_NOTFOUND on a non-exact match instead of falling
 * back to search-near) — confirm which is intended and drop the other.
 */
int
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = cbt->btree;

	WT_STAT_FAST_CONN_INCR(session, cursor_next);
	WT_STAT_FAST_DATA_INCR(session, cursor_next);

	WT_RET(__cursor_func_init(cbt, 1));

	/*
	 * Only supports row-store: applications can trivially select a random
	 * value from a column-store, if there were any reason to do so.
	 */
	WT_ERR(btree->type == BTREE_ROW ?
	    __wt_row_random(session, cbt) : ENOTSUP);
	ret = cbt->compare == 0 ?
	    __wt_kv_return(session, cbt) : WT_NOTFOUND;

err:	if (ret != 0)
		WT_TRET(__cursor_error_resolve(cbt));
	return (ret);
}
/*
 * __curlog_next --
 *	WT_CURSOR.next method for the step log cursor type.
 *
 *	Steps through operations within a record before scanning for the
 *	next log record.
 */
static int
__curlog_next(WT_CURSOR *cursor)
{
	WT_CURSOR_LOG *cl;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cl = (WT_CURSOR_LOG *)cursor;

	CURSOR_API_CALL(cursor, session, next, NULL);

	/*
	 * If we don't have a record, or went to the end of the record we
	 * have, or we are in the zero-fill portion of the record, get a
	 * new one.
	 */
	if (cl->stepp == NULL || cl->stepp >= cl->stepp_end || !*cl->stepp) {
		cl->txnid = 0;
		WT_ERR(__wt_log_scan(session, cl->next_lsn, WT_LOGSCAN_ONE,
		    __curlog_logrec, cl));
	}
	WT_ASSERT(session, cl->logrec->data != NULL);
	WT_ERR(__curlog_kv(session, cursor));
	WT_STAT_FAST_CONN_INCR(session, cursor_next);
	WT_STAT_FAST_DATA_INCR(session, cursor_next);

err:	API_END_RET(session, ret);
}
/*
 * __wt_btcur_reset --
 *	Invalidate the cursor position.
 *
 *	Counts the reset, then discards any position the cursor holds.
 */
int
__wt_btcur_reset(WT_CURSOR_BTREE *cbt)
{
	WT_SESSION_IMPL *s;

	s = (WT_SESSION_IMPL *)cbt->iface.session;

	/* Bump the reset counters before dropping the position. */
	WT_STAT_FAST_DATA_INCR(s, cursor_reset);
	WT_STAT_FAST_CONN_INCR(s, cursor_reset);

	return (__cursor_reset(cbt));
}
/*
 * __cursor_truncate_fix --
 *	Discard a cursor range from fixed-width column-store tree.
 *
 *	"rmfunc" performs the per-record removal; already-deleted (zero
 *	valued) records are skipped, see below.
 */
static int
__cursor_truncate_fix(WT_SESSION_IMPL *session,
    WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
    int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, int))
{
	WT_DECL_RET;
	uint8_t *value;

	/*
	 * Handle fixed-length column-store objects separately: for row-store
	 * and variable-length column-store objects we have "deleted" values
	 * and so returned objects actually exist: fixed-length column-store
	 * objects are filled-in if they don't exist, that is, if you create
	 * record 37, records 1-36 magically appear.  Those records can't be
	 * deleted, which means we have to ignore already "deleted" records.
	 *
	 * First, call the standard cursor remove method to do a full search and
	 * re-position the cursor because we don't have a saved copy of the
	 * page's write generation information, which we need to remove records.
	 * Once that's done, we can delete records without a full search, unless
	 * we encounter a restart error because the page was modified by some
	 * other thread of control; in that case, repeat the full search to
	 * refresh the page's modification information.
	 */
retry:	WT_RET(__wt_btcur_remove(start));
	/*
	 * Reset ret each time through so that we don't loop forever in
	 * the cursor equals case.
	 */
	for (ret = 0;;) {
		if (stop != NULL && __cursor_equals(start, stop))
			break;
		if ((ret = __wt_btcur_next(start, 1)) != 0)
			break;
		start->compare = 0;	/* Exact match */
		/* Skip implicitly-created records (value is zero). */
		value = (uint8_t *)start->iface.value.data;
		if (*value != 0 && (ret = rmfunc(session, start, 1)) != 0)
			break;
	}

	if (ret == WT_RESTART) {
		/* The page changed under us: count it and search again. */
		WT_STAT_FAST_CONN_INCR(session, cursor_restart);
		WT_STAT_FAST_DATA_INCR(session, cursor_restart);
		goto retry;
	}

	WT_RET_NOTFOUND_OK(ret);
	return (0);
}
/*
 * __cursor_truncate --
 *	Discard a cursor range from row-store or variable-width column-store
 * tree.
 *
 *	"rmfunc" performs the per-record removal.
 */
static int
__cursor_truncate(WT_SESSION_IMPL *session,
    WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
    int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, int))
{
	WT_DECL_RET;

	/*
	 * First, call the standard cursor remove method to do a full search and
	 * re-position the cursor because we don't have a saved copy of the
	 * page's write generation information, which we need to remove records.
	 * Once that's done, we can delete records without a full search, unless
	 * we encounter a restart error because the page was modified by some
	 * other thread of control; in that case, repeat the full search to
	 * refresh the page's modification information.
	 *
	 * If this is a row-store, we delete leaf pages having no overflow items
	 * without reading them; for that to work, we have to ensure we read the
	 * page referenced by the ending cursor, since we may be deleting only a
	 * partial page at the end of the truncation.  Our caller already fully
	 * instantiated the end cursor, so we know that page is pinned in memory
	 * and we can proceed without concern.
	 */
retry:	WT_RET(__wt_btcur_remove(start));

	/*
	 * Reset ret each time through so that we don't loop forever in
	 * the cursor equals case.
	 */
	for (ret = 0;;) {
		if (stop != NULL && __cursor_equals(start, stop))
			break;
		if ((ret = __wt_btcur_next(start, 1)) != 0)
			break;
		start->compare = 0;	/* Exact match */
		if ((ret = rmfunc(session, start, 1)) != 0)
			break;
	}

	if (ret == WT_RESTART) {
		/* The page changed under us: count it and search again. */
		WT_STAT_FAST_CONN_INCR(session, cursor_restart);
		WT_STAT_FAST_DATA_INCR(session, cursor_restart);
		goto retry;
	}

	WT_RET_NOTFOUND_OK(ret);
	return (0);
}
/*
 * __curbulk_insert_fix --
 *	Fixed-length column-store bulk cursor insert.
 *
 *	Keys must be inserted in sorted order; gaps in the record number
 *	space are filled with deleted records.
 */
static int
__curbulk_insert_fix(WT_CURSOR *cursor)
{
	WT_BTREE *btree;
	WT_CURSOR_BULK *cbulk;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	uint64_t recno;

	cbulk = (WT_CURSOR_BULK *)cursor;
	btree = cbulk->cbt.btree;

	/*
	 * Bulk cursor inserts are updates, but don't need auto-commit
	 * transactions because they are single-threaded and not visible
	 * until the bulk cursor is closed.
	 */
	CURSOR_API_CALL(cursor, session, insert, btree);

	WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);

	/*
	 * If the "append" flag was configured, the application doesn't have to
	 * supply a key, else require a key.
	 */
	if (F_ISSET(cursor, WT_CURSTD_APPEND))
		recno = cbulk->recno + 1;
	else {
		WT_CURSOR_CHECKKEY(cursor);
		/* Out-of-order keys are an application error. */
		if ((recno = cursor->recno) <= cbulk->recno)
			WT_ERR(__bulk_col_keycmp_err(cbulk));
	}
	WT_CURSOR_CHECKVALUE(cursor);

	/*
	 * Insert any skipped records as deleted records, update the current
	 * record count.
	 */
	for (; recno != cbulk->recno + 1; ++cbulk->recno)
		WT_ERR(__wt_bulk_insert_fix(session, cbulk, true));
	cbulk->recno = recno;

	/* Insert the current record. */
	ret = __wt_bulk_insert_fix(session, cbulk, false);

err:	API_END_RET(session, ret);
}
/*
 * __curbulk_insert_row --
 *	Row-store bulk cursor insert, with key-sort checks.
 *
 *	Each key must compare strictly greater than the previous one.
 */
static int
__curbulk_insert_row(WT_CURSOR *cursor)
{
	WT_BTREE *btree;
	WT_CURSOR_BULK *cbulk;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	int cmp;

	cbulk = (WT_CURSOR_BULK *)cursor;
	btree = cbulk->cbt.btree;

	/*
	 * Bulk cursor inserts are updates, but don't need auto-commit
	 * transactions because they are single-threaded and not visible
	 * until the bulk cursor is closed.
	 */
	CURSOR_API_CALL(cursor, session, insert, btree);

	WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);

	WT_CURSOR_CHECKKEY(cursor);
	WT_CURSOR_CHECKVALUE(cursor);

	/*
	 * If this isn't the first key inserted, compare it against the last key
	 * to ensure the application doesn't accidentally corrupt the table.
	 */
	if (!cbulk->first_insert) {
		WT_ERR(__wt_compare(session,
		    btree->collator, &cursor->key, &cbulk->last, &cmp));
		if (cmp <= 0)
			WT_ERR(__bulk_row_keycmp_err(cbulk));
	} else
		cbulk->first_insert = false;

	/* Save a copy of the key for the next comparison. */
	WT_ERR(__wt_buf_set(session,
	    &cbulk->last, cursor->key.data, cursor->key.size));

	ret = __wt_bulk_insert_row(session, cbulk);

err:	API_END_RET(session, ret);
}
/*
 * __truncate_dsrc --
 *	WT_SESSION::truncate for a data-source without a truncate operation.
 *
 *	Fallback path: iterate the object with a cursor and remove every
 *	entry one at a time.
 */
static int
__truncate_dsrc(WT_SESSION_IMPL *session, const char *uri)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;
	const char *cfg[2];

	/* Open a cursor and traverse the object, removing every entry. */
	cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
	cfg[1] = NULL;
	WT_RET(__wt_open_cursor(session, uri, NULL, cfg, &cursor));
	while ((ret = cursor->next(cursor)) == 0)
		WT_ERR(cursor->remove(cursor));
	/* Iterating off the end returns WT_NOTFOUND; that's success. */
	WT_ERR_NOTFOUND_OK(ret);

	WT_STAT_FAST_DATA_INCR(session, cursor_truncate);

err:	WT_TRET(cursor->close(cursor));
	return (ret);
}
/*
 * __wt_btcur_search --
 *	Search for a matching record in the tree.
 *
 *	Returns WT_NOTFOUND if no visible, exactly-matching record exists.
 */
int
__wt_btcur_search(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_FAST_CONN_INCR(session, cursor_search);
	WT_STAT_FAST_DATA_INCR(session, cursor_search);

	/* Row-store keys are application-supplied: check their size. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_func_init(cbt, 1));

	WT_ERR(btree->type == BTREE_ROW ?
	    __cursor_row_search(session, cbt, 0) :
	    __cursor_col_search(session, cbt));
	if (cbt->compare == 0 && __cursor_valid(cbt, &upd))
		ret = __wt_kv_return(session, cbt, upd);
	else if (__cursor_fix_implicit(btree, cbt)) {
		/*
		 * Creating a record past the end of the tree in a fixed-length
		 * column-store implicitly fills the gap with empty records.
		 */
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
	} else
		ret = WT_NOTFOUND;

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __curds_reset --
 *	WT_CURSOR.reset method for the data-source cursor type.
 *
 *	Resets the underlying data-source cursor and clears the key/value
 *	set flags on this cursor.
 */
static int
__curds_reset(WT_CURSOR *cursor)
{
	WT_CURSOR *source;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;

	CURSOR_API_CALL(cursor, session, reset, NULL);

	WT_STAT_FAST_CONN_INCR(session, cursor_reset);
	WT_STAT_FAST_DATA_INCR(session, cursor_reset);

	WT_ERR(source->reset(source));

	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);

err:	API_END_RET(session, ret);
}
/*
 * __curds_prev --
 *	WT_CURSOR.prev method for the data-source cursor type.
 *
 *	Moves the underlying data-source cursor to the previous record.
 */
static int
__curds_prev(WT_CURSOR *cursor)
{
	WT_CURSOR *source;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;

	CURSOR_API_CALL(cursor, session, prev, NULL);

	WT_STAT_FAST_CONN_INCR(session, cursor_prev);
	WT_STAT_FAST_DATA_INCR(session, cursor_prev);

	WT_ERR(__curds_txn_enter(session));

	/* Any previously-set key/value is no longer positioned. */
	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
	ret = __curds_cursor_resolve(cursor, source->prev(source));

err:	__curds_txn_leave(session);

	API_END_RET(session, ret);
}
/*
 * __wt_btcur_update_check --
 *	Check whether an update would conflict.
 *
 *	This can be used to replace WT_CURSOR::insert or WT_CURSOR::update, so
 *	they only check for conflicts without updating the tree.  It is used to
 *	maintain snapshot isolation for transactions that span multiple chunks
 *	in an LSM tree.
 */
int
__wt_btcur_update_check(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cursor = &cbt->iface;
	btree = cbt->btree;
	session = (WT_SESSION_IMPL *)cursor->session;

retry:	WT_RET(__cursor_func_init(cbt, 1));

	switch (btree->type) {
	case BTREE_ROW:
		/*
		 * NOTE(review): this call passes four arguments while other
		 * functions in this file call __cursor_row_search with three;
		 * the two call sites appear to come from different versions
		 * of the code — confirm against the current prototype.
		 */
		WT_ERR(__cursor_row_search(session, cbt, NULL, 1));

		/*
		 * Just check for conflicts.
		 */
		ret = __curfile_update_check(cbt);
		break;
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
	WT_ILLEGAL_VALUE_ERR(session);
	}

err:	if (ret == WT_RESTART) {
		WT_STAT_FAST_CONN_INCR(session, cursor_restart);
		WT_STAT_FAST_DATA_INCR(session, cursor_restart);
		goto retry;
	}
	WT_TRET(__curfile_leave(cbt));
	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __curds_search --
 *	WT_CURSOR.search method for the data-source cursor type.
 *
 *	Forwards the search to the underlying data-source cursor after
 *	setting the key.
 */
static int
__curds_search(WT_CURSOR *cursor)
{
	WT_CURSOR *source;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;

	CURSOR_API_CALL(cursor, session, search, NULL);

	WT_STAT_FAST_CONN_INCR(session, cursor_search);
	WT_STAT_FAST_DATA_INCR(session, cursor_search);

	WT_ERR(__curds_txn_enter(session));

	WT_ERR(__curds_key_set(cursor));
	ret = __curds_cursor_resolve(cursor, source->search(source));

err:	__curds_txn_leave(session);

	API_END_RET(session, ret);
}
/*
 * __truncate_table --
 *	WT_SESSION::truncate for a table.
 *
 *	Truncates every column group and every index backing the table.
 */
static int
__truncate_table(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
{
	WT_DECL_RET;
	WT_TABLE *table;
	u_int i;

	WT_RET(__wt_schema_get_table(session, uri, strlen(uri), false, &table));
	WT_STAT_FAST_DATA_INCR(session, cursor_truncate);

	/* Truncate the column groups. */
	for (i = 0; i < WT_COLGROUPS(table); i++)
		WT_ERR(__wt_schema_truncate(
		    session, table->cgroups[i]->source, cfg));

	/* Truncate the indices. */
	WT_ERR(__wt_schema_open_indices(session, table));
	for (i = 0; i < table->nindices; i++)
		WT_ERR(__wt_schema_truncate(
		    session, table->indices[i]->source, cfg));

err:	__wt_schema_release_table(session, table);
	return (ret);
}
/*
 * __wt_curfile_create --
 *	Open a cursor for a given btree handle.
 *
 *	"bulk" selects the larger bulk-cursor structure; "bitmap" selects
 *	bitmap semantics for fixed-length column-store bulk loads.
 */
int
__wt_curfile_create(WT_SESSION_IMPL *session,
    WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap,
    WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,	/* get-key */
	    __wt_cursor_get_value,	/* get-value */
	    __wt_cursor_set_key,	/* set-key */
	    __wt_cursor_set_value,	/* set-value */
	    __curfile_compare,		/* compare */
	    __curfile_equals,		/* equals */
	    __curfile_next,		/* next */
	    __curfile_prev,		/* prev */
	    __curfile_reset,		/* reset */
	    __curfile_search,		/* search */
	    __curfile_search_near,	/* search-near */
	    __curfile_insert,		/* insert */
	    __curfile_update,		/* update */
	    __curfile_remove,		/* remove */
	    __wt_cursor_reconfigure,	/* reconfigure */
	    __curfile_close);		/* close */
	WT_BTREE *btree;
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE *cbt;
	WT_CURSOR_BULK *cbulk;
	WT_DECL_RET;
	size_t csize;

	/* The cursor interface must be the structure's first field. */
	WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);

	cbt = NULL;

	btree = S2BT(session);
	WT_ASSERT(session, btree != NULL);

	csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
	WT_RET(__wt_calloc(session, 1, csize, &cbt));

	cursor = &cbt->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->internal_uri = btree->dhandle->name;
	cursor->key_format = btree->key_format;
	cursor->value_format = btree->value_format;

	cbt->btree = btree;
	if (bulk) {
		F_SET(cursor, WT_CURSTD_BULK);

		cbulk = (WT_CURSOR_BULK *)cbt;

		/* Optionally skip the validation of each bulk-loaded key. */
		WT_ERR(__wt_config_gets_def(
		    session, cfg, "skip_sort_check", 0, &cval));
		WT_ERR(__wt_curbulk_init(
		    session, cbulk, bitmap, cval.val == 0 ? 0 : 1));
	}

	/*
	 * random_retrieval
	 *	Random retrieval cursors only support next, reset and close.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
	if (cval.val != 0) {
		__wt_cursor_set_notsup(cursor);
		cursor->next = __curfile_next_random;
		cursor->reset = __curfile_reset;
	}

	/* Underlying btree initialization. */
	__wt_btcur_open(cbt);

	/* __wt_cursor_init is last so we don't have to clean up on error. */
	WT_ERR(__wt_cursor_init(
	    cursor, cursor->internal_uri, owner, cfg, cursorp));

	WT_STAT_FAST_CONN_INCR(session, cursor_create);
	WT_STAT_FAST_DATA_INCR(session, cursor_create);

	if (0) {
err:		__wt_free(session, cbt);
	}

	return (ret);
}
/*
 * __wt_btcur_prev --
 *	Move to the previous record in the tree.
 *
 *	"truncating" is set when the caller is a truncate operation, which
 *	enables truncate-specific tree-walk behavior.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	uint32_t flags;
	bool newpage;

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	WT_STAT_FAST_CONN_INCR(session, cursor_prev);
	WT_STAT_FAST_DATA_INCR(session, cursor_prev);

	flags = WT_READ_PREV | WT_READ_SKIP_INTL;	/* Tree walk flags. */
	if (truncating)
		LF_SET(WT_READ_TRUNCATE);

	WT_RET(__cursor_func_init(cbt, false));

	/*
	 * If we aren't already iterating in the right direction, there's
	 * some setup to do.
	 */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
		__wt_btcur_iterate_setup(cbt);

	/*
	 * Walk any page we're holding until the underlying call returns not-
	 * found.  Then, move to the previous page, until we reach the start
	 * of the file.
	 */
	for (newpage = false;; newpage = true) {
		page = cbt->ref == NULL ? NULL : cbt->ref->page;
		WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page));

		/*
		 * The last page in a column-store has appended entries.
		 * We handle it separately from the usual cursor code:
		 * it's only that one page and it's in a simple format.
		 */
		if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF &&
		    (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
			F_SET(cbt, WT_CBT_ITERATE_APPEND);

		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret == 0)
				break;
			/* Append list exhausted: fall back to the page. */
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;
			newpage = true;
		}
		if (page != NULL) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_prev(cbt, newpage);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret != WT_NOTFOUND)
				break;
		}

		/*
		 * If we saw a lot of deleted records on this page, or we went
		 * all the way through a page and only saw deleted records, try
		 * to evict the page when we release it.  Otherwise repeatedly
		 * deleting from the beginning of a tree can have quadratic
		 * performance.  Take care not to force eviction of pages that
		 * are genuinely empty, in new trees.
		 */
		if (page != NULL &&
		    (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD ||
		    (newpage && cbt->page_deleted_count > 0)))
			__wt_page_evict_soon(page);
		cbt->page_deleted_count = 0;

		WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
		WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
	}

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 *
 *	Handles optional decompression and, for verify handles, physical
 *	page verification; on success "buf" references the page image.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	const WT_PAGE_HEADER *dsk;
	size_t result_len;

	btree = S2BT(session);
	bm = btree->bm;

	/*
	 * If anticipating a compressed block, read into a scratch buffer and
	 * decompress into the caller's buffer.  Else, read directly into the
	 * caller's buffer.
	 */
	if (btree->compressor == NULL) {
		WT_RET(bm->read(bm, session, buf, addr, addr_size));
		dsk = buf->data;
	} else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
		dsk = tmp->data;
	}

	/*
	 * If the block is compressed, copy the skipped bytes of the original
	 * image into place, then decompress.
	 */
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
		if (btree->compressor == NULL ||
		    btree->compressor->decompress == NULL)
			WT_ERR_MSG(session, WT_ERROR,
			    "read compressed block where no compression engine "
			    "configured");

		/*
		 * We're allocating the exact number of bytes we're expecting
		 * from decompression.
		 */
		WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size));

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 */
		memcpy(buf->mem, tmp->data, WT_BLOCK_COMPRESS_SKIP);
		ret = btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len);

		/*
		 * If checksums were turned off because we're depending on the
		 * decompression to fail on any corrupted data, we'll end up
		 * here after corruption happens.  If we're salvaging the file,
		 * it's OK, otherwise it's really, really bad.
		 */
		if (ret != 0 ||
		    result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
			WT_ERR(
			    F_ISSET(btree, WT_BTREE_VERIFY) ||
			    F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
			    WT_ERROR :
			    __wt_illegal_value(session, btree->dhandle->name));
	} else if (btree->compressor == NULL)
		buf->size = dsk->mem_size;
	else
		/*
		 * We guessed wrong: there was a compressor, but this
		 * block was not compressed, and now the page is in the
		 * wrong buffer and the buffer may be of the wrong size.
		 * This should be rare, but happens with small blocks
		 * that aren't worth compressing.
		 */
		WT_ERR(__wt_buf_set(
		    session, buf, tmp->data, dsk->mem_size));

	/* If the handle is a verify handle, verify the physical page. */
	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
		if (tmp == NULL)
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
		WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
	}

	WT_STAT_FAST_CONN_INCR(session, cache_read);
	WT_STAT_FAST_DATA_INCR(session, cache_read);
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
		WT_STAT_FAST_DATA_INCR(session, compress_read);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size);

err:	__wt_scr_free(session, &tmp);
	return (ret);
}
/*
 * __wt_bt_write --
 *	Write a buffer into a block, returning the block's addr/size and
 * checksum.
 *
 *	"checkpoint" selects the block manager's checkpoint path (no
 *	addr/size returned); "compressed" means the caller already
 *	compressed the image.
 */
int
__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
    uint8_t *addr, size_t *addr_sizep, bool checkpoint, bool compressed)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_ITEM *ip;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE_HEADER *dsk;
	size_t dst_len, len, result_len, size, src_len;
	int compression_failed;	/* Extension API, so not a bool. */
	uint8_t *dst, *src;
	bool data_cksum;

	btree = S2BT(session);
	bm = btree->bm;

	/* Checkpoint calls are different than standard calls. */
	WT_ASSERT(session,
	    (!checkpoint && addr != NULL && addr_sizep != NULL) ||
	    (checkpoint && addr == NULL && addr_sizep == NULL));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * We're passed a table's disk image.  Decompress if necessary and
	 * verify the image.  Always check the in-memory length for accuracy.
	 */
	dsk = buf->mem;
	if (compressed) {
		WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &tmp));

		memcpy(tmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP,
		    buf->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
		    tmp->memsize - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));
		WT_ASSERT(session,
		    dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP);
		tmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
		ip = tmp;
	} else {
		WT_ASSERT(session, dsk->mem_size == buf->size);
		ip = buf;
	}
	WT_ERR(__wt_verify_dsk(session, "[write-check]", ip));
	__wt_scr_free(session, &tmp);
#endif

	/*
	 * Optionally stream-compress the data, but don't compress blocks that
	 * are already as small as they're going to get.
	 */
	if (btree->compressor == NULL ||
	    btree->compressor->compress == NULL || compressed)
		ip = buf;
	else if (buf->size <= btree->allocsize) {
		ip = buf;
		WT_STAT_FAST_DATA_INCR(session, compress_write_too_small);
	} else {
		/* Skip the header bytes of the source data. */
		src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
		src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;

		/*
		 * Compute the size needed for the destination buffer.  We only
		 * allocate enough memory for a copy of the original by default,
		 * if any compressed version is bigger than the original, we
		 * won't use it.  However, some compression engines (snappy is
		 * one example), may need more memory because they don't stop
		 * just because there's no more memory into which to compress.
		 */
		if (btree->compressor->pre_size == NULL)
			len = src_len;
		else
			WT_ERR(btree->compressor->pre_size(btree->compressor,
			    &session->iface, src, src_len, &len));

		size = len + WT_BLOCK_COMPRESS_SKIP;
		WT_ERR(bm->write_size(bm, session, &size));
		WT_ERR(__wt_scr_alloc(session, size, &tmp));

		/* Skip the header bytes of the destination data. */
		dst = (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP;
		dst_len = len;

		compression_failed = 0;
		WT_ERR(btree->compressor->compress(btree->compressor,
		    &session->iface,
		    src, src_len,
		    dst, dst_len,
		    &result_len, &compression_failed));
		result_len += WT_BLOCK_COMPRESS_SKIP;

		/*
		 * If compression fails, or doesn't gain us at least one unit of
		 * allocation, fallback to the original version.  This isn't
		 * unexpected: if compression doesn't work for some chunk of
		 * data for some reason (noting likely additional format/header
		 * information which compressed output requires), it just means
		 * the uncompressed version is as good as it gets, and that's
		 * what we use.
		 */
		if (compression_failed ||
		    buf->size / btree->allocsize <=
		    result_len / btree->allocsize) {
			ip = buf;
			WT_STAT_FAST_DATA_INCR(session, compress_write_fail);
		} else {
			compressed = true;
			WT_STAT_FAST_DATA_INCR(session, compress_write);

			/*
			 * Copy in the skipped header bytes, set the final data
			 * size.
			 */
			memcpy(tmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
			tmp->size = result_len;
			ip = tmp;
		}
	}
	dsk = ip->mem;

	/* If the buffer is compressed, set the flag. */
	if (compressed)
		F_SET(dsk, WT_PAGE_COMPRESSED);

	/*
	 * We increment the block's write generation so it's easy to identify
	 * newer versions of blocks during salvage.  (It's common in WiredTiger,
	 * at least for the default block manager, for multiple blocks to be
	 * internally consistent with identical first and last keys, so we need
	 * a way to know the most recent state of the block.  We could check
	 * which leaf is referenced by a valid internal page, but that implies
	 * salvaging internal pages, which I don't want to do, and it's not
	 * as good anyway, because the internal page may not have been written
	 * after the leaf page was updated.  So, write generations it is.
	 *
	 * Nothing is locked at this point but two versions of a page with the
	 * same generation is pretty unlikely, and if we did, they're going to
	 * be roughly identical for the purposes of salvage, anyway.
	 */
	dsk->write_gen = ++btree->write_gen;

	/*
	 * Checksum the data if the buffer isn't compressed or checksums are
	 * configured.
	 */
	switch (btree->checksum) {
	case CKSUM_ON:
		data_cksum = true;
		break;
	case CKSUM_OFF:
		data_cksum = false;
		break;
	case CKSUM_UNCOMPRESSED:
	default:
		data_cksum = !compressed;
		break;
	}

	/* Call the block manager to write the block. */
	WT_ERR(checkpoint ?
	    bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) :
	    bm->write(bm, session, ip, addr, addr_sizep, data_cksum));

	WT_STAT_FAST_CONN_INCR(session, cache_write);
	WT_STAT_FAST_DATA_INCR(session, cache_write);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, dsk->mem_size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, dsk->mem_size);

err:	__wt_scr_free(session, &tmp);
	return (ret);
}
/*
 * __wt_btcur_update --
 *	Update a record in the tree.
 *
 * Takes the key/value pair currently set in the cursor interface and
 * updates the matching record.  Returns 0 on success, WT_NOTFOUND if the
 * key doesn't exist and overwrite isn't configured, or an error code.
 */
int
__wt_btcur_update(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	/* Statistics are bumped before any validation or search. */
	WT_STAT_FAST_CONN_INCR(session, cursor_update);
	WT_STAT_FAST_DATA_INCR(session, cursor_update);
	WT_STAT_FAST_DATA_INCRV(
	    session, cursor_update_bytes, cursor->value.size);

	/* Column-store keys are record numbers; only check row-store keys. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

	/*
	 * The tree is no longer empty: eviction should pay attention to it,
	 * and it's no longer possible to bulk-load into it.
	 */
	if (btree->bulk_load_ok) {
		btree->bulk_load_ok = 0;
		__wt_btree_evictable(session, 1);
	}

retry:	WT_RET(__cursor_func_init(cbt, 1));

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		WT_ERR(__cursor_col_search(session, cbt));

		/*
		 * If not overwriting, fail if the key doesn't exist.  Update
		 * the record if it exists.  Creating a record past the end of
		 * the tree in a fixed-length column-store implicitly fills the
		 * gap with empty records.  Update the record in that case, the
		 * record exists.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) &&
		    !__cursor_fix_implicit(btree, cbt))
			WT_ERR(WT_NOTFOUND);
		ret = __cursor_col_modify(session, cbt, 0);
		break;
	case BTREE_ROW:
		WT_ERR(__cursor_row_search(session, cbt, 1));
		/*
		 * If not overwriting, fail if the key does not exist.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    (cbt->compare != 0 || !__cursor_valid(cbt, NULL)))
			WT_ERR(WT_NOTFOUND);
		ret = __cursor_row_modify(session, cbt, 0);
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

	/*
	 * WT_RESTART means the modify raced with some other operation and the
	 * whole search/modify must be redone from the top (NOTE(review):
	 * inferred from the retry loop; confirm against WT_RESTART's
	 * definition).
	 */
err:	if (ret == WT_RESTART)
		goto retry;

	/*
	 * If successful, point the cursor at internal copies of the data.  We
	 * could shuffle memory in the cursor so the key/value pair are in local
	 * buffer memory, but that's a data copy.  We don't want to do another
	 * search (and we might get a different update structure if we race).
	 * To make this work, we add a field to the btree cursor to pass back a
	 * pointer to the modify function's allocated update structure.
	 */
	if (ret == 0)
		WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update));

	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __wt_btcur_remove --
 *	Remove a record from the tree.
 *
 * Removes the record matching the cursor's current key.  Returns 0 on
 * success (including a not-found result when the cursor is configured
 * for overwrite), WT_NOTFOUND if the key doesn't exist otherwise, or an
 * error code.
 */
int
__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_FAST_CONN_INCR(session, cursor_remove);
	WT_STAT_FAST_DATA_INCR(session, cursor_remove);
	WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);

	/* Column-store keys are record numbers; only check row-store keys. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

retry:	WT_RET(__cursor_func_init(cbt, 1));

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		WT_ERR(__cursor_col_search(session, cbt));

		/* Remove the record if it exists. */
		if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) {
			if (!__cursor_fix_implicit(btree, cbt))
				WT_ERR(WT_NOTFOUND);
			/*
			 * Creating a record past the end of the tree in a
			 * fixed-length column-store implicitly fills the
			 * gap with empty records.  Return success in that
			 * case, the record was deleted successfully.
			 *
			 * Correct the btree cursor's location: the search
			 * will have pointed us at the previous/next item,
			 * and that's not correct.
			 */
			cbt->recno = cursor->recno;
		} else
			ret = __cursor_col_modify(session, cbt, 1);
		break;
	case BTREE_ROW:
		/* Remove the record if it exists. */
		WT_ERR(__cursor_row_search(session, cbt, 0));
		if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
			WT_ERR(WT_NOTFOUND);
		ret = __cursor_row_modify(session, cbt, 1);
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

	/*
	 * WT_RESTART means the modify raced with some other operation and the
	 * whole search/modify must be redone from the top (NOTE(review):
	 * inferred from the retry loop; confirm against WT_RESTART's
	 * definition).
	 */
err:	if (ret == WT_RESTART)
		goto retry;
	/*
	 * If the cursor is configured to overwrite and the record is not
	 * found, that is exactly what we want.
	 */
	if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND)
		ret = 0;

	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));

	return (ret);
}
/*
 * __wt_btcur_insert --
 *	Insert a record into the tree.
 *
 * Inserts the key/value pair currently set in the cursor interface.
 * Returns 0 on success, WT_DUPLICATE_KEY if the key already exists and
 * overwrite isn't configured, or an error code.  On success the cursor
 * does not retain a position.
 */
int
__wt_btcur_insert(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_STAT_FAST_CONN_INCR(session, cursor_insert);
	WT_STAT_FAST_DATA_INCR(session, cursor_insert);
	WT_STAT_FAST_DATA_INCRV(session,
	    cursor_insert_bytes, cursor->key.size + cursor->value.size);

	/* Column-store keys are record numbers; only check row-store keys. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

	/*
	 * The tree is no longer empty: eviction should pay attention to it,
	 * and it's no longer possible to bulk-load into it.
	 */
	if (btree->bulk_load_ok) {
		btree->bulk_load_ok = 0;
		__wt_btree_evictable(session, 1);
	}

retry:	WT_RET(__cursor_func_init(cbt, 1));

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		/*
		 * If WT_CURSTD_APPEND is set, insert a new record (ignoring
		 * the application's record number).  First we search for the
		 * maximum possible record number so the search ends on the
		 * last page.  The real record number is assigned by the
		 * serialized append operation.
		 */
		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cbt->iface.recno = UINT64_MAX;

		WT_ERR(__cursor_col_search(session, cbt));

		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cbt->iface.recno = 0;

		/*
		 * If not overwriting, fail if the key exists.  Creating a
		 * record past the end of the tree in a fixed-length
		 * column-store implicitly fills the gap with empty records.
		 * Fail in that case, the record exists.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) ||
		    (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt))))
			WT_ERR(WT_DUPLICATE_KEY);

		WT_ERR(__cursor_col_modify(session, cbt, 0));

		/* Report the record number actually assigned by the append. */
		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cbt->iface.recno = cbt->recno;
		break;
	case BTREE_ROW:
		WT_ERR(__cursor_row_search(session, cbt, 1));
		/*
		 * If not overwriting, fail if the key exists, else insert the
		 * key/value pair.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    cbt->compare == 0 && __cursor_valid(cbt, NULL))
			WT_ERR(WT_DUPLICATE_KEY);

		ret = __cursor_row_modify(session, cbt, 0);
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

	/*
	 * WT_RESTART means the modify raced with some other operation and the
	 * whole search/modify must be redone from the top (NOTE(review):
	 * inferred from the retry loop; confirm against WT_RESTART's
	 * definition).
	 */
err:	if (ret == WT_RESTART)
		goto retry;
	/* Insert doesn't maintain a position across calls, clear resources. */
	if (ret == 0)
		WT_TRET(__curfile_leave(cbt));
	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __wt_btcur_search_near --
 *	Search for a record in the tree.
 *
 * Positions the cursor on the closest match to the search key.  On
 * success (or WT_NOTFOUND), if exactp is non-NULL it is set to the
 * comparison result: 0 for an exact match, > 0 if the cursor is on a
 * larger key, < 0 if on a smaller key.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	int exact;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	exact = 0;

	WT_STAT_FAST_CONN_INCR(session, cursor_search_near);
	WT_STAT_FAST_DATA_INCR(session, cursor_search_near);

	/* Column-store keys are record numbers; only check row-store keys. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

	WT_RET(__cursor_func_init(cbt, 1));

	/*
	 * Set the "insert" flag for the btree row-store search; we may intend
	 * to position our cursor at the end of the tree, rather than match an
	 * existing record.
	 */
	WT_ERR(btree->type == BTREE_ROW ?
	    __cursor_row_search(session, cbt, 1) :
	    __cursor_col_search(session, cbt));

	/*
	 * If we find an valid key, return it.
	 *
	 * Else, creating a record past the end of the tree in a fixed-length
	 * column-store implicitly fills the gap with empty records.  In this
	 * case, we instantiate the empty record, it's an exact match.
	 *
	 * Else, move to the next key in the tree (bias for prefix searches).
	 * Cursor next skips invalid rows, so we don't have to test for them
	 * again.
	 *
	 * Else, redo the search and move to the previous key in the tree.
	 * Cursor previous skips invalid rows, so we don't have to test for
	 * them again.
	 *
	 * If that fails, quit, there's no record to return.
	 */
	if (__cursor_valid(cbt, &upd)) {
		exact = cbt->compare;
		ret = __wt_kv_return(session, cbt, upd);
	} else if (__cursor_fix_implicit(btree, cbt)) {
		/* Instantiate the implicit empty record in the cursor. */
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
		exact = 0;
	} else if ((ret = __wt_btcur_next(cbt, 0)) != WT_NOTFOUND)
		exact = 1;
	else {
		/* Nothing larger: search again and walk backward instead. */
		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, 1) :
		    __cursor_col_search(session, cbt));
		if (__cursor_valid(cbt, &upd)) {
			exact = cbt->compare;
			ret = __wt_kv_return(session, cbt, upd);
		} else if ((ret = __wt_btcur_prev(cbt, 0)) != WT_NOTFOUND)
			exact = -1;
	}

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	/* Only report an exact value for successful/not-found results. */
	if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
		*exactp = exact;
	return (ret);
}
/*
 * __wt_ovfl_cache --
 *	Handle deletion of an overflow value.
 *
 * Caches the overflow value in memory when a still-running reader might
 * need it, then queues the on-page cell and the underlying blocks for
 * cleanup when reconciliation completes.  Returns 0 on success or an
 * error code.
 */
int
__wt_ovfl_cache(WT_SESSION_IMPL *session,
    WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack)
{
	int visible;

	/*
	 * This function solves a problem in reconciliation. The scenario is:
	 *     - reconciling a leaf page that references an overflow item
	 *     - the item is updated and the update committed
	 *     - a checkpoint runs, freeing the backing overflow blocks
	 *     - a snapshot transaction wants the original version of the item
	 *
	 * In summary, we may need the original version of an overflow item for
	 * a snapshot transaction after the item was deleted from a page that's
	 * subsequently been checkpointed, where the checkpoint must know about
	 * the freed blocks.  We don't have any way to delay a free of the
	 * underlying blocks until a particular set of transactions exit (and
	 * this shouldn't be a common scenario), so cache the overflow value in
	 * memory.
	 *
	 * This gets hard because the snapshot transaction reader might:
	 *     - search the WT_UPDATE list and not find an useful entry
	 *     - read the overflow value's address from the on-page cell
	 *     - go to sleep
	 *     - checkpoint runs, caches the overflow value, frees the blocks
	 *     - another thread allocates and overwrites the blocks
	 *     - the reader wakes up and reads the wrong value
	 *
	 * Use a read/write lock and the on-page cell to fix the problem: hold
	 * a write lock when changing the cell type from WT_CELL_VALUE_OVFL to
	 * WT_CELL_VALUE_OVFL_RM and hold a read lock when reading an overflow
	 * item.
	 *
	 * The read/write lock is per btree, but it could be per page or even
	 * per overflow item.  We don't do any of that because overflow values
	 * are supposed to be rare and we shouldn't see contention for the lock.
	 *
	 * Check for a globally visible update.  If there is a globally visible
	 * update, we don't need to cache the item because it's not possible for
	 * a running thread to have moved past it.
	 */
	switch (page->type) {
	case WT_PAGE_COL_VAR:
		visible = __ovfl_cache_col_visible(session, cookie, vpack);
		break;
	case WT_PAGE_ROW_LEAF:
		visible = __ovfl_cache_row_visible(session, page, cookie);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/*
	 * If there's no globally visible update, there's a reader in the system
	 * that might try and read the old value, cache it.
	 */
	if (!visible) {
		WT_RET(__ovfl_cache(session, page, vpack));
		WT_STAT_FAST_DATA_INCR(session, cache_overflow_value);
	}

	/*
	 * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the
	 * underlying overflow value's blocks to be freed when reconciliation
	 * completes.
	 */
	return (__wt_ovfl_discard_add(session, page, vpack->cell));
}