/*
 * __wt_btcur_compare --
 *	Return a comparison between two cursors.
 */
int
__wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp)
{
	WT_BTREE *btree;
	WT_CURSOR *ac, *bc;
	WT_SESSION_IMPL *session;

	ac = (WT_CURSOR *)a_arg;
	bc = (WT_CURSOR *)b_arg;
	btree = a_arg->btree;
	session = (WT_SESSION_IMPL *)ac->session;

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		/*
		 * Compare the interface cursors' record numbers, not the
		 * underlying cursor references: the interface cursor is the
		 * one being returned to the application.
		 */
		*cmpp = (ac->recno < bc->recno) ?
		    -1 : (ac->recno > bc->recno ? 1 : 0);
		break;
	case BTREE_ROW:
		/* Row stores defer to the tree's (possibly custom) collator. */
		WT_RET(__wt_compare(
		    session, btree->collator, &ac->key, &bc->key, cmpp));
		break;
	WT_ILLEGAL_VALUE(session);
	}
	return (0);
}
/*
 * __stat_page --
 *	Stat any Btree page.
 */
static int
__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
	/*
	 * All internal pages and overflow pages are trivial, all we track is
	 * a count of the page type.
	 */
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		WT_STAT_INCR(session, stats, btree_column_fix);
		/* Fixed-length pages: every physical slot holds an entry. */
		WT_STAT_INCRV(
		    session, stats, btree_entries, page->pg_fix_entries);
		break;
	case WT_PAGE_COL_INT:
		WT_STAT_INCR(session, stats, btree_column_internal);
		break;
	case WT_PAGE_COL_VAR:
		/* Variable-length column-store pages need a cell walk. */
		__stat_page_col_var(session, page, stats);
		break;
	case WT_PAGE_ROW_INT:
		__stat_page_row_int(session, page, stats);
		break;
	case WT_PAGE_ROW_LEAF:
		__stat_page_row_leaf(session, page, stats);
		break;
	WT_ILLEGAL_VALUE(session);
	}
	return (0);
}
/*
 * wiredtiger_pack_str --
 *	Pack a string.
 */
int
wiredtiger_pack_str(WT_PACK_STREAM *ps, const char *s)
{
	WT_DECL_PACK_VALUE(pv);
	WT_SESSION_IMPL *session;

	session = ps->pack.session;

	/* Lower-level packing routines treat a length of zero as unchecked. */
	if (ps->p >= ps->end)
		return (ENOMEM);

	/* Advance to the next format field; it must be a string type. */
	WT_RET(__pack_next(&ps->pack, &pv));
	switch (pv.type) {
	case 'S':
	case 's':
		pv.u.s = s;
		/* Write into the remaining space of the stream's buffer. */
		WT_RET(__pack_write(
		    session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
/*
 * wiredtiger_pack_item --
 *	Pack an item.
 */
int
wiredtiger_pack_item(WT_PACK_STREAM *ps, WT_ITEM *item)
{
	WT_DECL_PACK_VALUE(pv);
	WT_SESSION_IMPL *session;

	session = ps->pack.session;

	/* Lower-level packing routines treat a length of zero as unchecked. */
	if (ps->p >= ps->end)
		return (ENOMEM);

	/* Advance to the next format field; it must be a raw-item type. */
	WT_RET(__pack_next(&ps->pack, &pv));
	switch (pv.type) {
	case 'U':
	case 'u':
		pv.u.item.data = item->data;
		pv.u.item.size = item->size;
		/* Write into the remaining space of the stream's buffer. */
		WT_RET(__pack_write(
		    session, &pv, &ps->p, (size_t)(ps->end - ps->p)));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
/*
 * wiredtiger_unpack_uint --
 *	Unpack an unsigned integer.
 */
int
wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up)
{
	WT_DECL_PACK_VALUE(pv);
	WT_SESSION_IMPL *session;

	session = ps->pack.session;

	/* Lower-level packing routines treat a length of zero as unchecked. */
	if (ps->p >= ps->end)
		return (ENOMEM);

	/* Advance to the next format field; any unsigned type is accepted. */
	WT_RET(__pack_next(&ps->pack, &pv));
	switch (pv.type) {
	case 'B':
	case 'H':
	case 'I':
	case 'L':
	case 'Q':
	case 'R':
	case 'r':
	case 't':
		/* Decode from the stream and return the 64-bit value. */
		WT_RET(__unpack_read(session, &pv,
		    (const uint8_t **)&ps->p, (size_t)(ps->end - ps->p)));
		*up = pv.u.u;
		break;
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
/*
 * __wt_cache_op --
 *	Cache operations.
 */
int
__wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op)
{
	switch (op) {
	case WT_SYNC_CHECKPOINT:
	case WT_SYNC_CLOSE:
		/*
		 * Make sure the checkpoint reference is set for
		 * reconciliation; it's ugly, but drilling a function parameter
		 * path from our callers to the reconciliation of the tree's
		 * root page is going to be worse.
		 */
		WT_ASSERT(session, S2BT(session)->ckpt != NULL);
		break;
	case WT_SYNC_DISCARD:
	case WT_SYNC_WRITE_LEAVES:
		break;
	}

	switch (op) {
	case WT_SYNC_CHECKPOINT:
	case WT_SYNC_WRITE_LEAVES:
		/* Write dirty pages, leaving the tree in memory. */
		return (__sync_file(session, op));
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
		/* Discard the tree from memory. */
		return (__wt_evict_file(session, op));
	WT_ILLEGAL_VALUE(session);
	}
	/* NOTREACHED */
}
/*
 * __wt_debug_disk --
 *	Dump a disk page in debugging mode.
 */
int
__wt_debug_disk(
    WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile)
{
	WT_DBG *ds, _ds;

	ds = &_ds;
	WT_RET(__debug_config(session, ds, ofile));

	/* Dump the page header: type, record number/entries, flags. */
	WT_RET(ds->f(ds, "%s page", __wt_page_type_string(dsk->type)));
	switch (dsk->type) {
	case WT_PAGE_BLOCK_MANAGER:
		break;
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		/* Column-store pages also carry a starting record number. */
		WT_RET(ds->f(ds, ", recno %" PRIu64, dsk->recno));
		/* FALLTHROUGH */
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		WT_RET(ds->f(ds, ", entries %" PRIu32, dsk->u.entries));
		break;
	case WT_PAGE_OVFL:
		WT_RET(ds->f(ds, ", datalen %" PRIu32, dsk->u.datalen));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Decode each page flag by name. */
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
		WT_RET(ds->f(ds, ", compressed"));
	if (F_ISSET(dsk, WT_PAGE_ENCRYPTED))
		WT_RET(ds->f(ds, ", encrypted"));
	if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL))
		WT_RET(ds->f(ds, ", empty-all"));
	if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
		WT_RET(ds->f(ds, ", empty-none"));
	if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE))
		WT_RET(ds->f(ds, ", LAS-update"));

	WT_RET(ds->f(ds, ", generation %" PRIu64 "\n", dsk->write_gen));

	/* Dump the page's content. */
	switch (dsk->type) {
	case WT_PAGE_BLOCK_MANAGER:
		break;
	case WT_PAGE_COL_FIX:
		WT_RET(__debug_dsk_col_fix(ds, dsk));
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		WT_RET(__debug_dsk_cell(ds, dsk));
		break;
	default:
		break;
	}

	return (__dmsg_wrapup(ds));
}
/*
 * __wt_btcur_update --
 *	Update a record in the tree.
 */
int
__wt_btcur_update(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	WT_BSTAT_INCR(session, cursor_updates);

	/* Row-store keys and all values must fit on a page. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

	/* Retry from here if the modify races with a page split. */
retry:	__cursor_func_init(cbt, 1);

	switch (btree->type) {
	case BTREE_COL_FIX:
		if (cursor->value.size != 1)
			WT_RET_MSG(session, EINVAL,
			    "item size of %" PRIu32 " does not match "
			    "fixed-length file requirement of 1 byte",
			    cursor->value.size);
		/* FALLTHROUGH */
	case BTREE_COL_VAR:
		WT_ERR(__wt_col_search(session, cbt, 1));

		/*
		 * Update the record if it exists.  Creating a record past the
		 * end of the tree in a fixed-length column-store implicitly
		 * fills the gap with empty records.  Update the record in that
		 * case, the record exists.
		 */
		if ((cbt->compare != 0 || __cursor_invalid(cbt)) &&
		    !__cursor_fix_implicit(btree, cbt))
			ret = WT_NOTFOUND;
		else if ((ret =
		    __wt_col_modify(session, cbt, 3)) == WT_RESTART)
			goto retry;
		break;
	case BTREE_ROW:
		/* Update the record if it exists. */
		WT_ERR(__wt_row_search(session, cbt, 1));
		if (cbt->compare != 0 || __cursor_invalid(cbt))
			ret = WT_NOTFOUND;
		else if ((ret =
		    __wt_row_modify(session, cbt, 0)) == WT_RESTART)
			goto retry;
		break;
	WT_ILLEGAL_VALUE(session);
	}

err:	__cursor_func_resolve(cbt, ret);
	return (ret);
}
/*
 * __rec_page_dirty_update --
 *	Update a dirty page's reference on eviction.
 */
static int
__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_ADDR *addr;
	WT_PAGE_MODIFY *mod;
	WT_REF *parent_ref;

	mod = page->modify;
	parent_ref = page->ref;

	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
	case WT_PM_REC_REPLACE:			/* 1-for-1 page swap */
		/* Free any address previously instantiated off-page. */
		if (parent_ref->addr != NULL &&
		    __wt_off_page(page->parent, parent_ref->addr)) {
			__wt_free(session,
			    ((WT_ADDR *)parent_ref->addr)->addr);
			__wt_free(session, parent_ref->addr);
		}

		/*
		 * Update the parent to reference the replacement page.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));
		/* Take ownership of the replacement address. */
		*addr = mod->u.replace;
		mod->u.replace.addr = NULL;
		mod->u.replace.size = 0;
		parent_ref->page = NULL;
		parent_ref->addr = addr;
		WT_PUBLISH(parent_ref->state, WT_REF_DISK);
		break;
	case WT_PM_REC_SPLIT:			/* Page split */
		/*
		 * Update the parent to reference new internal page(s).
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		parent_ref->page = mod->u.split;
		WT_PUBLISH(parent_ref->state, WT_REF_MEM);

		/* Clear the reference else discarding the page will free it. */
		mod->u.split = NULL;
		F_CLR(mod, WT_PM_REC_SPLIT);
		break;
	case WT_PM_REC_EMPTY:			/* Page is empty */
		/* We checked if the page was empty when we reviewed it. */
		/* FALLTHROUGH */
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
/*
 * __wt_cursor_key_order_check --
 *	Check key ordering for cursor movements.
 */
int
__wt_cursor_key_order_check(
    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next)
{
	switch (cbt->ref->page->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_VAR:
		/* Column stores check ordering by record number. */
		return (__cursor_key_order_check_col(session, cbt, next));
	case WT_PAGE_ROW_LEAF:
		/* Row stores check ordering by key comparison. */
		return (__cursor_key_order_check_row(session, cbt, next));
	WT_ILLEGAL_VALUE(session);
	}
	/* NOTREACHED */
}
/*
 * __wt_btcur_remove --
 *	Remove a record from the tree.
 */
int
__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	WT_BSTAT_INCR(session, cursor_removes);

	/* Row-store keys must fit on a page. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

	/* Retry from here if the modify races with a page split. */
retry:	__cursor_func_init(cbt, 1);

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		WT_ERR(__wt_col_search(session, cbt, 1));

		/*
		 * Remove the record if it exists.  Creating a record past the
		 * end of the tree in a fixed-length column-store implicitly
		 * fills the gap with empty records.  Return success in that
		 * case, the record was deleted successfully.
		 */
		if (cbt->compare != 0 || __cursor_invalid(cbt))
			ret =
			    __cursor_fix_implicit(btree, cbt) ? 0 : WT_NOTFOUND;
		else if ((ret =
		    __wt_col_modify(session, cbt, 2)) == WT_RESTART)
			goto retry;
		break;
	case BTREE_ROW:
		/* Remove the record if it exists. */
		WT_ERR(__wt_row_search(session, cbt, 1));
		if (cbt->compare != 0 || __cursor_invalid(cbt))
			ret = WT_NOTFOUND;
		else if ((ret =
		    __wt_row_modify(session, cbt, 1)) == WT_RESTART)
			goto retry;
		break;
	WT_ILLEGAL_VALUE(session);
	}

err:	__cursor_func_resolve(cbt, ret);
	return (ret);
}
/*
 * __wt_ovfl_discard --
 *	Discard an on-page overflow value, and reset the page's cell.
 */
int
__wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_DECL_RET;

	btree = S2BT(session);
	bm = btree->bm;
	unpack = &_unpack;

	__wt_cell_unpack(cell, unpack);

	/*
	 * Finally remove overflow key/value objects, called when reconciliation
	 * finishes after successfully writing a page.
	 *
	 * Keys must have already been instantiated and value objects must have
	 * already been cached (if they might potentially still be read by any
	 * running transaction).
	 *
	 * Acquire the overflow lock to avoid racing with a thread reading the
	 * backing overflow blocks.
	 */
	WT_RET(__wt_writelock(session, btree->ovfl_lock));

	/* Flip the cell to its "removed" type while holding the lock. */
	switch (unpack->raw) {
	case WT_CELL_KEY_OVFL:
		__wt_cell_type_reset(session,
		    unpack->cell, WT_CELL_KEY_OVFL, WT_CELL_KEY_OVFL_RM);
		break;
	case WT_CELL_VALUE_OVFL:
		__wt_cell_type_reset(session,
		    unpack->cell, WT_CELL_VALUE_OVFL, WT_CELL_VALUE_OVFL_RM);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	WT_TRET(__wt_writeunlock(session, btree->ovfl_lock));

	/* Free the backing disk blocks. */
	WT_TRET(bm->free(bm, session, unpack->data, unpack->size));

	return (ret);
}
/*
 * __wt_txn_log_op --
 *	Write the last logged operation into the in-memory buffer.
 */
int
__wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
	WT_ITEM *logrec;
	WT_TXN *txn;
	WT_TXN_OP *op;

	txn = &session->txn;

	/* Skip logging if disabled for the connection, session or tree. */
	if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED) ||
	    F_ISSET(session, WT_SESSION_NO_LOGGING) ||
	    F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING))
		return (0);

	/* We'd better have a transaction. */
	WT_ASSERT(session,
	    F_ISSET(txn, WT_TXN_RUNNING) && F_ISSET(txn, WT_TXN_HAS_ID));

	/* Pack the most recently tracked modification. */
	WT_ASSERT(session, txn->mod_count > 0);
	op = txn->mod + txn->mod_count - 1;

	WT_RET(__txn_logrec_init(session));
	logrec = txn->logrec;
	switch (op->type) {
	case WT_TXN_OP_BASIC:
		return (__txn_op_log(session, logrec, op, cbt));
	case WT_TXN_OP_INMEM:
	case WT_TXN_OP_REF:
		/* Nothing to log, we're done. */
		return (0);
	case WT_TXN_OP_TRUNCATE_COL:
		return (__wt_logop_col_truncate_pack(session, logrec,
		    op->fileid,
		    op->u.truncate_col.start, op->u.truncate_col.stop));
	case WT_TXN_OP_TRUNCATE_ROW:
		/*
		 * Use the local logrec alias (the same buffer as txn->logrec)
		 * for consistency with the other cases.
		 */
		return (__wt_logop_row_truncate_pack(session, logrec,
		    op->fileid,
		    &op->u.truncate_row.start, &op->u.truncate_row.stop,
		    (uint32_t)op->u.truncate_row.mode));
	WT_ILLEGAL_VALUE(session);
	}
	/* NOTREACHED */
}
/*
 * __wt_debug_disk --
 *	Dump a disk page in debugging mode.
 */
int
__wt_debug_disk(
    WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, const char *ofile)
{
	WT_DBG *ds, _ds;
	WT_DECL_RET;

	ds = &_ds;
	WT_RET(__debug_config(session, ds, ofile));

	/* Dump the page header: type, record number and entry count. */
	__dmsg(ds, "%s page", __wt_page_type_string(dsk->type));
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		/* Column-store pages also carry a starting record number. */
		__dmsg(ds, ", recno %" PRIu64, dsk->recno);
		/* FALLTHROUGH */
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		__dmsg(ds, ", entries %" PRIu32 "\n", dsk->u.entries);
		break;
	case WT_PAGE_OVFL:
		__dmsg(ds, ", datalen %" PRIu32 "\n", dsk->u.datalen);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Dump the page's content. */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
		__debug_dsk_col_fix(ds, dsk);
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		ret = __debug_dsk_cell(ds, dsk);
		break;
	default:
		break;
	}

	__dmsg_wrapup(ds);

	return (ret);
}
/*
 * __wt_cursor_key_order_init --
 *	Initialize key ordering checks for cursor movements after a successful
 * search.
 */
int
__wt_cursor_key_order_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
	/*
	 * Cursor searches set the position for cursor movements, set the
	 * last-key value for diagnostic checking.
	 */
	switch (cbt->ref->page->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_VAR:
		/* Column stores track the last record number returned. */
		cbt->lastrecno = cbt->recno;
		return (0);
	case WT_PAGE_ROW_LEAF:
		/* Row stores copy the current key aside for later compares. */
		return (__wt_buf_set(session,
		    cbt->lastkey, cbt->iface.key.data, cbt->iface.key.size));
	WT_ILLEGAL_VALUE(session);
	}
	/* NOTREACHED */
}
/*
 * __merge_walk --
 *	Visit all of the child references in a locked subtree and apply a
 *	callback function to them.
 */
static int
__merge_walk(WT_SESSION_IMPL *session, WT_PAGE *page, u_int depth,
    void (*visit)(WT_PAGE *, WT_REF *, WT_VISIT_STATE *),
    WT_VISIT_STATE *state)
{
	WT_PAGE *child;
	WT_REF *ref;
	uint32_t i;

	/* Track the deepest level reached for the caller. */
	if (depth > state->maxdepth)
		state->maxdepth = depth;

	WT_REF_FOREACH(page, ref, i)
		switch (ref->state) {
		case WT_REF_LOCKED:
			child = ref->page;

			/*
			 * Visit internal pages recursively.  This must match
			 * the walk in __rec_review: if the merge succeeds, we
			 * have to unlock everything.
			 */
			if (child->type == page->type &&
			    __wt_btree_mergeable(child)) {
				WT_RET(__merge_walk(
				    session, child, depth + 1, visit, state));
				break;
			}
			/* FALLTHROUGH */

		case WT_REF_DELETED:
		case WT_REF_DISK:
			(*visit)(page, ref, state);
			break;

		/* The whole subtree is locked; other states are impossible. */
		case WT_REF_EVICT_WALK:
		case WT_REF_MEM:
		case WT_REF_READING:
		WT_ILLEGAL_VALUE(session);
		}

	return (0);
}
/*
 * __wt_txn_op_printlog --
 *	Print a single logged operation from a transaction's log record.
 */
int
__wt_txn_op_printlog(
    WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out)
{
	uint32_t optype, opsize;

	/* Peek at the size and the type. */
	WT_RET(__wt_logop_read(session, pp, end, &optype, &opsize));
	end = *pp + opsize;

	/* Dispatch on the operation type, printing the operation's fields. */
	switch (optype) {
	case WT_LOGOP_COL_PUT:
		return (__wt_logop_col_put_print(session, pp, end, out));
	case WT_LOGOP_COL_REMOVE:
		return (__wt_logop_col_remove_print(session, pp, end, out));
	case WT_LOGOP_COL_TRUNCATE:
		return (__wt_logop_col_truncate_print(session, pp, end, out));
	case WT_LOGOP_ROW_PUT:
		return (__wt_logop_row_put_print(session, pp, end, out));
	case WT_LOGOP_ROW_REMOVE:
		return (__wt_logop_row_remove_print(session, pp, end, out));
	case WT_LOGOP_ROW_TRUNCATE:
		return (__wt_logop_row_truncate_print(session, pp, end, out));
	WT_ILLEGAL_VALUE(session);
	}
	/* NOTREACHED */
}
/*
 * __wt_curbulk_init --
 *	Initialize a bulk cursor.
 */
int
__wt_curbulk_init(WT_SESSION_IMPL *session,
    WT_CURSOR_BULK *cbulk, bool bitmap, bool skip_sort_check)
{
	WT_CURSOR *c;
	WT_CURSOR_BTREE *cbt;

	cbt = &cbulk->cbt;
	c = &cbt->iface;

	/* Bulk cursors only support insert and close (reset is a no-op). */
	__wt_cursor_set_notsup(c);

	/* Pick the insert method matching the tree type. */
	switch (cbt->btree->type) {
	case BTREE_COL_FIX:
		if (bitmap)
			c->insert = __curbulk_insert_fix_bitmap;
		else
			c->insert = __curbulk_insert_fix;
		break;
	case BTREE_COL_VAR:
		c->insert = __curbulk_insert_var;
		break;
	case BTREE_ROW:
		/*
		 * Row-store order comparisons are expensive, so we optionally
		 * skip them when we know the input is correct.
		 */
		if (skip_sort_check)
			c->insert = __curbulk_insert_row_skip_check;
		else
			c->insert = __curbulk_insert_row;
		break;
	WT_ILLEGAL_VALUE(session);
	}

	cbulk->first_insert = true;
	cbulk->recno = 0;
	cbulk->bitmap = bitmap;
	if (bitmap)
		F_SET(c, WT_CURSTD_RAW);

	return (__wt_bulk_init(session, cbulk));
}
/*
 * __meta_track_apply --
 *	Apply the changes in a metadata tracking record.
 */
static int
__meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_RET;
	int tret;

	switch (trk->op) {
	case WT_ST_EMPTY:		/* Unused slot */
		break;
	case WT_ST_CHECKPOINT:		/* Checkpoint, see above */
		/* Resolve the checkpoint in the block manager. */
		btree = trk->dhandle->handle;
		bm = btree->bm;
		WT_WITH_DHANDLE(session, trk->dhandle,
		    WT_TRET(bm->checkpoint_resolve(bm, session)));
		break;
	case WT_ST_DROP_COMMIT:
		/* Complete a drop by removing the underlying file. */
		if ((tret = __wt_remove_if_exists(session, trk->a)) != 0) {
			__wt_err(session, tret,
			    "metadata remove dropped file %s", trk->a);
			WT_TRET(tret);
		}
		break;
	case WT_ST_LOCK:
		/* Release the tracked btree handle. */
		WT_WITH_DHANDLE(session, trk->dhandle,
		    WT_TRET(__wt_session_release_btree(session)));
		break;
	case WT_ST_FILEOP:
	case WT_ST_REMOVE:
	case WT_ST_SET:
		/* Nothing to apply on commit for these operations. */
		break;
	WT_ILLEGAL_VALUE(session);
	}

	__meta_track_clear(session, trk);

	return (ret);
}
/*
 * __merge_promote_key --
 *	Copy a key from a child page into the reference in its parent, so it
 *	can be found by searches.
 */
static int
__merge_promote_key(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_PAGE *page;
	WT_REF *child_ref;
	size_t size;
	void *p;

	page = ref->page;
	switch (page->type) {
	case WT_PAGE_COL_INT:
		/* Column stores only need the starting record number. */
		child_ref = &page->u.intl.t[0];
		ref->key.recno = page->u.intl.recno = child_ref->key.recno;
		return (0);

	case WT_PAGE_ROW_INT:
		/* Row stores copy (and reference-count) the child's key. */
		child_ref = &page->u.intl.t[0];
		__wt_ref_key(child_ref->page, child_ref, &p, &size);
		return (__wt_row_ikey_incr(
		    session, page, 0, p, size, &ref->key.ikey));

	WT_ILLEGAL_VALUE(session);
	}
}
/*
 * __merge_promote_key --
 *	Copy a key from a child page into the reference in its parent, so it
 *	can be found by searches.
 */
static int
__merge_promote_key(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_IKEY *ikey;
	WT_PAGE *page;
	WT_REF *child_ref;

	page = ref->page;
	switch (page->type) {
	case WT_PAGE_COL_INT:
		/* Column stores only need the starting record number. */
		child_ref = &page->u.intl.t[0];
		ref->u.recno = page->u.intl.recno = child_ref->u.recno;
		return (0);

	case WT_PAGE_ROW_INT:
		/*
		 * Row stores copy (and reference-count) the child's
		 * instantiated key.
		 */
		child_ref = &page->u.intl.t[0];
		ikey = child_ref->u.key;
		WT_ASSERT(session, ikey != NULL);
		return (__wt_row_ikey_incr(session, page, 0,
		    WT_IKEY_DATA(ikey), ikey->size, &ref->u.key));

	WT_ILLEGAL_VALUE(session);
	}
}
/*
 * __wt_page_inmem --
 *	Build in-memory page information.
 */
int
__wt_page_inmem(WT_SESSION_IMPL *session,
    WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep)
{
	WT_DECL_RET;
	WT_PAGE *page;
	const WT_PAGE_HEADER *dsk;
	uint32_t alloc_entries;
	size_t size;

	*pagep = NULL;

	dsk = image;
	alloc_entries = 0;

	/*
	 * Figure out how many underlying objects the page references so we can
	 * allocate them along with the page.
	 */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		/*
		 * Column-store leaf page entries map one-to-one to the number
		 * of physical entries on the page (each physical entry is a
		 * value item).
		 *
		 * Column-store internal page entries map one-to-one to the
		 * number of physical entries on the page (each entry is a
		 * location cookie).
		 */
		alloc_entries = dsk->u.entries;
		break;
	case WT_PAGE_ROW_INT:
		/*
		 * Row-store internal page entries map one-to-two to the number
		 * of physical entries on the page (each entry is a key and
		 * location cookie pair).
		 */
		alloc_entries = dsk->u.entries / 2;
		break;
	case WT_PAGE_ROW_LEAF:
		/*
		 * If the "no empty values" flag is set, row-store leaf page
		 * entries map one-to-one to the number of physical entries
		 * on the page (each physical entry is a key or value item).
		 * If that flag is not set, there are more keys than values,
		 * we have to walk the page to figure it out.
		 */
		if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL))
			alloc_entries = dsk->u.entries;
		else if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
			alloc_entries = dsk->u.entries / 2;
		else
			WT_RET(__inmem_row_leaf_entries(
			    session, dsk, &alloc_entries));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Allocate and initialize a new WT_PAGE. */
	WT_RET(__wt_page_alloc(
	    session, dsk->type, dsk->recno, alloc_entries, 1, &page));
	page->dsk = dsk;
	F_SET_ATOMIC(page, flags);

	/*
	 * Track the memory allocated to build this page so we can update the
	 * cache statistics in a single call.
	 */
	size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0;

	/* Build the per-type in-memory structures. */
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		__inmem_col_fix(session, page);
		break;
	case WT_PAGE_COL_INT:
		__inmem_col_int(session, page);
		break;
	case WT_PAGE_COL_VAR:
		WT_ERR(__inmem_col_var(session, page, &size));
		break;
	case WT_PAGE_ROW_INT:
		WT_ERR(__inmem_row_int(session, page, &size));
		break;
	case WT_PAGE_ROW_LEAF:
		WT_ERR(__inmem_row_leaf(session, page));
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

	/* Update the page's in-memory size and the cache statistics. */
	__wt_cache_page_inmem_incr(session, page, size);

	/* Link the new internal page to the parent. */
	if (ref != NULL) {
		switch (page->type) {
		case WT_PAGE_COL_INT:
		case WT_PAGE_ROW_INT:
			page->pg_intl_parent_ref = ref;
			break;
		}
		ref->page = page;
	}

	*pagep = page;
	return (0);

err:	__wt_page_out(session, &page);
	return (ret);
}
/*
 * __wt_verify_dsk_image --
 *	Verify a single block as read from disk.
 */
int
__wt_verify_dsk_image(WT_SESSION_IMPL *session,
    const char *tag, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok)
{
	const uint8_t *p, *end;
	u_int i;
	uint8_t flags;

	/* Check the page type. */
	switch (dsk->type) {
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_OVFL:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		break;
	case WT_PAGE_INVALID:
	default:
		WT_RET_VRFY(session,
		    "page at %s has an invalid type of %" PRIu32,
		    tag, dsk->type);
	}

	/* Check the page record number. */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		/* Column-store pages must have a non-zero record number. */
		if (dsk->recno != 0)
			break;
		WT_RET_VRFY(session,
		    "%s page at %s has a record number of zero",
		    __wt_page_type_string(dsk->type), tag);
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		/* All other page types must have a zero record number. */
		if (dsk->recno == 0)
			break;
		WT_RET_VRFY(session,
		    "%s page at %s has a non-zero record number",
		    __wt_page_type_string(dsk->type), tag);
	}

	/* Check the page flags: clear each legal flag, expect zero remains. */
	flags = dsk->flags;
	if (LF_ISSET(WT_PAGE_COMPRESSED))
		LF_CLR(WT_PAGE_COMPRESSED);
	if (LF_ISSET(WT_PAGE_ENCRYPTED))
		LF_CLR(WT_PAGE_ENCRYPTED);
	if (dsk->type == WT_PAGE_ROW_LEAF) {
		if (LF_ISSET(WT_PAGE_EMPTY_V_ALL) &&
		    LF_ISSET(WT_PAGE_EMPTY_V_NONE))
			WT_RET_VRFY(session,
			    "page at %s has invalid flags combination: 0x%"
			    PRIx8,
			    tag, dsk->flags);
		if (LF_ISSET(WT_PAGE_EMPTY_V_ALL))
			LF_CLR(WT_PAGE_EMPTY_V_ALL);
		if (LF_ISSET(WT_PAGE_EMPTY_V_NONE))
			LF_CLR(WT_PAGE_EMPTY_V_NONE);
	}
	if (flags != 0)
		WT_RET_VRFY(session,
		    "page at %s has invalid flags set: 0x%" PRIx8,
		    tag, flags);

	/*
	 * Unused bytes.
	 *
	 * Bug fix: advance the pointer each iteration, otherwise only the
	 * first unused byte is ever checked.
	 */
	for (p = dsk->unused, i = sizeof(dsk->unused); i > 0; --i)
		if (*p++ != '\0')
			WT_RET_VRFY(session,
			    "page at %s has non-zero unused page header bytes",
			    tag);

	/*
	 * Any bytes after the data chunk should be nul bytes; ignore if the
	 * size is 0, that allows easy checking of disk images where we don't
	 * have the size.
	 */
	if (size != 0) {
		p = (uint8_t *)dsk + dsk->mem_size;
		end = (uint8_t *)dsk + size;
		for (; p < end; ++p)
			if (*p != '\0')
				WT_RET_VRFY(session,
				    "%s page at %s has non-zero trailing bytes",
				    __wt_page_type_string(dsk->type), tag);
	}

	/* Check for empty pages, then verify the items on the page. */
	switch (dsk->type) {
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		if (!empty_page_ok && dsk->u.entries == 0)
			WT_RET_VRFY(session, "%s page at %s has no entries",
			    __wt_page_type_string(dsk->type), tag);
		break;
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
		if (dsk->u.datalen == 0)
			WT_RET_VRFY(session, "%s page at %s has no data",
			    __wt_page_type_string(dsk->type), tag);
		break;
	}

	/* Verify the items on the page. */
	switch (dsk->type) {
	case WT_PAGE_COL_INT:
		return (__verify_dsk_col_int(session, tag, dsk));
	case WT_PAGE_COL_FIX:
		return (__verify_dsk_col_fix(session, tag, dsk));
	case WT_PAGE_COL_VAR:
		return (__verify_dsk_col_var(session, tag, dsk));
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		return (__verify_dsk_row(session, tag, dsk));
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
		return (__verify_dsk_chunk(session, tag, dsk, dsk->u.datalen));
	WT_ILLEGAL_VALUE(session);
	}
	/* NOTREACHED */
}
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_DECL_RET;
	WT_PAGE *page;
	int busy, force_attempts, oldgen;

	/* Loop until we have a hazard pointer or give up. */
	for (force_attempts = oldgen = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, attempt to read it.
			 * Make sure there is space in the cache.
			 */
			WT_RET(__wt_cache_full_check(session));
			WT_RET(__wt_cache_read(session, ref));
			/* Remember if this read shouldn't trash the cache. */
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			/* FALLTHROUGH */
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			/* The page is busy -- wait. */
			break;
		case WT_REF_SPLIT:
			/* The page split under us, restart the operation. */
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory: get a hazard pointer, update
			 * the page's LRU and return.  The expected reason we
			 * can't get a hazard pointer is because the page is
			 * being evicted; yield and try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy)
				break;

			page = ref->page;
			WT_ASSERT(session, page != NULL);

			/* Forcibly evict pages that are too big. */
			if (!LF_ISSET(WT_READ_NO_EVICT) &&
			    force_attempts < 10 &&
			    __evict_force_check(session, page)) {
				++force_attempts;
				WT_RET(
				    __wt_page_release(session, ref, flags));
				break;
			}

			/* Check if we need an autocommit transaction. */
			if ((ret =
			    __wt_txn_autocommit_check(session)) != 0) {
				WT_TRET(__wt_hazard_clear(session, page));
				return (ret);
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_set(session);
			return (0);
		WT_ILLEGAL_VALUE(session);
		}

		/* We failed to get the page -- yield before retrying. */
		__wt_yield();
	}
}
/*
 * __wt_verify_dsk --
 *	Verify a single Btree page as read from disk.
 */
int
__wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf)
{
	WT_PAGE_HEADER *dsk;
	uint32_t size;
	uint8_t *p, *end;
	u_int i;

	dsk = buf->mem;
	size = buf->size;

	/* Check the page type. */
	switch (dsk->type) {
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_OVFL:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		break;
	case WT_PAGE_INVALID:
	default:
		WT_RET_VRFY(session,
		    "page at %s has an invalid type of %" PRIu32,
		    addr, dsk->type);
	}

	/* Check the page record number. */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		/* Column-store pages must have a non-zero record number. */
		if (dsk->recno != 0)
			break;
		WT_RET_VRFY(session,
		    "%s page at %s has a record number of zero",
		    __wt_page_type_string(dsk->type), addr);
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		/* All other page types must have a zero record number. */
		if (dsk->recno == 0)
			break;
		WT_RET_VRFY(session,
		    "%s page at %s has a non-zero record number",
		    __wt_page_type_string(dsk->type), addr);
	}

	/* Check the page flags. */
	switch (dsk->flags) {
	case 0:
	case WT_PAGE_COMPRESSED:
		break;
	default:
		WT_RET_VRFY(session,
		    "page at %s has an invalid flags value of 0x%" PRIx32,
		    addr, (uint32_t)dsk->flags);
	}

	/*
	 * Unused bytes.
	 *
	 * Bug fix: advance the pointer each iteration, otherwise only the
	 * first unused byte is ever checked.
	 */
	for (p = dsk->unused, i = sizeof(dsk->unused); i > 0; --i)
		if (*p++ != '\0')
			WT_RET_VRFY(session,
			    "page at %s has non-zero unused page header bytes",
			    addr);

	/* Any bytes after the data chunk should be nul bytes. */
	p = (uint8_t *)dsk + dsk->mem_size;
	end = (uint8_t *)dsk + size;
	for (; p < end; ++p)
		if (*p != '\0')
			WT_RET_VRFY(session,
			    "%s page at %s has non-zero trailing bytes",
			    __wt_page_type_string(dsk->type), addr);

	/* Verify the items on the page. */
	switch (dsk->type) {
	case WT_PAGE_COL_INT:
		return (__verify_dsk_col_int(session, addr, dsk));
	case WT_PAGE_COL_FIX:
		return (__verify_dsk_col_fix(session, addr, dsk));
	case WT_PAGE_COL_VAR:
		return (__verify_dsk_col_var(session, addr, dsk));
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		return (__verify_dsk_row(session, addr, dsk));
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
		return (__verify_dsk_chunk(session, addr, dsk, dsk->u.datalen));
	WT_ILLEGAL_VALUE(session);
	}
	/* NOTREACHED */
}
/*
 * __curjoin_entry_in_range --
 *	Check if a key is in the range specified by the entry, returning
 *	WT_NOTFOUND if not.
 */
static int
__curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
    WT_ITEM *curkey, WT_CURSOR_JOIN_ITER *iter)
{
	WT_COLLATOR *collator;
	WT_CURSOR_JOIN_ENDPOINT *end, *endmax;
	u_int pos;
	int cmp;
	bool disjunction, passed;

	collator = (entry->index != NULL) ? entry->index->collator : NULL;
	endmax = &entry->ends[entry->ends_next];
	disjunction = F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION);

	/*
	 * The iterator may have already satisfied some endpoint conditions.
	 * If so and we're a disjunction, we're done.  If so and we're a
	 * conjunction, we can start past the satisfied conditions.
	 */
	if (iter == NULL)
		pos = 0;
	else {
		if (disjunction && iter->end_skip)
			return (0);
		pos = iter->end_pos + iter->end_skip;
	}

	/* Test the key against each remaining endpoint. */
	for (end = &entry->ends[pos]; end < endmax; end++) {
		WT_RET(__wt_compare(
		    session, collator, curkey, &end->key, &cmp));
		switch (WT_CURJOIN_END_RANGE(end)) {
		case WT_CURJOIN_END_EQ:
			passed = (cmp == 0);
			break;
		case WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ:
			passed = (cmp >= 0);
			/* Only reachable without an iterator (asserted). */
			WT_ASSERT(session, iter == NULL);
			break;
		case WT_CURJOIN_END_GT:
			passed = (cmp > 0);
			/*
			 * A passed strict lower bound at the first position
			 * stays satisfied for the rest of the iteration.
			 */
			if (passed && iter != NULL && pos == 0)
				iter->end_skip = 1;
			break;
		case WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ:
			passed = (cmp <= 0);
			break;
		case WT_CURJOIN_END_LT:
			passed = (cmp < 0);
			break;
		WT_ILLEGAL_VALUE(session, WT_CURJOIN_END_RANGE(end));
		}

		if (!passed) {
			/*
			 * An exact-match iteration, or moving past an upper
			 * bound, means this key can never match; advance the
			 * iterator before returning.
			 */
			if (iter != NULL &&
			    (iter->is_equal ||
			    F_ISSET(end, WT_CURJOIN_END_LT))) {
				WT_RET(__curjoin_iter_bump(iter));
				return (WT_NOTFOUND);
			}
			if (!disjunction)
				return (WT_NOTFOUND);
			iter = NULL;
		} else if (disjunction)
			break;
	}
	/* A disjunction needs at least one endpoint to have passed. */
	if (disjunction && end == endmax)
		return (WT_NOTFOUND);
	return (0);
}
/*
 * __wt_kv_return --
 *	Return a page referenced key/value pair to the application.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_ITEM *tmp;
	WT_PAGE *page;
	WT_ROW *rip;
	uint8_t v;

	btree = S2BT(session);

	page = cbt->ref->page;
	cursor = &cbt->iface;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		/*
		 * The interface cursor's record has usually been set, but that
		 * isn't universally true, specifically, cursor.search_near may
		 * call here without first setting the interface cursor.
		 */
		cursor->recno = cbt->recno;

		/* If the cursor references a WT_UPDATE item, return it. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Take the value from the original page. */
		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));
	case WT_PAGE_COL_VAR:
		/*
		 * The interface cursor's record has usually been set, but that
		 * isn't universally true, specifically, cursor.search_near may
		 * call here without first setting the interface cursor.
		 */
		cursor->recno = cbt->recno;

		/* If the cursor references a WT_UPDATE item, return it. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Take the value from the original page cell. */
		cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->pg_row_d[cbt->slot];

		/*
		 * If the cursor references a WT_INSERT item, take its key.
		 * Else, if we have an exact match, we copied the key in the
		 * search function, take it from there.
		 * If we don't have an exact match, take the key from the
		 * original page.
		 */
		if (cbt->ins != NULL) {
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		} else if (cbt->compare == 0) {
			/*
			 * If not in an insert list and there's an exact match,
			 * the row-store search function built the key we want
			 * to return in the cursor's temporary buffer.  Swap the
			 * cursor's search-key and temporary buffers so we can
			 * return it (it's unsafe to return the temporary buffer
			 * itself because our caller might do another search in
			 * this table using the key we return, and we'd corrupt
			 * the search key during any subsequent search that used
			 * the temporary buffer).
			 */
			tmp = cbt->row_key;
			cbt->row_key = cbt->tmp;
			cbt->tmp = tmp;

			cursor->key.data = cbt->row_key->data;
			cursor->key.size = cbt->row_key->size;
		} else
			WT_RET(__wt_row_leaf_key(
			    session, page, rip, &cursor->key, false));

		/* If the cursor references a WT_UPDATE item, return it. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Simple values have their location encoded in the WT_ROW. */
		if (__wt_row_leaf_value(page, rip, &cursor->value))
			return (0);

		/*
		 * Take the value from the original page cell (which may be
		 * empty).
		 */
		if ((cell =
		    __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* The value is an on-page cell, unpack and expand it as necessary. */
	__wt_cell_unpack(cell, &unpack);
	WT_RET(
	    __wt_page_cell_data_ref(session, page, &unpack, &cursor->value));

	return (0);
}
/*
 * __wt_ovfl_cache --
 *	Handle deletion of an overflow value.
 *
 * This solves a reconciliation problem: a leaf page references an overflow
 * item, the item is updated and the update committed, a checkpoint frees the
 * backing overflow blocks, and then a snapshot transaction still wants the
 * original version of the item.  There's no way to delay freeing the
 * underlying blocks until a particular set of transactions exits (and this
 * shouldn't be common), so cache the overflow value in memory.
 *
 * It gets hard because a snapshot reader might search the WT_UPDATE list,
 * find nothing useful, read the overflow address from the on-page cell, go
 * to sleep, and wake after a checkpoint has cached the value, freed the
 * blocks and another thread has reallocated and overwritten them -- reading
 * the wrong value.  A per-btree read/write lock plus the on-page cell fixes
 * this: hold the write lock while changing the cell type from
 * WT_CELL_VALUE_OVFL to WT_CELL_VALUE_OVFL_RM, hold the read lock while
 * reading an overflow item.  The lock could be per page or per item, but
 * overflow values are supposed to be rare and contention isn't expected.
 */
int
__wt_ovfl_cache(WT_SESSION_IMPL *session,
    WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack)
{
	int visible;

	/*
	 * Check for a globally visible update first: if one exists, no
	 * running thread can have moved past it and caching is unnecessary.
	 */
	switch (page->type) {
	case WT_PAGE_ROW_LEAF:
		visible = __ovfl_cache_row_visible(session, page, cookie);
		break;
	case WT_PAGE_COL_VAR:
		visible = __ovfl_cache_col_visible(session, cookie, vpack);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/*
	 * No globally visible update means a reader in the system might still
	 * want the old value: cache it.
	 */
	if (!visible) {
		WT_RET(__ovfl_cache(session, page, vpack));
		WT_STAT_FAST_DATA_INCR(session, cache_overflow_value);
	}

	/*
	 * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the
	 * underlying overflow value's blocks to be freed when reconciliation
	 * completes.
	 */
	return (__wt_ovfl_discard_add(session, page, vpack->cell));
}
/*
 * __wt_kv_return --
 *	Return a page referenced key/value pair to the application.
 *
 * Fixes over the previous revision of this block:
 * - btree, page and cursor were read without ever being initialized
 *   (undefined behavior); they are now derived from the session/cursor.
 * - operator-precedence bug: `if (cell = f(...) == NULL)` assigned the
 *   boolean comparison result to cell instead of the cell pointer.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_PAGE *page;
	WT_ROW *rip;
	uint8_t v;

	/* These were previously used uninitialized. */
	btree = S2BT(session);
	page = cbt->ref->page;
	cursor = &cbt->iface;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		cursor->recno = cbt->recno;

		/* The cursor references a WT_UPDATE item: return its value. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Take the bit-field value from the original page. */
		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));
	case WT_PAGE_COL_VAR:
		cursor->recno = cbt->recno;

		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/*
		 * Find the cell for this slot; the value is resolved from it
		 * after the switch, below.
		 */
		cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->pg_row_d[cbt->slot];

		if (cbt->ins != NULL) {
			/* The key comes from the insert-list entry. */
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		} else if (cbt->compare == 0) {
			/* Exact match: the search built the key for us. */
			cursor->key.data = cbt->search_key.data;
			cursor->key.size = cbt->search_key.size;
		} else
			/* Otherwise take the key from the original page. */
			WT_RET(__wt_row_leaf_key(
			    session, page, rip, &cursor->key, 0));

		/* The value is in the append/update list: return it. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Simple values have their location encoded in the WT_ROW. */
		if (__wt_row_leaf_value(page, rip, &cursor->value))
			return (0);

		/*
		 * Not contiguously stored: locate the value through its cell.
		 * Note the parentheses around the assignment -- without them,
		 * `==` binds tighter than `=` and cell gets the comparison's
		 * boolean result, not the pointer.
		 */
		if ((cell =
		    __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/*
	 * The value is an on-page cell (possibly an overflow item): unpack
	 * and expand it as necessary.
	 */
	__wt_cell_unpack(cell, &unpack);
	WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value));

	return (0);
}
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	u_int sleep_cnt, wait_cnt;
	int busy, cache_work, force_attempts, oldgen, stalled;

	btree = S2BT(session);
	stalled = 0;

	/*
	 * Loop until the reference is resolved: each pass examines the
	 * current reference state and either returns, transitions the state,
	 * or falls out of the switch to the yield/sleep back-off below.
	 */
	for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			/* Cache-only operations never read from disk. */
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, read it. If this thread is
			 * allowed to do eviction work, check for space in the
			 * cache.
			 */
			if (!LF_ISSET(WT_READ_NO_EVICT))
				WT_RET(__wt_cache_eviction_check(
				    session, 1, NULL));
			WT_RET(__page_read(session, ref));
			/*
			 * Pages we won't need again (or sessions configured to
			 * not pollute the cache) are marked for early eviction
			 * once the page lands in memory; see skip_evict below.
			 */
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);

			/* Waiting on another thread's read, stall. */
			WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
			stalled = 1;
			break;
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);

			/* Waiting on eviction, stall. */
			WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
			stalled = 1;
			break;
		case WT_REF_SPLIT:
			/* The page split away; the caller must restart. */
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory.
			 *
			 * Get a hazard pointer if one is required. We cannot
			 * be evicting if no hazard pointer is required, we're
			 * done.
			 */
			if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
				goto skip_evict;

			/*
			 * The expected reason we can't get a hazard pointer is
			 * because the page is being evicted, yield, try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy) {
				WT_STAT_FAST_CONN_INCR(
				    session, page_busy_blocked);
				break;
			}

			/*
			 * If eviction is configured for this file, check to see
			 * if the page qualifies for forced eviction and update
			 * the page's generation number. If eviction isn't being
			 * done on this file, we're done.
			 */
			if (LF_ISSET(WT_READ_NO_EVICT) ||
			    F_ISSET(session, WT_SESSION_NO_EVICTION) ||
			    F_ISSET(btree, WT_BTREE_NO_EVICTION))
				goto skip_evict;

			/*
			 * Forcibly evict pages that are too big.
			 */
			page = ref->page;
			if (force_attempts < 10 &&
			    __evict_force_check(session, page)) {
				++force_attempts;
				ret = __wt_page_release_evict(session, ref);
				/* If forced eviction fails, stall. */
				if (ret == EBUSY) {
					ret = 0;
					WT_STAT_FAST_CONN_INCR(session,
					    page_forcible_evict_blocked);
					stalled = 1;
					break;
				}
				WT_RET(ret);

				/*
				 * The result of a successful forced eviction
				 * is a page-state transition (potentially to
				 * an in-memory page we can use, or a restart
				 * return for our caller), continue the outer
				 * page-acquisition loop.
				 */
				continue;
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen != WT_READGEN_OLDEST &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_bump(session);
skip_evict:
			/*
			 * Check if we need an autocommit transaction.
			 * Starting a transaction can trigger eviction, so skip
			 * it if eviction isn't permitted.
			 */
			return (LF_ISSET(WT_READ_NO_EVICT) ?
			    0 : __wt_txn_autocommit_check(session));
		WT_ILLEGAL_VALUE(session);
		}

		/*
		 * We failed to get the page -- yield before retrying, and if
		 * we've yielded enough times, start sleeping so we don't burn
		 * CPU to no purpose.
		 */
		if (stalled)
			wait_cnt += 1000;
		else if (++wait_cnt < 1000) {
			__wt_yield();
			continue;
		}

		/*
		 * If stalling and this thread is allowed to do eviction work,
		 * check if the cache needs help. If we do work for the cache,
		 * substitute that for a sleep.
		 */
		if (!LF_ISSET(WT_READ_NO_EVICT)) {
			WT_RET(
			    __wt_cache_eviction_check(session, 1, &cache_work));
			if (cache_work)
				continue;
		}
		/* Exponential-ish back-off, capped at 10ms per sleep. */
		sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000);
		WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
		__wt_sleep(0, sleep_cnt);
	}
}