/*
 * __cursor_invalid --
 *	Report whether the cursor references a K/V pair that cannot be
 *	returned: the page has no entries, or the referenced item is deleted.
 *	Returns zero for a usable pair and non-zero otherwise.
 */
static inline int
__cursor_invalid(WT_CURSOR_BTREE *cbt)
{
	WT_CELL *slot_cell;
	WT_CELL_UNPACK cell_info;
	WT_COL *col;
	WT_PAGE *pg;

	pg = cbt->page;

	/* An item found on an insert list decides the answer by itself. */
	if (cbt->ins != NULL) {
		if (WT_UPDATE_DELETED_ISSET(cbt->ins->upd))
			return (1);
		return (0);
	}

	/* The search routine doesn't check for an empty page, do it here. */
	if (pg->entries == 0)
		return (1);

	/* Otherwise, examine the page slot the cursor references. */
	switch (cbt->btree->type) {
	case BTREE_COL_FIX:
		break;
	case BTREE_COL_VAR:
		col = &pg->u.col_var.d[cbt->slot];
		slot_cell = WT_COL_PTR(pg, col);

		/*
		 * NOTE(review): this path returns WT_NOTFOUND while every
		 * other "invalid" path returns 1. Both are non-zero; confirm
		 * no caller depends on the exact value before unifying them.
		 */
		if (slot_cell == NULL)
			return (WT_NOTFOUND);

		__wt_cell_unpack(slot_cell, &cell_info);
		if (cell_info.type == WT_CELL_DEL)
			return (1);
		break;
	case BTREE_ROW:
		/* No update array, or no update for this slot: still valid. */
		if (pg->u.row.upd == NULL)
			break;
		if (pg->u.row.upd[cbt->slot] == NULL)
			break;
		if (WT_UPDATE_DELETED_ISSET(pg->u.row.upd[cbt->slot]))
			return (1);
		break;
	}
	return (0);
}
/*
 * __wt_ovfl_discard --
 *	Discard an on-page overflow value, and reset the page's cell.
 *
 *	session: the session; its btree supplies the block manager and the
 *	    overflow lock.
 *	cell: the on-page overflow key or value cell to discard.
 *
 *	Returns the result of the block manager's free call, or whatever
 *	WT_ILLEGAL_VALUE yields if the cell is not an overflow key or value.
 */
int
__wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CELL_UNPACK *unpack, _unpack;
	uint8_t removed_type;

	btree = S2BT(session);
	bm = btree->bm;
	unpack = &_unpack;

	__wt_cell_unpack(cell, unpack);

	/*
	 * Choose the "removed" cell type before taking the overflow lock.
	 * WT_ILLEGAL_VALUE supplies the switch's default arm; resolving it
	 * here means an unexpected cell type can no longer leave the function
	 * with the overflow lock still held. The switch only reads the local
	 * unpacked copy, which was filled in before the lock in any case.
	 */
	removed_type = 0;		/* -Werror=maybe-uninitialized */
	switch (unpack->raw) {
	case WT_CELL_KEY_OVFL:
		removed_type = WT_CELL_KEY_OVFL_RM;
		break;
	case WT_CELL_VALUE_OVFL:
		removed_type = WT_CELL_VALUE_OVFL_RM;
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/*
	 * Finally remove overflow key/value objects, called when reconciliation
	 * finishes after successfully writing a page.
	 *
	 * Keys must have already been instantiated and value objects must have
	 * already been cached (if they might potentially still be read by any
	 * running transaction).
	 *
	 * Acquire the overflow lock to avoid racing with a thread reading the
	 * backing overflow blocks.
	 */
	__wt_writelock(session, &btree->ovfl_lock);
	__wt_cell_type_reset(session, unpack->cell, unpack->raw, removed_type);
	__wt_writeunlock(session, &btree->ovfl_lock);

	/* Free the backing disk blocks. */
	return (bm->free(bm, session, unpack->data, unpack->size));
}
/*
 * __ovfl_discard_verbose --
 *	Emit a verbose message describing an overflow record being discarded:
 *	an optional tag, the page pointer and the record's address string.
 */
static int
__ovfl_discard_verbose(
    WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell, const char *tag)
{
	WT_CELL_UNPACK info;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	const char *label, *sep;

	/* Scratch buffer for the address string, released on every path. */
	WT_RET(__wt_scr_alloc(session, 512, &tmp));

	__wt_cell_unpack(cell, &info);

	/* A missing tag drops both the tag text and its separator. */
	if (tag == NULL)
		label = sep = "";
	else {
		label = tag;
		sep = ": ";
	}

	WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
	    "discard: %s%s%p %s",
	    label, sep, page,
	    __wt_addr_string(session, info.data, info.size, tmp)));

err:	__wt_scr_free(session, &tmp);
	return (ret);
}
/*
 * __cursor_var_prev --
 *	Step the cursor back to the previous variable-length column-store
 *	item, returning WT_NOTFOUND once the start of the page is passed.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_INSERT *lower;
	WT_ITEM *out;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint64_t rle_start;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	out = &cbt->iface.value;

	rle_start = 0;			/* -Werror=maybe-uninitialized */

	/* On a new page, start from the page's last record. */
	if (newpage) {
		cbt->last_standard_recno = __col_var_last_recno(page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
		goto new_page;
	}

	/* Walk backward until a visible item is found. */
	for (;;) {
		__cursor_set_recno(cbt, cbt->recno - 1);

new_page:	if (cbt->recno < page->pg_var_recno)
			return (WT_NOTFOUND);

		/* Locate the WT_COL slot holding this record. */
		cip = __col_var_search(page, cbt->recno, &rle_start);
		if (cip == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(page, cip);

		/* Look for the record on the slot's insert list. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = NULL;
		if (cbt->ins != NULL)
			upd = __wt_txn_read(session, cbt->ins->upd);

		if (upd != NULL) {
			if (!WT_UPDATE_DELETED_ISSET(upd)) {
				out->data = WT_UPDATE_DATA(upd);
				out->size = upd->size;
				return (0);
			}

			/* Only count deletes visible to every transaction. */
			if (__wt_txn_visible_all(session, upd->txnid))
				++cbt->page_deleted_count;
			continue;
		}

		/*
		 * With no matching insert-list item, the value comes from the
		 * page. If this is the slot we returned last time the saved
		 * buffer is still good (this avoids repeatedly decoding items
		 * with large repeat counts); otherwise unpack the cell.
		 */
		if (cbt->cip_saved != cip) {
			cell = WT_COL_PTR(page, cip);
			if (cell == NULL)
				continue;

			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL) {
				if (__wt_cell_rle(&unpack) == 1)
					continue;

				/*
				 * Deleted records can form huge gaps in the
				 * variable-length column-store name space.
				 * For a run of more than one deleted record,
				 * compute where to resume rather than visiting
				 * each record in the run.
				 *
				 * Candidate one: the largest record on the
				 * update list smaller than the current record.
				 */
				lower = __col_insert_search_lt(
				    cbt->ins_head, cbt->recno);

				/*
				 * Candidate two: the record just before the
				 * start of this repeating run, as reported by
				 * the earlier __col_var_search call. Take
				 * whichever candidate is larger.
				 */
				cbt->recno = rle_start - 1;
				if (lower != NULL &&
				    WT_INSERT_RECNO(lower) > cbt->recno)
					cbt->recno = WT_INSERT_RECNO(lower);

				/* The loop top decrements; compensate. */
				++cbt->recno;
				continue;
			}

			WT_RET(__wt_page_cell_data_ref(
			    session, page, &unpack, cbt->tmp));
			cbt->cip_saved = cip;
		}

		out->data = cbt->tmp->data;
		out->size = cbt->tmp->size;
		return (0);
	}
	/* NOTREACHED */
}
/*
 * __split_ref_deepen_move --
 *	Move a WT_REF from a parent page to a child page as part of a split
 *	that deepens the tree, adjusting the caller's memory accounting.
 *
 *	parent_decrp and child_incrp accumulate the bytes leaving the parent
 *	and arriving at the child, respectively.
 */
static int
__split_ref_deepen_move(WT_SESSION_IMPL *session,
    WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp)
{
	WT_ADDR *addr;
	WT_CELL_UNPACK unpack;
	WT_DECL_RET;
	WT_IKEY *ikey;
	size_t key_len;
	void *key_data;

	/*
	 * The WT_REF structures themselves don't move, but the index entries
	 * that reference them are moving from the page being split to a set
	 * of child pages, so the WT_REF can no longer point into the block
	 * image that stays behind with the split page. Instantiate row-store
	 * keys, and column- and row-store addresses, to break that link.
	 *
	 * No locking is needed to update the WT_REF: we are the only thread
	 * splitting the parent, and readers cannot race with single-pointer
	 * updates. The changes must be written before the page goes away;
	 * that is the caller's responsibility.
	 *
	 * Row-store keys, first.
	 */
	if (parent->type == WT_PAGE_ROW_INT) {
		ikey = __wt_ref_key_instantiated(ref);
		if (ikey != NULL) {
			/* Already off-page: it was charged to the parent. */
			WT_RET(__split_ovfl_key_cleanup(session, parent, ref));
			*parent_decrp += sizeof(WT_IKEY) + ikey->size;
		} else {
			/* Build an instantiated key from the on-page key. */
			__wt_ref_key(parent, ref, &key_data, &key_len);
			WT_RET(
			    __wt_row_ikey(session, 0, key_data, key_len, ref));
			ikey = ref->key.ikey;
		}
		*child_incrp += sizeof(WT_IKEY) + ikey->size;
	}

	/*
	 * A missing address (the page has never been written) or an address
	 * that is already off-page needs no work. Otherwise, copy the address
	 * out of the on-page cell.
	 */
	addr = ref->addr;
	if (addr != NULL && !__wt_off_page(parent, addr)) {
		__wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
		WT_RET(__wt_calloc_one(session, &addr));

		ret = __wt_strndup(
		    session, unpack.data, unpack.size, &addr->addr);
		if (ret != 0) {
			/* Don't leak the new WT_ADDR if the copy failed. */
			__wt_free(session, addr);
			return (ret);
		}

		addr->size = (uint8_t)unpack.size;
		if (unpack.raw == WT_CELL_ADDR_INT)
			addr->type = WT_ADDR_INT;
		else
			addr->type = WT_ADDR_LEAF;
		ref->addr = addr;
	}

	/* And finally, account for the WT_REF itself. */
	WT_MEM_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF));

	return (0);
}
/*
 * __cursor_var_prev --
 *	Step the cursor back to the previous variable-length column-store
 *	item, returning WT_NOTFOUND once the start of the page is passed.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_ITEM *out;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	out = &cbt->iface.value;

	/* On a new page, start from the page's last record. */
	if (newpage) {
		cbt->last_standard_recno = __col_var_last_recno(page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
		goto new_page;
	}

	/* Walk backward until a visible item is found. */
	for (;;) {
		__cursor_set_recno(cbt, cbt->recno - 1);

new_page:	if (cbt->recno < page->pg_var_recno)
			return (WT_NOTFOUND);

		/* Locate the WT_COL slot holding this record. */
		cip = __col_var_search(page, cbt->recno);
		if (cip == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(page, cip);

		/* Look for the record on the slot's insert list. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = NULL;
		if (cbt->ins != NULL)
			upd = __wt_txn_read(session, cbt->ins->upd);

		if (upd != NULL) {
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				++cbt->page_deleted_count;
				continue;
			}
			out->data = WT_UPDATE_DATA(upd);
			out->size = upd->size;
			return (0);
		}

		/*
		 * With no matching insert-list item, the value comes from the
		 * page. If this is the slot we returned last time the saved
		 * buffer is still good (this avoids repeatedly decoding items
		 * with large repeat counts); otherwise unpack the cell.
		 */
		if (cbt->cip_saved != cip) {
			cell = WT_COL_PTR(page, cip);
			if (cell == NULL)
				continue;

			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL)
				continue;

			WT_RET(__wt_page_cell_data_ref(
			    session, page, &unpack, &cbt->tmp));
			cbt->cip_saved = cip;
		}

		out->data = cbt->tmp.data;
		out->size = cbt->tmp.size;
		return (0);
	}
	/* NOTREACHED */
}
/*
 * __wt_kv_return --
 *	Return a page referenced key/value pair to the application.
 *
 *	session: the session; its btree supplies the fixed-length bit count.
 *	cbt: the btree cursor, positioned on the page slot to return.
 *	upd: the update to return as the value, or NULL to take the value
 *	    from the page.
 *
 *	Returns 0 on success or the error from a failing helper.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_PAGE *page;
	WT_ROW *rip;
	uint8_t v;

	/*
	 * These three were previously read without ever being assigned; set
	 * them up from the session and cursor before anything uses them.
	 */
	btree = S2BT(session);
	page = cbt->ref->page;
	cursor = &cbt->iface;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		cursor->recno = cbt->recno;

		/* The cursor references an update: return its value directly. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Otherwise take the value from the page. */
		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));
	case WT_PAGE_COL_VAR:
		cursor->recno = cbt->recno;

		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Get the slot's cell; the value is unpacked from it below. */
		cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->pg_row_d[cbt->slot];

		/*
		 * Set the key: from the insert item if there is one, from the
		 * search key on an exact match, otherwise from the page.
		 */
		if (cbt->ins != NULL) {
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		} else if (cbt->compare == 0) {
			cursor->key.data = cbt->search_key.data;
			cursor->key.size = cbt->search_key.size;
		} else
			WT_RET(__wt_row_leaf_key(
			    session, page, rip, &cursor->key, 0));

		/* The value lives on an append/update list: take it there. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* The value may be reachable directly through the WT_ROW. */
		if (__wt_row_leaf_value(page, rip, &cursor->value))
			return (0);

		/*
		 * Otherwise find the value's cell, which may not exist. The
		 * assignment must be parenthesized: without the parentheses
		 * the comparison result, not the cell pointer, is stored in
		 * "cell", and the unpack below dereferences a bogus pointer.
		 */
		if ((cell =
		    __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Unpack the cell to get the value (it may be an overflow item). */
	__wt_cell_unpack(cell, &unpack);
	WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value));

	return (0);
}
/*
 * __cursor_var_next --
 *	Advance the cursor to the next variable-length column-store item,
 *	returning WT_NOTFOUND once the page's last record has been passed.
 */
static inline int
__cursor_var_next(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_INSERT *higher;
	WT_ITEM *out;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint64_t rle, rle_start;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	out = &cbt->iface.value;

	rle_start = 0;			/* -Werror=maybe-uninitialized */

	/* On a new page, start from the page's first record. */
	if (newpage) {
		cbt->last_standard_recno = __col_var_last_recno(page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, page->pg_var_recno);
		goto new_page;
	}

	/* Walk forward until a visible item is found. */
	for (;;) {
		if (cbt->recno >= cbt->last_standard_recno)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->recno + 1);

		/* Locate the WT_COL slot holding this record. */
new_page:	cip = __col_var_search(page, cbt->recno, &rle_start);
		if (cip == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(page, cip);

		/* Look for the record on the slot's insert list. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = NULL;
		if (cbt->ins != NULL)
			upd = __wt_txn_read(session, cbt->ins->upd);

		if (upd != NULL) {
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				++cbt->page_deleted_count;
				continue;
			}
			out->data = WT_UPDATE_DATA(upd);
			out->size = upd->size;
			return (0);
		}

		/*
		 * With no matching insert-list item, the value comes from the
		 * page. If this is the slot we returned last time the saved
		 * buffer is still good (this avoids repeatedly decoding items
		 * with large repeat counts); otherwise unpack the cell.
		 */
		if (cbt->cip_saved != cip) {
			cell = WT_COL_PTR(page, cip);
			if (cell == NULL)
				continue;

			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL) {
				rle = __wt_cell_rle(&unpack);
				if (rle == 1)
					continue;

				/*
				 * A run of deleted records: resume at the
				 * first record past the run, or at the first
				 * larger record on the update list if that
				 * comes sooner.
				 */
				higher = __col_insert_search_gt(
				    cbt->ins_head, cbt->recno);
				cbt->recno = rle_start + rle;
				if (higher != NULL &&
				    WT_INSERT_RECNO(higher) < cbt->recno)
					cbt->recno = WT_INSERT_RECNO(higher);

				/* The loop top increments; compensate. */
				--cbt->recno;
				continue;
			}

			/* Reference the cell's value from the saved buffer. */
			WT_RET(__wt_page_cell_data_ref(
			    session, page, &unpack, &cbt->tmp));
			cbt->cip_saved = cip;
		}

		out->data = cbt->tmp.data;
		out->size = cbt->tmp.size;
		return (0);
	}
}
/* * __verify_tree -- * Verify a tree, recursively descending through it in depth-first fashion. * The page argument was physically verified (so we know it's correctly formed), * and the in-memory version built. Our job is to check logical relationships * in the page and in the tree. */ static int __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs) { WT_BM *bm; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_DECL_RET; WT_PAGE *page; WT_REF *child_ref; uint64_t recno; uint32_t entry, i; bool found; bm = S2BT(session)->bm; page = ref->page; unpack = &_unpack; WT_CLEAR(*unpack); /* -Wuninitialized */ WT_RET(__wt_verbose(session, WT_VERB_VERIFY, "%s %s", __wt_page_addr_string(session, ref, vs->tmp1), __wt_page_type_string(page->type))); /* Optionally dump the address. */ if (vs->dump_address) WT_RET(__wt_msg(session, "%s %s", __wt_page_addr_string(session, ref, vs->tmp1), __wt_page_type_string(page->type))); /* Track the shape of the tree. */ if (WT_PAGE_IS_INTERNAL(page)) ++vs->depth_internal[ WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)]; else ++vs->depth_leaf[ WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)]; /* * The page's physical structure was verified when it was read into * memory by the read server thread, and then the in-memory version * of the page was built. Now we make sure the page and tree are * logically consistent. * * !!! 
* The problem: (1) the read server has to build the in-memory version * of the page because the read server is the thread that flags when * any thread can access the page in the tree; (2) we can't build the * in-memory version of the page until the physical structure is known * to be OK, so the read server has to verify at least the physical * structure of the page; (3) doing complete page verification requires * reading additional pages (for example, overflow keys imply reading * overflow pages in order to test the key's order in the page); (4) * the read server cannot read additional pages because it will hang * waiting on itself. For this reason, we split page verification * into a physical verification, which allows the in-memory version * of the page to be built, and then a subsequent logical verification * which happens here. * * Report progress occasionally. */ #define WT_VERIFY_PROGRESS_INTERVAL 100 if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0) WT_RET(__wt_progress(session, NULL, vs->fcnt)); #ifdef HAVE_DIAGNOSTIC /* Optionally dump the blocks or page in debugging mode. */ if (vs->dump_blocks) WT_RET(__wt_debug_disk(session, page->dsk, NULL)); if (vs->dump_pages) WT_RET(__wt_debug_page(session, page, NULL)); #endif /* * Column-store key order checks: check the page's record number and * then update the total record count. 
*/ switch (page->type) { case WT_PAGE_COL_FIX: recno = page->pg_fix_recno; goto recno_chk; case WT_PAGE_COL_INT: recno = page->pg_intl_recno; goto recno_chk; case WT_PAGE_COL_VAR: recno = page->pg_var_recno; recno_chk: if (recno != vs->record_total + 1) WT_RET_MSG(session, WT_ERROR, "page at %s has a starting record of %" PRIu64 " when the expected starting record is %" PRIu64, __wt_page_addr_string(session, ref, vs->tmp1), recno, vs->record_total + 1); break; } switch (page->type) { case WT_PAGE_COL_FIX: vs->record_total += page->pg_fix_entries; break; case WT_PAGE_COL_VAR: recno = 0; WT_COL_FOREACH(page, cip, i) if ((cell = WT_COL_PTR(page, cip)) == NULL) ++recno; else { __wt_cell_unpack(cell, unpack); recno += __wt_cell_rle(unpack); } vs->record_total += recno; break; } /* * Row-store leaf page key order check: it's a depth-first traversal, * the first key on this page should be larger than any key previously * seen. */ switch (page->type) { case WT_PAGE_ROW_LEAF: WT_RET(__verify_row_leaf_key_order(session, ref, vs)); break; } /* If it's not the root page, unpack the parent cell. */ if (!__wt_ref_is_root(ref)) { __wt_cell_unpack(ref->addr, unpack); /* Compare the parent cell against the page type. */ switch (page->type) { case WT_PAGE_COL_FIX: if (unpack->raw != WT_CELL_ADDR_LEAF_NO) goto celltype_err; break; case WT_PAGE_COL_VAR: if (unpack->raw != WT_CELL_ADDR_LEAF && unpack->raw != WT_CELL_ADDR_LEAF_NO) goto celltype_err; break; case WT_PAGE_ROW_LEAF: if (unpack->raw != WT_CELL_ADDR_DEL && unpack->raw != WT_CELL_ADDR_LEAF && unpack->raw != WT_CELL_ADDR_LEAF_NO) goto celltype_err; break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: if (unpack->raw != WT_CELL_ADDR_INT) celltype_err: WT_RET_MSG(session, WT_ERROR, "page at %s, of type %s, is referenced in " "its parent by a cell of type %s", __wt_page_addr_string( session, ref, vs->tmp1), __wt_page_type_string(page->type), __wt_cell_type_string(unpack->raw)); break; } } /* * Check overflow pages. 
We check overflow cells separately from other * tests that walk the page as it's simpler, and I don't care much how * fast table verify runs. */ switch (page->type) { case WT_PAGE_COL_VAR: case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: WT_RET(__verify_overflow_cell(session, ref, &found, vs)); if (__wt_ref_is_root(ref) || page->type == WT_PAGE_ROW_INT) break; /* * Object if a leaf-no-overflow address cell references a page * with overflow keys, but don't object if a leaf address cell * references a page without overflow keys. Reconciliation * doesn't guarantee every leaf page without overflow items will * be a leaf-no-overflow type. */ if (found && unpack->raw == WT_CELL_ADDR_LEAF_NO) WT_RET_MSG(session, WT_ERROR, "page at %s, of type %s and referenced in its " "parent by a cell of type %s, contains overflow " "items", __wt_page_addr_string(session, ref, vs->tmp1), __wt_page_type_string(page->type), __wt_cell_type_string(WT_CELL_ADDR_LEAF_NO)); break; } /* Check tree connections and recursively descend the tree. */ switch (page->type) { case WT_PAGE_COL_INT: /* For each entry in an internal page, verify the subtree. */ entry = 0; WT_INTL_FOREACH_BEGIN(session, page, child_ref) { /* * It's a depth-first traversal: this entry's starting * record number should be 1 more than the total records * reviewed to this point. */ ++entry; if (child_ref->key.recno != vs->record_total + 1) { WT_RET_MSG(session, WT_ERROR, "the starting record number in entry %" PRIu32 " of the column internal page at " "%s is %" PRIu64 " and the expected " "starting record number is %" PRIu64, entry, __wt_page_addr_string( session, child_ref, vs->tmp1), child_ref->key.recno, vs->record_total + 1); } /* Verify the subtree. 
*/ ++vs->depth; WT_RET(__wt_page_in(session, child_ref, 0)); ret = __verify_tree(session, child_ref, vs); WT_TRET(__wt_page_release(session, child_ref, 0)); --vs->depth; WT_RET(ret); __wt_cell_unpack(child_ref->addr, unpack); WT_RET(bm->verify_addr( bm, session, unpack->data, unpack->size)); } WT_INTL_FOREACH_END; break; case WT_PAGE_ROW_INT: /* For each entry in an internal page, verify the subtree. */ entry = 0; WT_INTL_FOREACH_BEGIN(session, page, child_ref) { /* * It's a depth-first traversal: this entry's starting * key should be larger than the largest key previously * reviewed. * * The 0th key of any internal page is magic, and we * can't test against it. */ ++entry; if (entry != 1) WT_RET(__verify_row_int_key_order( session, page, child_ref, entry, vs)); /* Verify the subtree. */ ++vs->depth; WT_RET(__wt_page_in(session, child_ref, 0)); ret = __verify_tree(session, child_ref, vs); WT_TRET(__wt_page_release(session, child_ref, 0)); --vs->depth; WT_RET(ret); __wt_cell_unpack(child_ref->addr, unpack); WT_RET(bm->verify_addr( bm, session, unpack->data, unpack->size)); } WT_INTL_FOREACH_END;
/*
 * __wt_kv_return --
 *	Hand a page referenced key/value pair back to the application through
 *	the cursor's interface key, record number and value fields.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_ITEM *swap;
	WT_PAGE *page;
	WT_ROW *rip;
	uint8_t v;

	btree = S2BT(session);
	page = cbt->ref->page;
	cursor = &cbt->iface;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		/*
		 * The interface cursor's record number is usually already
		 * set, but not always: cursor.search_near may call here
		 * without first setting it.
		 */
		cursor->recno = cbt->recno;

		/* A WT_UPDATE item, when supplied, is the value. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Otherwise the value comes from the original page. */
		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));
	case WT_PAGE_COL_VAR:
		/*
		 * The interface cursor's record number is usually already
		 * set, but not always: cursor.search_near may call here
		 * without first setting it.
		 */
		cursor->recno = cbt->recno;

		/* A WT_UPDATE item, when supplied, is the value. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Otherwise the value comes from the original page cell. */
		cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->pg_row_d[cbt->slot];

		/*
		 * Key source, in priority order: the WT_INSERT item the
		 * cursor references; on an inexact match, the original page;
		 * on an exact match, the key the search function copied.
		 */
		if (cbt->ins != NULL) {
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		} else if (cbt->compare != 0)
			WT_RET(__wt_row_leaf_key(
			    session, page, rip, &cursor->key, false));
		else {
			/*
			 * The row-store search function built the key we want
			 * in the cursor's temporary buffer. Exchange the
			 * search-key and temporary buffers and return the
			 * former. Returning the temporary buffer itself is
			 * unsafe: the caller might search this table again
			 * with the returned key, and any search that used the
			 * temporary buffer would corrupt that key.
			 */
			swap = cbt->row_key;
			cbt->row_key = cbt->tmp;
			cbt->tmp = swap;

			cursor->key.data = cbt->row_key->data;
			cursor->key.size = cbt->row_key->size;
		}

		/* A WT_UPDATE item, when supplied, is the value. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Simple values have their location encoded in the WT_ROW. */
		if (__wt_row_leaf_value(page, rip, &cursor->value))
			return (0);

		/* Fall back to the page cell, which may be absent (empty). */
		cell = __wt_row_leaf_value_cell(page, rip, NULL);
		if (cell == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* The value is an on-page cell: unpack and expand it as necessary. */
	__wt_cell_unpack(cell, &unpack);
	WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value));

	return (0);
}
/* * __stat_page_col_var -- * Stat a WT_PAGE_COL_VAR page. */ static void __stat_page_col_var( WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats) { WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_INSERT *ins; WT_UPDATE *upd; uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt; uint32_t i; bool orig_deleted; unpack = &_unpack; deleted_cnt = entry_cnt = ovfl_cnt = rle_cnt = 0; WT_STAT_INCR(session, stats, btree_column_variable); /* * Walk the page counting regular items, adjusting if the item has been * subsequently deleted or not. This is a mess because 10-item RLE might * have 3 of the items subsequently deleted. Overflow items are harder, * we can't know if an updated item will be an overflow item or not; do * our best, and simply count every overflow item (or RLE set of items) * we see. */ WT_COL_FOREACH(page, cip, i) { if ((cell = WT_COL_PTR(page, cip)) == NULL) { orig_deleted = true; ++deleted_cnt; } else { orig_deleted = false; __wt_cell_unpack(cell, unpack); if (unpack->type == WT_CELL_ADDR_DEL) orig_deleted = true; else { entry_cnt += __wt_cell_rle(unpack); rle_cnt += __wt_cell_rle(unpack) - 1; } if (unpack->ovfl) ++ovfl_cnt; } /* * Walk the insert list, checking for changes. For each insert * we find, correct the original count based on its state. */ WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) { upd = ins->upd; if (WT_UPDATE_DELETED_ISSET(upd)) { if (!orig_deleted) { ++deleted_cnt; --entry_cnt; } } else if (orig_deleted) { --deleted_cnt; ++entry_cnt; } } }
/*
 * __cursor_var_prev --
 *	Step the cursor back to the previous variable-length column-store
 *	item, returning WT_NOTFOUND once the start of the page is passed.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_DECL_RET;
	WT_ITEM *out;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	out = &cbt->iface.value;

	/* On a new page, start from the page's last record. */
	if (newpage) {
		cbt->last_standard_recno = __col_last_recno(cbt->page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
		goto new_page;
	}

	/* Walk backward until a visible item is found. */
	for (;;) {
		__cursor_set_recno(cbt, cbt->recno - 1);

new_page:	if (cbt->recno < cbt->page->u.col_var.recno)
			return (WT_NOTFOUND);

		/* Locate the WT_COL slot holding this record. */
		cip = __col_var_search(cbt->page, cbt->recno);
		if (cip == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(cbt->page, cip);

		/* Look for the record on the slot's insert list. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(cbt->page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = NULL;
		if (cbt->ins != NULL)
			upd = __wt_txn_read(session, cbt->ins->upd);

		if (upd != NULL) {
			if (WT_UPDATE_DELETED_ISSET(upd))
				continue;
			out->data = WT_UPDATE_DATA(upd);
			out->size = upd->size;
			return (0);
		}

		/*
		 * With no matching insert-list item, the value comes from the
		 * page. If this is the slot we returned last time the saved
		 * buffer is still good (this avoids repeatedly decoding items
		 * with large repeat counts); otherwise unpack the cell.
		 */
		if (cbt->cip_saved != cip) {
			cell = WT_COL_PTR(cbt->page, cip);
			if (cell == NULL)
				continue;

			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL)
				continue;

			/*
			 * Handle a restart here rather than higher up the
			 * call stack. Unlike row-store, where a normal search
			 * path finds cached overflow values, this needs the
			 * page's reconciliation structures, and reaching them
			 * is as easy here as anywhere above.
			 */
			ret = __wt_cell_unpack_ref(
			    session, &unpack, &cbt->tmp);
			if (ret == WT_RESTART)
				ret = __wt_ovfl_cache_col_restart(
				    session, cbt->page, &unpack, &cbt->tmp);
			WT_RET(ret);

			cbt->cip_saved = cip;
		}

		out->data = cbt->tmp.data;
		out->size = cbt->tmp.size;
		return (0);
	}
	/* NOTREACHED */
}
/*
 * __cursor_var_prev --
 *	Step the cursor back to the previous variable-length column-store
 *	item, returning WT_NOTFOUND once the start of the page is passed.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_INSERT *ins;
	WT_ITEM *out;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint64_t *recnop;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	recnop = &cbt->iface.recno;
	out = &cbt->iface.value;

	/* On a new page, start from the page's last record. */
	if (newpage) {
		cbt->last_standard_recno = __col_last_recno(cbt->page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		cbt->recno = cbt->last_standard_recno;
		goto new_page;
	}

	/* Walk backward until a visible item is found. */
	for (;;) {
		--cbt->recno;

		/* Keep the interface record number in step with ours. */
new_page:	*recnop = cbt->recno;
		if (cbt->recno < cbt->page->u.col_var.recno)
			return (WT_NOTFOUND);

		/* Locate the WT_COL slot holding this record. */
		cip = __col_var_search(cbt->page, cbt->recno);
		if (cip == NULL)
			return (WT_NOTFOUND);

		/* Look for the record on the slot's insert list. */
		ins = __col_insert_search_match(
		    WT_COL_UPDATE(cbt->page, cip), cbt->recno);
		upd = NULL;
		if (ins != NULL)
			upd = __wt_txn_read(session, ins->upd);

		if (upd != NULL) {
			if (WT_UPDATE_DELETED_ISSET(upd))
				continue;
			cbt->ins = ins;
			out->data = WT_UPDATE_DATA(upd);
			out->size = upd->size;
			return (0);
		}

		/*
		 * With no matching insert-list item, the value comes from the
		 * page. If this is the slot we returned last time the saved
		 * buffer is still good (this avoids repeatedly decoding items
		 * with large repeat counts); otherwise unpack the cell.
		 */
		if (cbt->cip_saved != cip) {
			cell = WT_COL_PTR(cbt->page, cip);
			if (cell == NULL)
				continue;

			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL)
				continue;

			/*
			 * A plain value with no Huffman encoding configured
			 * is referenced in place; everything else is copied.
			 */
			if (unpack.type == WT_CELL_VALUE &&
			    session->btree->huffman_value == NULL) {
				cbt->tmp.data = unpack.data;
				cbt->tmp.size = unpack.size;
			} else
				WT_RET(__wt_cell_unpack_copy(
				    session, &unpack, &cbt->tmp));

			cbt->cip_saved = cip;
		}

		out->data = cbt->tmp.data;
		out->size = cbt->tmp.size;
		return (0);
	}
	/* NOTREACHED */
}
/*
 * __wt_kv_return --
 *	Hand a page referenced key/value pair back to the application through
 *	the cursor's interface fields. The key (or record number) is only
 *	filled in when key_ret is non-zero; the value always is.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int key_ret)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK value_cell;
	WT_CURSOR *cursor;
	WT_IKEY *ikey;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_UPDATE *upd;
	uint8_t v;

	btree = session->btree;
	page = cbt->page;
	cursor = &cbt->iface;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		if (key_ret)
			cursor->recno = cbt->recno;

		/* A referenced WT_INSERT item supplies the value. */
		if (cbt->ins != NULL) {
			upd = cbt->ins->upd;
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Otherwise read the value from the page. */
		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));
	case WT_PAGE_COL_VAR:
		if (key_ret)
			cursor->recno = cbt->recno;

		/* A referenced WT_INSERT item supplies the value. */
		if (cbt->ins != NULL) {
			upd = cbt->ins->upd;
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		cell = WT_COL_PTR(page, &page->u.col_var.d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->u.row.d[cbt->slot];

		/*
		 * With a WT_INSERT item, both the key and the update come
		 * from it. Without one, the key comes from the original page
		 * and the update from the slot's update list (if any).
		 */
		if (cbt->ins != NULL) {
			if (key_ret) {
				cursor->key.data = WT_INSERT_KEY(cbt->ins);
				cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
			}
			upd = cbt->ins->upd;
		} else {
			if (key_ret) {
				if (!__wt_off_page(page, rip->key))
					WT_RET(__wt_row_key(
					    session, page, rip, &cursor->key));
				else {
					ikey = rip->key;
					cursor->key.data = WT_IKEY_DATA(ikey);
					cursor->key.size = ikey->size;
				}
			}
			upd = WT_ROW_UPDATE(page, rip);
		}

		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Use the original cell, which may be absent (empty). */
		cell = __wt_row_value(page, rip);
		if (cell == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* It's a cell: unpack it, and copy it unless it's usable in place. */
	__wt_cell_unpack(cell, &value_cell);
	if (btree->huffman_value != NULL || value_cell.type != WT_CELL_VALUE)
		return (
		    __wt_cell_unpack_copy(session, &value_cell, &cursor->value));

	cursor->value.data = value_cell.data;
	cursor->value.size = value_cell.size;
	return (0);
}
/*
 * __wt_kv_return --
 *	Return the key/value pair the btree cursor references to the
 *	application by filling in the interface cursor.
 *
 *	session: the session; its btree supplies the fixed-length bit count.
 *	cbt:     the positioned btree cursor (page, slot and optional insert
 *	         item).
 *
 *	Updates are read through __wt_txn_read; when it returns NULL the
 *	code falls back to the page slot.
 *
 *	Returns 0 on success, otherwise the error from the helper that failed
 *	or from the WT_ILLEGAL_VALUE handling of an unexpected page type.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_UPDATE *upd;
	uint8_t v;

	btree = S2BT(session);
	page = cbt->page;
	cursor = &cbt->iface;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_VAR:
		/*
		 * The interface cursor's record number is usually already
		 * set, but not always: cursor.search_near may get here
		 * without having set it, so always set it.
		 */
		cursor->recno = cbt->recno;

		/*
		 * A cursor positioned on a WT_INSERT item returns that item's
		 * update, if the transactional read yields one.
		 */
		upd = NULL;
		if (cbt->ins != NULL)
			upd = __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/*
		 * Fixed-length: fetch the value from the page's bit field and
		 * copy the single byte into the cursor's buffer.
		 */
		if (page->type == WT_PAGE_COL_FIX) {
			v = __bit_getv_recno(
			    page, cursor->recno, btree->bitcnt);
			return (__wt_buf_set(session, &cursor->value, &v, 1));
		}

		/* Variable-length: the value is the slot's on-page cell. */
		cell = WT_COL_PTR(page, &page->u.col_var.d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->u.row.d[cbt->slot];

		/*
		 * Prefer the WT_INSERT item's key and update when the cursor
		 * references one and the transactional read yields an update.
		 * Otherwise the key comes from the original page slot and the
		 * value from that slot's update list, or from the page itself
		 * if there is no update to return.
		 */
		upd = NULL;
		if (cbt->ins != NULL)
			upd = __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		} else {
			WT_RET(
			    __wt_row_key(session, page, rip, &cursor->key, 0));
			upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
		}

		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* No update: use the original cell, which may be empty. */
		cell = __wt_row_value(page, rip);
		if (cell == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* The value is an on-page cell: unpack it and reference the data. */
	__wt_cell_unpack(cell, &unpack);
	ret = __wt_cell_unpack_ref(session, &unpack, &cursor->value);
	if (ret != WT_RESTART || page->type != WT_PAGE_COL_VAR)
		return (ret);

	/*
	 * Restart on a variable-length column-store page.  The restart could
	 * be caught higher up the call stack, but nothing is gained by that:
	 * unlike row-store, where a normal search path finds cached overflow
	 * values, the page's reconciliation structures have to be consulted,
	 * and that is as easy here as anywhere above.
	 */
	return (__wt_ovfl_cache_col_restart(
	    session, page, &unpack, &cursor->value));
}
/* * __verify_tree -- * Verify a tree, recursively descending through it in depth-first fashion. * The page argument was physically verified (so we know it's correctly formed), * and the in-memory version built. Our job is to check logical relationships * in the page and in the tree. */ static int __verify_tree(WT_SESSION_IMPL *session, WT_PAGE *page, WT_VSTUFF *vs) { WT_BM *bm; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_DECL_RET; WT_REF *ref; uint64_t recno; uint32_t entry, i; int found, lno; bm = S2BT(session)->bm; unpack = &_unpack; WT_VERBOSE_RET(session, verify, "%s %s", __wt_page_addr_string(session, vs->tmp1, page), __wt_page_type_string(page->type)); #ifdef HAVE_DIAGNOSTIC if (vs->dump_address) WT_RET(__wt_msg(session, "%s %s", __wt_page_addr_string(session, vs->tmp1, page), __wt_page_type_string(page->type))); #endif /* * The page's physical structure was verified when it was read into * memory by the read server thread, and then the in-memory version * of the page was built. Now we make sure the page and tree are * logically consistent. * * !!! * The problem: (1) the read server has to build the in-memory version * of the page because the read server is the thread that flags when * any thread can access the page in the tree; (2) we can't build the * in-memory version of the page until the physical structure is known * to be OK, so the read server has to verify at least the physical * structure of the page; (3) doing complete page verification requires * reading additional pages (for example, overflow keys imply reading * overflow pages in order to test the key's order in the page); (4) * the read server cannot read additional pages because it will hang * waiting on itself. For this reason, we split page verification * into a physical verification, which allows the in-memory version * of the page to be built, and then a subsequent logical verification * which happens here. * * Report progress every 10 pages. 
*/ if (++vs->fcnt % 10 == 0) WT_RET(__wt_progress(session, NULL, vs->fcnt)); #ifdef HAVE_DIAGNOSTIC /* Optionally dump the page in debugging mode. */ if (vs->dump_blocks && page->dsk != NULL) WT_RET(__wt_debug_disk(session, page->dsk, NULL)); if (vs->dump_pages) WT_RET(__wt_debug_page(session, page, NULL)); #endif /* * Column-store key order checks: check the page's record number and * then update the total record count. */ switch (page->type) { case WT_PAGE_COL_FIX: recno = page->u.col_fix.recno; goto recno_chk; case WT_PAGE_COL_INT: recno = page->u.intl.recno; goto recno_chk; case WT_PAGE_COL_VAR: recno = page->u.col_var.recno; recno_chk: if (recno != vs->record_total + 1) WT_RET_MSG(session, WT_ERROR, "page at %s has a starting record of %" PRIu64 " when the expected starting record is %" PRIu64, __wt_page_addr_string(session, vs->tmp1, page), recno, vs->record_total + 1); break; } switch (page->type) { case WT_PAGE_COL_FIX: vs->record_total += page->entries; break; case WT_PAGE_COL_VAR: recno = 0; WT_COL_FOREACH(page, cip, i) if ((cell = WT_COL_PTR(page, cip)) == NULL) ++recno; else { __wt_cell_unpack(cell, unpack); recno += __wt_cell_rle(unpack); } vs->record_total += recno; break; } /* * Row-store leaf page key order check: it's a depth-first traversal, * the first key on this page should be larger than any key previously * seen. */ switch (page->type) { case WT_PAGE_ROW_LEAF: WT_RET(__verify_row_leaf_key_order(session, page, vs)); break; } /* * Check overflow pages. We check overflow cells separately from other * tests that walk the page as it's simpler, and I don't care much how * fast table verify runs. * * Object if a leaf-no-overflow address cell references a page that has * overflow keys, but don't object if a standard address cell references * a page without overflow keys. The leaf-no-overflow address cell is * an optimization for trees without few, if any, overflow items, and * may not be set by reconciliation in all possible cases. 
*/ if (WT_PAGE_IS_ROOT(page)) lno = 0; else { __wt_cell_unpack(page->ref->addr, unpack); lno = unpack->raw == WT_CELL_ADDR_LNO ? 1 : 0; } switch (page->type) { case WT_PAGE_COL_FIX: break; case WT_PAGE_COL_VAR: case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: WT_RET(__verify_overflow_cell(session, page, &found, vs)); if (found && lno) WT_RET_MSG(session, WT_ERROR, "page at %s referenced in its parent by a cell of " "type %s illegally contains overflow items", __wt_page_addr_string(session, vs->tmp1, page), __wt_cell_type_string(WT_CELL_ADDR_LNO)); break; default: if (lno) WT_RET_MSG(session, WT_ERROR, "page at %s is of type %s and is illegally " "referenced in its parent by a cell of type %s", __wt_page_addr_string(session, vs->tmp1, page), __wt_page_type_string(page->type), __wt_cell_type_string(WT_CELL_ADDR_LNO)); break; } /* Check tree connections and recursively descend the tree. */ switch (page->type) { case WT_PAGE_COL_INT: /* For each entry in an internal page, verify the subtree. */ entry = 0; WT_REF_FOREACH(page, ref, i) { /* * It's a depth-first traversal: this entry's starting * record number should be 1 more than the total records * reviewed to this point. */ ++entry; if (ref->u.recno != vs->record_total + 1) { __wt_cell_unpack(ref->addr, unpack); WT_RET_MSG(session, WT_ERROR, "the starting record number in entry %" PRIu32 " of the column internal page at " "%s is %" PRIu64 " and the expected " "starting record number is %" PRIu64, entry, __wt_page_addr_string( session, vs->tmp1, page), ref->u.recno, vs->record_total + 1); } /* Verify the subtree. */ WT_RET(__wt_page_in(session, page, ref)); ret = __verify_tree(session, ref->page, vs); WT_TRET(__wt_page_release(session, ref->page)); WT_RET(ret); __wt_cell_unpack(ref->addr, unpack); WT_RET(bm->verify_addr( bm, session, unpack->data, unpack->size)); } break; case WT_PAGE_ROW_INT: /* For each entry in an internal page, verify the subtree. 
*/ entry = 0; WT_REF_FOREACH(page, ref, i) { /* * It's a depth-first traversal: this entry's starting * key should be larger than the largest key previously * reviewed. * * The 0th key of any internal page is magic, and we * can't test against it. */ ++entry; if (entry != 1) WT_RET(__verify_row_int_key_order( session, page, ref, entry, vs)); /* Verify the subtree. */ WT_RET(__wt_page_in(session, page, ref)); ret = __verify_tree(session, ref->page, vs); WT_TRET(__wt_page_release(session, ref->page)); WT_RET(ret); __wt_cell_unpack(ref->addr, unpack); WT_RET(bm->verify_addr( bm, session, unpack->data, unpack->size)); }